1
0
Fork 0
forked from Chay/scrp

Add error handlers (requested internally)

This commit is contained in:
lelkins 2024-04-20 14:07:34 +03:00
parent 53609fbb90
commit e42f68947e

105
main.py
View file

@ -15,42 +15,50 @@ class NewsScraper:
self.content_selector = content_selector self.content_selector = content_selector
def scrape(self): def scrape(self):
if self.enable_js: try:
session = HTMLSession() if self.enable_js:
response = session.get(self.url) session = HTMLSession()
response.html.render(timeout=20) # Render the JavaScript content (adjust the timeout as needed) response = session.get(self.url)
html = response.html.html response.html.render(timeout=20) # Render the JavaScript content (adjust the timeout as needed)
else: html = response.html.html
response = requests.get(self.url) else:
html = response.content response = requests.get(self.url)
html = response.content
soup = BeautifulSoup(html, 'html.parser') soup = BeautifulSoup(html, 'html.parser')
news_items = [] news_items = []
for item in soup.select(self.item_selector): for item in soup.select(self.item_selector):
title_element = item.select_one(self.title_selector) title_element = item.select_one(self.title_selector)
time_element = item.select_one(self.time_selector) time_element = item.select_one(self.time_selector)
if title_element and time_element: if title_element and time_element:
title = title_element.text.strip() title = title_element.text.strip()
time = time_element.text.strip() time = time_element.text.strip()
if self.show_content: if self.show_content:
if self.content_selector: if self.content_selector:
content_element = item.select_one(self.content_selector) content_element = item.select_one(self.content_selector)
else: else:
content_element = item.find('div', class_='field-content') content_element = item.find('div', class_='field-content')
if content_element: if content_element:
content = content_element.decode_contents(formatter="html") content = content_element.decode_contents(formatter="html")
news_item_html = """ news_item_html = """
<div class="news-item"> <div class="news-item">
<h2 class="news-title">{title}</h2> <h2 class="news-title">{title}</h2>
<p class="news-time">{time}</p> <p class="news-time">{time}</p>
<div class="news-content">{content}</div> <div class="news-content">{content}</div>
</div> </div>
""".format(title=title, time=time, content=content) """.format(title=title, time=time, content=content)
else:
news_item_html = """
<div class="news-item">
<h2 class="news-title">{title}</h2>
<p class="news-time">{time}</p>
</div>
""".format(title=title, time=time)
else: else:
news_item_html = """ news_item_html = """
<div class="news-item"> <div class="news-item">
@ -58,27 +66,34 @@ class NewsScraper:
<p class="news-time">{time}</p> <p class="news-time">{time}</p>
</div> </div>
""".format(title=title, time=time) """.format(title=title, time=time)
else:
news_item_html = """
<div class="news-item">
<h2 class="news-title">{title}</h2>
<p class="news-time">{time}</p>
</div>
""".format(title=title, time=time)
news_items.append(news_item_html) news_items.append(news_item_html)
return news_items return news_items
except requests.exceptions.RequestException as e:
print(f"Error fetching URL: {e}")
return []
except AttributeError as e:
print(f"Error parsing HTML: {e}")
return []
except Exception as e:
print(f"Unexpected error: {e}")
return []
def write_to_html(self, template_path, output_path, news_items, heading):
    """Append the scraped items to an HTML output file.

    Reads the template at ``template_path``, injects ``heading`` plus every
    snippet from ``news_items`` immediately before the closing ``</body>``
    tag, and appends the result to ``output_path`` (append mode, so several
    sources can share one output file). Errors are printed, not raised.
    """
    try:
        joined_items = '\n'.join(news_items)
        with open(template_path, 'r') as template_file:
            template_text = template_file.read()
        rendered = template_text.replace(
            '</body>', f'<h1>{heading}</h1>\n{joined_items}\n</body>'
        )
        with open(output_path, 'a') as output_file:
            output_file.write(rendered)
    except IOError as e:
        print(f"Error writing to file: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")
# Define the scrapers for each news source