From e42f68947e9ab859a33401e31a8036fb15cab4df Mon Sep 17 00:00:00 2001 From: lelkins Date: Sat, 20 Apr 2024 14:07:34 +0300 Subject: [PATCH] add error handler by internal request --- main.py | 105 ++++++++++++++++++++++++++++++++------------------------ 1 file changed, 60 insertions(+), 45 deletions(-) diff --git a/main.py b/main.py index fb403d7..bf95f5c 100644 --- a/main.py +++ b/main.py @@ -15,42 +15,50 @@ class NewsScraper: self.content_selector = content_selector def scrape(self): - if self.enable_js: - session = HTMLSession() - response = session.get(self.url) - response.html.render(timeout=20) # Render the JavaScript content (adjust the timeout as needed) - html = response.html.html - else: - response = requests.get(self.url) - html = response.content + try: + if self.enable_js: + session = HTMLSession() + response = session.get(self.url) + response.html.render(timeout=20) # Render the JavaScript content (adjust the timeout as needed) + html = response.html.html + else: + response = requests.get(self.url) + html = response.content - soup = BeautifulSoup(html, 'html.parser') + soup = BeautifulSoup(html, 'html.parser') - news_items = [] + news_items = [] - for item in soup.select(self.item_selector): - title_element = item.select_one(self.title_selector) - time_element = item.select_one(self.time_selector) + for item in soup.select(self.item_selector): + title_element = item.select_one(self.title_selector) + time_element = item.select_one(self.time_selector) - if title_element and time_element: - title = title_element.text.strip() - time = time_element.text.strip() + if title_element and time_element: + title = title_element.text.strip() + time = time_element.text.strip() - if self.show_content: - if self.content_selector: - content_element = item.select_one(self.content_selector) - else: - content_element = item.find('div', class_='field-content') + if self.show_content: + if self.content_selector: + content_element = item.select_one(self.content_selector) + else: + content_element = item.find('div', class_='field-content') - if content_element: - content = content_element.decode_contents(formatter="html") - news_item_html = """ -
-

{title}

-

{time}

-
{content}
-
- """.format(title=title, time=time, content=content) + if content_element: + content = content_element.decode_contents(formatter="html") + news_item_html = """ +
+

{title}

+

{time}

+
{content}
+
+ """.format(title=title, time=time, content=content) + else: + news_item_html = """ +
+

{title}

+

{time}

+
+ """.format(title=title, time=time) else: news_item_html = """
@@ -58,27 +66,34 @@ class NewsScraper:

{time}

""".format(title=title, time=time) - else: - news_item_html = """ -
-

{title}

-

{time}

-
- """.format(title=title, time=time) - news_items.append(news_item_html) + news_items.append(news_item_html) - return news_items + return news_items + except requests.exceptions.RequestException as e: + print(f"Error fetching URL: {e}") + return [] + except AttributeError as e: + print(f"Error parsing HTML: {e}") + return [] + except Exception as e: + print(f"Unexpected error: {e}") + return [] def write_to_html(self, template_path, output_path, news_items, heading): - with open(template_path, 'r') as f: - html_template = f.read() + try: + with open(template_path, 'r') as f: + html_template = f.read() - news_html = '\n'.join(news_items) - html_output = html_template.replace('', f'

{heading}

\n{news_html}\n') + news_html = '\n'.join(news_items) + html_output = html_template.replace('', f'

{heading}

\n{news_html}\n') - with open(output_path, 'a') as f: - f.write(html_output) + with open(output_path, 'a') as f: + f.write(html_output) + except IOError as e: + print(f"Error writing to file: {e}") + except Exception as e: + print(f"Unexpected error: {e}") # Define the scrapers for each news source