forked from Chay/scrp

add error handler by internal request

lelkins 2024-04-20 14:07:34 +03:00
parent 53609fbb90
commit e42f68947e

main.py (105 changed lines)

@@ -15,42 +15,50 @@ class NewsScraper:
         self.content_selector = content_selector
 
     def scrape(self):
-        if self.enable_js:
-            session = HTMLSession()
-            response = session.get(self.url)
-            response.html.render(timeout=20)  # Render the JavaScript content (adjust the timeout as needed)
-            html = response.html.html
-        else:
-            response = requests.get(self.url)
-            html = response.content
+        try:
+            if self.enable_js:
+                session = HTMLSession()
+                response = session.get(self.url)
+                response.html.render(timeout=20)  # Render the JavaScript content (adjust the timeout as needed)
+                html = response.html.html
+            else:
+                response = requests.get(self.url)
+                html = response.content
 
-        soup = BeautifulSoup(html, 'html.parser')
+            soup = BeautifulSoup(html, 'html.parser')
 
-        news_items = []
-        for item in soup.select(self.item_selector):
-            title_element = item.select_one(self.title_selector)
-            time_element = item.select_one(self.time_selector)
+            news_items = []
+            for item in soup.select(self.item_selector):
+                title_element = item.select_one(self.title_selector)
+                time_element = item.select_one(self.time_selector)
 
-            if title_element and time_element:
-                title = title_element.text.strip()
-                time = time_element.text.strip()
+                if title_element and time_element:
+                    title = title_element.text.strip()
+                    time = time_element.text.strip()
 
-                if self.show_content:
-                    if self.content_selector:
-                        content_element = item.select_one(self.content_selector)
-                    else:
-                        content_element = item.find('div', class_='field-content')
+                    if self.show_content:
+                        if self.content_selector:
+                            content_element = item.select_one(self.content_selector)
+                        else:
+                            content_element = item.find('div', class_='field-content')
 
-                    if content_element:
-                        content = content_element.decode_contents(formatter="html")
-                        news_item_html = """
-                        <div class="news-item">
-                            <h2 class="news-title">{title}</h2>
-                            <p class="news-time">{time}</p>
-                            <div class="news-content">{content}</div>
-                        </div>
-                        """.format(title=title, time=time, content=content)
-                    else:
-                        news_item_html = """
-                        <div class="news-item">
-                            <h2 class="news-title">{title}</h2>
-                            <p class="news-time">{time}</p>
-                        </div>
-                        """.format(title=title, time=time)
+                        if content_element:
+                            content = content_element.decode_contents(formatter="html")
+                            news_item_html = """
+                            <div class="news-item">
+                                <h2 class="news-title">{title}</h2>
+                                <p class="news-time">{time}</p>
+                                <div class="news-content">{content}</div>
+                            </div>
+                            """.format(title=title, time=time, content=content)
+                        else:
+                            news_item_html = """
+                            <div class="news-item">
+                                <h2 class="news-title">{title}</h2>
+                                <p class="news-time">{time}</p>
+                            </div>
+                            """.format(title=title, time=time)
@@ -58,27 +66,34 @@ class NewsScraper:
-                else:
-                    news_item_html = """
-                    <div class="news-item">
-                        <h2 class="news-title">{title}</h2>
-                        <p class="news-time">{time}</p>
-                    </div>
-                    """.format(title=title, time=time)
+                    else:
+                        news_item_html = """
+                        <div class="news-item">
+                            <h2 class="news-title">{title}</h2>
+                            <p class="news-time">{time}</p>
+                        </div>
+                        """.format(title=title, time=time)
 
-            news_items.append(news_item_html)
+                news_items.append(news_item_html)
 
-        return news_items
+            return news_items
+        except requests.exceptions.RequestException as e:
+            print(f"Error fetching URL: {e}")
+            return []
+        except AttributeError as e:
+            print(f"Error parsing HTML: {e}")
+            return []
+        except Exception as e:
+            print(f"Unexpected error: {e}")
+            return []
 
     def write_to_html(self, template_path, output_path, news_items, heading):
-        with open(template_path, 'r') as f:
-            html_template = f.read()
+        try:
+            with open(template_path, 'r') as f:
+                html_template = f.read()
 
-        news_html = '\n'.join(news_items)
-        html_output = html_template.replace('</body>', f'<h1>{heading}</h1>\n{news_html}\n</body>')
+            news_html = '\n'.join(news_items)
+            html_output = html_template.replace('</body>', f'<h1>{heading}</h1>\n{news_html}\n</body>')
 
-        with open(output_path, 'a') as f:
-            f.write(html_output)
+            with open(output_path, 'a') as f:
+                f.write(html_output)
+        except IOError as e:
+            print(f"Error writing to file: {e}")
+        except Exception as e:
+            print(f"Unexpected error: {e}")
 
 # Define the scrapers for each news source
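
For reference, a minimal sketch of how a caller might exercise the new error handling. The write_to_html signature comes from the diff itself; the NewsScraper constructor argument names, the URL, and the selector values below are assumptions inferred from the attributes used in scrape(), not taken from the repository:

# Hypothetical usage sketch: constructor argument names and order are assumed
# from the attributes referenced in scrape(); the real signature may differ.
scraper = NewsScraper(
    url="https://example.com/news",       # placeholder URL
    item_selector=".views-row",           # assumed CSS selectors
    title_selector="h2 a",
    time_selector=".date-display-single",
    enable_js=False,                      # plain requests, no JS rendering
    show_content=True,
    content_selector=None,                # falls back to find('div', class_='field-content')
)

# With this commit, scrape() catches fetch/parse errors and returns []
# instead of raising, so callers can simply test for an empty list.
items = scraper.scrape()
if items:
    scraper.write_to_html("template.html", "news.html", items, "Latest News")
else:
    print("No items scraped; the URL fetch or HTML parse may have failed.")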