Separate into different files
parent 66c6ffae71
commit 12dac26bb8
.gitignore (vendored) · 3 changes

@@ -1 +1,2 @@
-env/
+env
+__pycache__
index.html · 515 additions · new file

File diff suppressed because one or more lines are too long
main.py · 99 changes

@@ -1,104 +1,7 @@
-import os
-import requests
-import requests_html
-from requests_html import HTMLSession
-from bs4 import BeautifulSoup
-import logging
-
-logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
-
-
-class NewsScraper:
-    def __init__(self, url, title_selector, time_selector, item_selector, show_content=False, enable_js=False, content_selector=None):
-        self.url = url
-        self.title_selector = title_selector
-        self.time_selector = time_selector
-        self.item_selector = item_selector
-        self.show_content = show_content
-        self.enable_js = enable_js
-        self.content_selector = content_selector
-
-    def scrape(self):
-        try:
-            if self.enable_js:
-                session = HTMLSession()
-                response = session.get(self.url)
-                response.html.render(timeout=20)  # Render the JavaScript content (adjust the timeout as needed)
-                html = response.html.html
-            else:
-                response = requests.get(self.url)
-                html = response.content
-
-            soup = BeautifulSoup(html, 'html.parser')
-
-            news_items = []
-
-            for item in soup.select(self.item_selector):
-                title_element = item.select_one(self.title_selector)
-                time_element = item.select_one(self.time_selector)
-
-                if title_element and time_element:
-                    title = title_element.text.strip()
-                    time = time_element.text.strip()
-
-                    if self.show_content:
-                        if self.content_selector:
-                            content_element = item.select_one(self.content_selector)
-                        else:
-                            content_element = item.find('div', class_='field-content')
-
-                        if content_element:
-                            content = content_element.decode_contents(formatter="html")
-                            news_item_html = """
-                            <div class="news-item">
-                                <h2 class="news-title">{title}</h2>
-                                <p class="news-time">{time}</p>
-                                <div class="news-content">{content}</div>
-                            </div>
-                            """.format(title=title, time=time, content=content)
-                        else:
-                            news_item_html = """
-                            <div class="news-item">
-                                <h2 class="news-title">{title}</h2>
-                                <p class="news-time">{time}</p>
-                            </div>
-                            """.format(title=title, time=time)
-                    else:
-                        news_item_html = """
-                        <div class="news-item">
-                            <h2 class="news-title">{title}</h2>
-                            <p class="news-time">{time}</p>
-                        </div>
-                        """.format(title=title, time=time)
-
-                    news_items.append(news_item_html)
-
-            return news_items
-        except requests.exceptions.RequestException as e:
-            print(f"Error fetching URL: {e}")
-            return []
-        except AttributeError as e:
-            print(f"Error parsing HTML: {e}")
-            return []
-        except Exception as e:
-            print(f"Unexpected error: {e}")
-            return []
-
-    def write_to_html(self, template_path, output_path, news_items, heading):
-        try:
-            with open(template_path, 'r') as f:
-                html_template = f.read()
-
-            news_html = '\n'.join(news_items)
-            html_output = html_template.replace('</body>', f'<h1>{heading}</h1>\n{news_html}\n</body>')
-
-            with open(output_path, 'a') as f:
-                f.write(html_output)
-        except IOError as e:
-            print(f"Error writing to file: {e}")
-        except Exception as e:
-            print(f"Unexpected error: {e}")
+from news_scraper import NewsScraper

 # Define the scrapers for each news source

 almayadeen_scraper = NewsScraper(
     url="https://english.almayadeen.net/shortnews",
     title_selector="h4",
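The main.py hunk shown above stops at the first scraper definition. As a rough sketch of how the separated pieces presumably fit together after this commit, the remaining wiring in main.py would look roughly like the lines below; the time/item selectors, template path, and heading are placeholders for illustration, not values taken from the repository.

# Hypothetical sketch of the rest of main.py after this commit.
# Selector strings, template path, and heading are placeholders, not repo values.
from news_scraper import NewsScraper

almayadeen_scraper = NewsScraper(
    url="https://english.almayadeen.net/shortnews",
    title_selector="h4",
    time_selector="time",            # placeholder selector
    item_selector="div.item",        # placeholder selector
)

items = almayadeen_scraper.scrape()  # returns a list of HTML snippets
almayadeen_scraper.write_to_html(
    template_path="template.html",   # placeholder template path
    output_path="index.html",        # presumably the index.html added by this commit
    news_items=items,
    heading="Al Mayadeen",           # placeholder heading
)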
news_scraper.py · 99 additions · new file

@@ -0,0 +1,99 @@
+import os
+import requests
+import requests_html
+from requests_html import HTMLSession
+from bs4 import BeautifulSoup
+import logging
+
+logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
+
+
+class NewsScraper:
+    def __init__(self, url, title_selector, time_selector, item_selector, show_content=False, enable_js=False, content_selector=None):
+        self.url = url
+        self.title_selector = title_selector
+        self.time_selector = time_selector
+        self.item_selector = item_selector
+        self.show_content = show_content
+        self.enable_js = enable_js
+        self.content_selector = content_selector
+
+    def scrape(self):
+        try:
+            if self.enable_js:
+                session = HTMLSession()
+                response = session.get(self.url)
+                response.html.render(timeout=20)  # Render the JavaScript content (adjust the timeout as needed)
+                html = response.html.html
+            else:
+                response = requests.get(self.url)
+                html = response.content
+
+            soup = BeautifulSoup(html, 'html.parser')
+
+            news_items = []
+
+            for item in soup.select(self.item_selector):
+                title_element = item.select_one(self.title_selector)
+                time_element = item.select_one(self.time_selector)
+
+                if title_element and time_element:
+                    title = title_element.text.strip()
+                    time = time_element.text.strip()
+
+                    if self.show_content:
+                        if self.content_selector:
+                            content_element = item.select_one(self.content_selector)
+                        else:
+                            content_element = item.find('div', class_='field-content')
+
+                        if content_element:
+                            content = content_element.decode_contents(formatter="html")
+                            news_item_html = """
+                            <div class="news-item">
+                                <h2 class="news-title">{title}</h2>
+                                <p class="news-time">{time}</p>
+                                <div class="news-content">{content}</div>
+                            </div>
+                            """.format(title=title, time=time, content=content)
+                        else:
+                            news_item_html = """
+                            <div class="news-item">
+                                <h2 class="news-title">{title}</h2>
+                                <p class="news-time">{time}</p>
+                            </div>
+                            """.format(title=title, time=time)
+                    else:
+                        news_item_html = """
+                        <div class="news-item">
+                            <h2 class="news-title">{title}</h2>
+                            <p class="news-time">{time}</p>
+                        </div>
+                        """.format(title=title, time=time)
+
+                    news_items.append(news_item_html)
+
+            return news_items
+        except requests.exceptions.RequestException as e:
+            print(f"Error fetching URL: {e}")
+            return []
+        except AttributeError as e:
+            print(f"Error parsing HTML: {e}")
+            return []
+        except Exception as e:
+            print(f"Unexpected error: {e}")
+            return []
+
+    def write_to_html(self, template_path, output_path, news_items, heading):
+        try:
+            with open(template_path, 'r') as f:
+                html_template = f.read()
+
+            news_html = '\n'.join(news_items)
+            html_output = html_template.replace('</body>', f'<h1>{heading}</h1>\n{news_html}\n</body>')
+
+            with open(output_path, 'a') as f:
+                f.write(html_output)
+        except IOError as e:
+            print(f"Error writing to file: {e}")
+        except Exception as e:
+            print(f"Unexpected error: {e}")