From 53609fbb90e1c1d1e63fa9567b703767499d17c5 Mon Sep 17 00:00:00 2001
From: lelkins <lelkins@noreply.altesq.net>
Date: Sat, 20 Apr 2024 11:34:06 +0300
Subject: [PATCH] Add JS toggle and merge RequestsHTMLNewsScraper (JS) and
 NewsScraper (non-JS) into NewsScraper

---
 main.py | 76 ++++++++++++++++++---------------------------------------
 1 file changed, 24 insertions(+), 52 deletions(-)
diff --git a/main.py b/main.py
index f37e80b..fb403d7 100644
--- a/main.py
+++ b/main.py
@@ -5,16 +5,25 @@ from requests_html import HTMLSession
 from bs4 import BeautifulSoup
 
 class NewsScraper:
-    def __init__(self, url, title_selector, time_selector, item_selector, show_content=False):
+    def __init__(self, url, title_selector, time_selector, item_selector, show_content=False, enable_js=False, content_selector=None):
         self.url = url
         self.title_selector = title_selector
         self.time_selector = time_selector
         self.item_selector = item_selector
         self.show_content = show_content
+        self.enable_js = enable_js
+        self.content_selector = content_selector
 
     def scrape(self):
-        response = requests.get(self.url)
-        html = response.content
+        if self.enable_js:
+            session = HTMLSession()
+            response = session.get(self.url)
+            response.html.render(timeout=20)  # Render the JavaScript content (adjust the timeout as needed)
+            html = response.html.html
+        else:
+            response = requests.get(self.url)
+            html = response.content
+
         soup = BeautifulSoup(html, 'html.parser')
 
         news_items = []
@@ -28,7 +37,11 @@ class NewsScraper:
                 time = time_element.text.strip()
 
                 if self.show_content:
-                    content_element = item.find('div', class_='field-content')
+                    if self.content_selector:
+                        content_element = item.select_one(self.content_selector)
+                    else:
+                        content_element = item.find('div', class_='field-content')
+
                     if content_element:
                         content = content_element.decode_contents(formatter="html")
                         news_item_html = """
@@ -67,50 +80,6 @@ class NewsScraper:
         with open(output_path, 'a') as f:
             f.write(html_output)
 
-class RequestsHTMLNewsScraper(NewsScraper):
-    def __init__(self, url, title_selector, time_selector, item_selector, content_selector, show_content=True):
-        super().__init__(url, title_selector, time_selector, item_selector, show_content)
-        self.content_selector = content_selector
-
-    def scrape(self):
-        session = HTMLSession()
-        response = session.get(self.url)
-        response.html.render(timeout=20)  # Render the JavaScript content (adjust the timeout as needed)
-
-        soup = BeautifulSoup(response.html.html, 'html.parser')
-
-        news_items = []
-
-        for item in soup.select(self.item_selector):
-            title_element = item.select_one(self.title_selector)
-            time_element = item.select_one(self.time_selector)
-            content_element = item.select_one(self.content_selector)
-
-            if title_element and time_element and content_element:
-                title = title_element.text.strip()
-                time = time_element.text.strip()
-                content = content_element.decode_contents(formatter="html")
-                news_item_html = """
-                    <div class="news-item">
-                        <h2 class="news-title">{title}</h2>
-                        <p class="news-time">{time}</p>
-                        <div class="news-content">{content}</div>
-                    </div>
-                """.format(title=title, time=time, content=content)
-                news_items.append(news_item_html)
-
-        return news_items
-
-    def write_to_html(self, template_path, output_path, news_items, heading):
-        with open(template_path, 'r') as f:
-            html_template = f.read()
-
-        news_html = '\n'.join(news_items)
-        html_output = html_template.replace('</body>', f'<h1>{heading}</h1>\n{news_html}\n</body>')
-
-        with open(output_path, 'a') as f:
-            f.write(html_output)
-
 # Define the scrapers for each news source
 
 almayadeen_scraper = NewsScraper(
@@ -118,7 +87,8 @@ almayadeen_scraper = NewsScraper(
     title_selector="h4",
     time_selector="div.post-tag.day-time",
     item_selector="div.item",
-    show_content=False
+    show_content=False,
+    enable_js=False
 )
 
 middleeasteye_scraper = NewsScraper(
@@ -126,16 +96,18 @@ middleeasteye_scraper = NewsScraper(
     title_selector=".views-field.views-field-title-1 span.field-content a",
     time_selector=".views-field.views-field-changed span.field-content",
     item_selector=".views-row",
-    show_content=True
+    show_content=True,
+    enable_js=False
 )
 
-aljazeera_scraper = RequestsHTMLNewsScraper(
+aljazeera_scraper = NewsScraper(
     url="https://www.aljazeera.com/news/liveblog/2024/4/20/israels-war-on-gaza-live-israelis-continue-intense-raid-on-nur-shams-camp",
     title_selector="h2",
     time_selector=".date-relative__time",
     item_selector=".card-live",
     content_selector=".wysiwyg-content",
-    show_content=True
+    show_content=True,
+    enable_js=True
 )
 
 # Scrape and write to HTML for each news source