Clean up & add requirements.txt

2024-04-20 15:13:29 +03:00 · 2024-04-20 15:13:29 +03:00 · c284a84b13
parent 7f682e6daf
commit c284a84b13
2 changed files with 3 additions and 119 deletions
--- a/main.py.old
+++ b/main.py.old
@@ -1,119 +0,0 @@
-import os
-import requests
-import requests_html
-from requests_html import HTMLSession
-from bs4 import BeautifulSoup
-
-class NewsScraper:
-    def __init__(self, url, title_selector, time_selector, item_selector):
-        self.url = url
-        self.title_selector = title_selector
-        self.time_selector = time_selector
-        self.item_selector = item_selector
-
-    def scrape(self):
-        response = requests.get(self.url)
-        html = response.content
-        soup = BeautifulSoup(html, 'html.parser')
-
-        news_items = []
-
-        for item in soup.select(self.item_selector):
-            title_element = item.select_one(self.title_selector)
-            time_element = item.select_one(self.time_selector)
-
-            if title_element and time_element:
-                title = title_element.text.strip()
-                time = time_element.text.strip()
-                news_item_html = """
-                    <div class="news-item">
-                        <h2 class="news-title">{title}</h2>
-                        <p class="news-time">{time}</p>
-                    </div>
-                """.format(title=title, time=time)
-                news_items.append(news_item_html)
-
-        return news_items
-
-    def write_to_html(self, template_path, output_path, news_items, heading):
-        with open(template_path, 'r') as f:
-            html_template = f.read()
-
-        news_html = '\n'.join(news_items)
-        html_output = html_template.replace('</body>', f'<h1>{heading}</h1>\n{news_html}\n</body>')
-
-        with open(output_path, 'a') as f:
-            f.write(html_output)
-
-class RequestsHTMLNewsScraper(NewsScraper):
-    def __init__(self, url, title_selector, time_selector, item_selector, content_selector):
-        super().__init__(url, title_selector, time_selector, item_selector)
-        self.content_selector = content_selector
-
-    def scrape(self):
-        session = HTMLSession()
-        response = session.get(self.url)
-        response.html.render(timeout=20)  # Render the JavaScript content (adjust the timeout as needed)
-
-        soup = BeautifulSoup(response.html.html, 'html.parser')
-
-        news_items = []
-
-        for item in soup.select(self.item_selector):
-            title_element = item.select_one(self.title_selector)
-            time_element = item.select_one(self.time_selector)
-            content_element = item.select_one(self.content_selector)
-
-            if title_element and time_element and content_element:
-                title = title_element.text.strip()
-                time = time_element.text.strip()
-                content = content_element.decode_contents(formatter="html")
-                news_item_html = """
-                    <div class="news-item">
-                        <h2 class="news-title">{title}</h2>
-                        <p class="news-time">{time}</p>
-                        <div class="news-content">{content}</div>
-                    </div>
-                """.format(title=title, time=time, content=content)
-                news_items.append(news_item_html)
-
-        return news_items
-
-# Define the scrapers for each news source
-
-almayadeen_scraper = NewsScraper(
-    url="https://english.almayadeen.net/shortnews",
-    title_selector="h4",
-    time_selector="div.post-tag.day-time",
-    item_selector="div.item"
-)
-
-middleeasteye_scraper = NewsScraper(
-    url="https://www.middleeasteye.net/live/israels-war-gaza-live-israel-pounds-rafah-overnight-strikes",
-    title_selector=".views-field.views-field-title-1 span.field-content a",
-    time_selector=".views-field.views-field-changed span.field-content",
-    item_selector=".views-row"
-)
-
-aljazeera_scraper = RequestsHTMLNewsScraper(
-    url="https://www.aljazeera.com/news/liveblog/2024/4/20/israels-war-on-gaza-live-israelis-continue-intense-raid-on-nur-shams-camp",
-    title_selector="h2",
-    time_selector=".date-relative__time",
-    item_selector=".card-live",
-    content_selector=".wysiwyg-content"
-)
-
-
-# Scrape and write to HTML for each news source
-
-if os.path.exists('./index.html'):
-    os.remove('./index.html')
-
-news_items = almayadeen_scraper.scrape()
-almayadeen_scraper.write_to_html('./template.html', './index.html', news_items, 'Al Mayadeen')
-
-news_items = aljazeera_scraper.scrape()
-aljazeera_scraper.write_to_html('./template.html', './index.html', news_items, 'Al Jazeera')
-
-news_items = middleeasteye_scraper.scrape()
-aljazeera_scraper.write_to_html('./template.html', './index.html', news_items, 'Middle East Eye')
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+requests
+requests_html
+lxml_html_clean