init
This commit is contained in:
commit
f8548e32cc
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
env/
|
153
main.py
Normal file
153
main.py
Normal file
|
@ -0,0 +1,153 @@
|
|||
import os
|
||||
import requests
|
||||
import requests_html
|
||||
from requests_html import HTMLSession
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
class NewsScraper:
    """Scrape title/time (and optionally body content) items from a static news page.

    Each ``*_selector`` argument is a CSS selector; ``item_selector`` locates one
    story container, and the title/time selectors are applied within it.
    """

    def __init__(self, url, title_selector, time_selector, item_selector, show_content=False):
        self.url = url
        self.title_selector = title_selector
        self.time_selector = time_selector
        self.item_selector = item_selector
        # When True, scrape() also extracts the story body from a
        # <div class="field-content"> inside each item (if present).
        self.show_content = show_content

    @staticmethod
    def _format_item(title, time, content=None):
        """Render one story as an HTML fragment.

        The ``news-content`` div is emitted only when *content* is not None,
        so items without a body degrade gracefully to title + time.
        """
        if content is not None:
            return (
                '<div class="news-item">\n'
                f'  <h2 class="news-title">{title}</h2>\n'
                f'  <p class="news-time">{time}</p>\n'
                f'  <div class="news-content">{content}</div>\n'
                '</div>\n'
            )
        return (
            '<div class="news-item">\n'
            f'  <h2 class="news-title">{title}</h2>\n'
            f'  <p class="news-time">{time}</p>\n'
            '</div>\n'
        )

    def scrape(self):
        """Fetch ``self.url`` and return a list of HTML fragments, one per story.

        Items missing either a title or a time element are skipped.
        Raises ``requests.HTTPError`` on a non-2xx response.
        """
        # timeout= keeps a stalled server from hanging the whole run;
        # raise_for_status() stops us from scraping an HTTP error page as news.
        response = requests.get(self.url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        news_items = []
        for item in soup.select(self.item_selector):
            title_element = item.select_one(self.title_selector)
            time_element = item.select_one(self.time_selector)
            if not (title_element and time_element):
                continue

            title = title_element.text.strip()
            time = time_element.text.strip()

            content = None
            if self.show_content:
                content_element = item.find('div', class_='field-content')
                if content_element:
                    # Keep inner HTML (links, paragraphs), not just the text.
                    content = content_element.decode_contents(formatter="html")

            news_items.append(self._format_item(title, time, content))

        return news_items

    def write_to_html(self, template_path, output_path, news_items, heading):
        """Inject *heading* and the item fragments before </body> and append to *output_path*.

        NOTE: the file is opened in append mode, so each call adds one full copy
        of the template — the output contains one <html> document per source.
        """
        with open(template_path, 'r') as f:
            html_template = f.read()

        news_html = '\n'.join(news_items)
        html_output = html_template.replace('</body>', f'<h1>{heading}</h1>\n{news_html}\n</body>')

        with open(output_path, 'a') as f:
            f.write(html_output)
|
||||
|
||||
class RequestsHTMLNewsScraper(NewsScraper):
    """NewsScraper variant that renders JavaScript (via requests_html) before parsing.

    Unlike the base class, the story body is located with an explicit CSS
    selector (*content_selector*), and items missing any of title/time/content
    are skipped entirely.

    The duplicate ``write_to_html`` override was removed: it was byte-identical
    to the base-class implementation, which is now inherited.
    """

    def __init__(self, url, title_selector, time_selector, item_selector, content_selector, show_content=True):
        super().__init__(url, title_selector, time_selector, item_selector, show_content)
        # CSS selector for the story body within each item.
        self.content_selector = content_selector

    def scrape(self):
        """Fetch and JS-render ``self.url``; return one HTML fragment per complete item."""
        session = HTMLSession()
        response = session.get(self.url)
        response.html.render(timeout=20)  # Render the JavaScript content (adjust the timeout as needed)

        soup = BeautifulSoup(response.html.html, 'html.parser')

        news_items = []
        for item in soup.select(self.item_selector):
            title_element = item.select_one(self.title_selector)
            time_element = item.select_one(self.time_selector)
            content_element = item.select_one(self.content_selector)
            # All three pieces are required for this scraper.
            if not (title_element and time_element and content_element):
                continue

            title = title_element.text.strip()
            time = time_element.text.strip()
            # Keep inner HTML (links, paragraphs), not just the text.
            content = content_element.decode_contents(formatter="html")
            news_items.append(
                '<div class="news-item">\n'
                f'  <h2 class="news-title">{title}</h2>\n'
                f'  <p class="news-time">{time}</p>\n'
                f'  <div class="news-content">{content}</div>\n'
                '</div>\n'
            )

        return news_items
|
||||
|
||||
# Define the scrapers for each news source.

almayadeen_scraper = NewsScraper(
    url="https://english.almayadeen.net/shortnews",
    title_selector="h4",
    time_selector="div.post-tag.day-time",
    item_selector="div.item",
    show_content=False
)

middleeasteye_scraper = NewsScraper(
    url="https://www.middleeasteye.net/live/israels-war-gaza-live-israel-pounds-rafah-overnight-strikes",
    title_selector=".views-field.views-field-title-1 span.field-content a",
    time_selector=".views-field.views-field-changed span.field-content",
    item_selector=".views-row",
    show_content=True
)

aljazeera_scraper = RequestsHTMLNewsScraper(
    url="https://www.aljazeera.com/news/liveblog/2024/4/20/israels-war-on-gaza-live-israelis-continue-intense-raid-on-nur-shams-camp",
    title_selector="h2",
    time_selector=".date-relative__time",
    item_selector=".card-live",
    content_selector=".wysiwyg-content",
    show_content=True
)


def main():
    """Scrape every configured source and append each section to ./index.html."""
    # Start from a clean slate: write_to_html appends, so a stale index.html
    # would otherwise keep growing across runs.
    if os.path.exists('./index.html'):
        os.remove('./index.html')

    sources = [
        (almayadeen_scraper, 'Al Mayadeen News'),
        (middleeasteye_scraper, 'Middle East Eye News'),
        (aljazeera_scraper, 'Al Jazeera News'),
    ]
    for scraper, heading in sources:
        news_items = scraper.scrape()
        scraper.write_to_html('./template.html', './index.html', news_items, heading)


# Guard so importing this module (e.g. for testing) does not trigger
# network requests and file writes.
if __name__ == '__main__':
    main()
|
119
main.py.old
Normal file
119
main.py.old
Normal file
|
@ -0,0 +1,119 @@
|
|||
import os
|
||||
import requests
|
||||
import requests_html
|
||||
from requests_html import HTMLSession
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
class NewsScraper:
    """Scrape title/time pairs from a static news page using CSS selectors.

    ``item_selector`` locates one story container; the title/time selectors
    are applied within it.
    """

    def __init__(self, url, title_selector, time_selector, item_selector):
        self.url = url
        self.title_selector = title_selector
        self.time_selector = time_selector
        self.item_selector = item_selector

    def scrape(self):
        """Fetch ``self.url`` and return a list of '<div class="news-item">…' fragments.

        Items missing either a title or a time element are skipped.
        Raises ``requests.HTTPError`` on a non-2xx response.
        """
        # timeout= keeps a stalled server from hanging the whole run;
        # raise_for_status() stops us from scraping an HTTP error page as news.
        response = requests.get(self.url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        news_items = []
        for item in soup.select(self.item_selector):
            title_element = item.select_one(self.title_selector)
            time_element = item.select_one(self.time_selector)
            if not (title_element and time_element):
                continue

            title = title_element.text.strip()
            time = time_element.text.strip()
            news_items.append(
                '<div class="news-item">\n'
                f'  <h2 class="news-title">{title}</h2>\n'
                f'  <p class="news-time">{time}</p>\n'
                '</div>\n'
            )

        return news_items

    def write_to_html(self, template_path, output_path, news_items, heading):
        """Inject *heading* and the item fragments before </body> and append to *output_path*.

        Opened in append mode so successive sources accumulate in one output file.
        """
        with open(template_path, 'r') as f:
            html_template = f.read()

        news_html = '\n'.join(news_items)
        html_output = html_template.replace('</body>', f'<h1>{heading}</h1>\n{news_html}\n</body>')

        with open(output_path, 'a') as f:
            f.write(html_output)
|
||||
|
||||
class RequestsHTMLNewsScraper(NewsScraper):
    """NewsScraper variant that renders JavaScript (via requests_html) before parsing."""

    def __init__(self, url, title_selector, time_selector, item_selector, content_selector):
        super().__init__(url, title_selector, time_selector, item_selector)
        # Selector for the story body; an item without it is dropped.
        self.content_selector = content_selector

    def scrape(self):
        """Fetch + JS-render the page; return HTML fragments for complete items."""
        session = HTMLSession()
        response = session.get(self.url)
        response.html.render(timeout=20)  # Render the JavaScript content (adjust the timeout as needed)

        soup = BeautifulSoup(response.html.html, 'html.parser')

        fragments = []
        for node in soup.select(self.item_selector):
            pieces = (
                node.select_one(self.title_selector),
                node.select_one(self.time_selector),
                node.select_one(self.content_selector),
            )
            if not all(pieces):
                # Skip partial items: title, time, and content are all required.
                continue

            title_el, time_el, content_el = pieces
            title = title_el.text.strip()
            time = time_el.text.strip()
            content = content_el.decode_contents(formatter="html")
            fragments.append(
                '<div class="news-item">\n'
                f'  <h2 class="news-title">{title}</h2>\n'
                f'  <p class="news-time">{time}</p>\n'
                f'  <div class="news-content">{content}</div>\n'
                '</div>\n'
            )

        return fragments
|
||||
|
||||
# Define the scrapers for each news source.

almayadeen_scraper = NewsScraper(
    url="https://english.almayadeen.net/shortnews",
    title_selector="h4",
    time_selector="div.post-tag.day-time",
    item_selector="div.item"
)

middleeasteye_scraper = NewsScraper(
    url="https://www.middleeasteye.net/live/israels-war-gaza-live-israel-pounds-rafah-overnight-strikes",
    title_selector=".views-field.views-field-title-1 span.field-content a",
    time_selector=".views-field.views-field-changed span.field-content",
    item_selector=".views-row"
)

aljazeera_scraper = RequestsHTMLNewsScraper(
    url="https://www.aljazeera.com/news/liveblog/2024/4/20/israels-war-on-gaza-live-israelis-continue-intense-raid-on-nur-shams-camp",
    title_selector="h2",
    time_selector=".date-relative__time",
    item_selector=".card-live",
    content_selector=".wysiwyg-content"
)


# Scrape and write to HTML for each news source.

# Start from a clean slate: write_to_html appends, so a stale index.html
# would otherwise keep growing across runs.
if os.path.exists('./index.html'):
    os.remove('./index.html')

news_items = almayadeen_scraper.scrape()
almayadeen_scraper.write_to_html('./template.html', './index.html', news_items, 'Al Mayadeen')

news_items = aljazeera_scraper.scrape()
aljazeera_scraper.write_to_html('./template.html', './index.html', news_items, 'Al Jazeera')

news_items = middleeasteye_scraper.scrape()
# BUG FIX: these are Middle East Eye items, so write them via the
# middleeasteye scraper (the original mistakenly reused aljazeera_scraper;
# the output was identical only because write_to_html is inherited unchanged).
middleeasteye_scraper.write_to_html('./template.html', './index.html', news_items, 'Middle East Eye')
|
44
template.html
Normal file
44
template.html
Normal file
|
@ -0,0 +1,44 @@
|
|||
<!DOCTYPE html>
<!-- Base page template: main.py reads this file and injects a heading plus
     .news-item fragments immediately before </body> for each news source. -->
<html>
<head>
    <title>Scraped News</title>
    <style>
        /* Theme palette, overridable in one place. */
        :root {
            --color-background: #FFD4D4;
            --color-text: #272727;
            --highlight: rgba(225,0,0,.2)
        }

        html, body {
            background-color: var(--color-background);
            color: var(--color-text);
        }


        /* Centered single-column reading layout. */
        body {
            margin: 1em auto;
            max-width: 40em;
            padding: 0.62em 3.24em;
            font: 0.8em sans-serif;
            transition: color 0.5s, font-size 0.5s;
        }

        /* One scraped story (matches the fragments main.py generates). */
        .news-item {
            padding: 20px;
            border-bottom: 1px solid #ccc;
            margin-bottom: 20px;
        }

        .news-title {
            text-decoration: none;
            color: var(--color-text);
        }
        .news-time {
            font-size: 14px;
            color: #666;
        }
    </style>
</head>
<!-- Body is intentionally empty: content is inserted by the scraper. -->
<body>
</body>
</html>
|
Loading…
Reference in a new issue