init
This commit is contained in:
commit
f8548e32cc
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
env/
|
153
main.py
Normal file
153
main.py
Normal file
|
@ -0,0 +1,153 @@
|
|||
import os
|
||||
import requests
|
||||
import requests_html
|
||||
from requests_html import HTMLSession
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
class NewsScraper:
    """Scrape title/time (and optionally body content) items from a static news page.

    Each ``*_selector`` argument is a CSS selector; ``item_selector`` locates one
    story container, and the title/time selectors are applied within it.
    """

    def __init__(self, url, title_selector, time_selector, item_selector, show_content=False):
        self.url = url
        self.title_selector = title_selector
        self.time_selector = time_selector
        self.item_selector = item_selector
        # When True, scrape() also extracts the story body from a
        # <div class="field-content"> inside each item (if present).
        self.show_content = show_content

    @staticmethod
    def _format_item(title, time, content=None):
        """Render one story as an HTML fragment.

        The ``news-content`` div is emitted only when *content* is not None,
        so items without a body degrade gracefully to title + time.
        """
        if content is not None:
            return (
                '<div class="news-item">\n'
                f'  <h2 class="news-title">{title}</h2>\n'
                f'  <p class="news-time">{time}</p>\n'
                f'  <div class="news-content">{content}</div>\n'
                '</div>\n'
            )
        return (
            '<div class="news-item">\n'
            f'  <h2 class="news-title">{title}</h2>\n'
            f'  <p class="news-time">{time}</p>\n'
            '</div>\n'
        )

    def scrape(self):
        """Fetch ``self.url`` and return a list of HTML fragments, one per story.

        Items missing either a title or a time element are skipped.
        Raises ``requests.HTTPError`` on a non-2xx response.
        """
        # timeout= keeps a stalled server from hanging the whole run;
        # raise_for_status() stops us from scraping an HTTP error page as news.
        response = requests.get(self.url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        news_items = []
        for item in soup.select(self.item_selector):
            title_element = item.select_one(self.title_selector)
            time_element = item.select_one(self.time_selector)
            if not (title_element and time_element):
                continue

            title = title_element.text.strip()
            time = time_element.text.strip()

            content = None
            if self.show_content:
                content_element = item.find('div', class_='field-content')
                if content_element:
                    # Keep inner HTML (links, paragraphs), not just the text.
                    content = content_element.decode_contents(formatter="html")

            news_items.append(self._format_item(title, time, content))

        return news_items

    def write_to_html(self, template_path, output_path, news_items, heading):
        """Inject *heading* and the item fragments before </body> and append to *output_path*.

        NOTE: the file is opened in append mode, so each call adds one full copy
        of the template — the output contains one <html> document per source.
        """
        with open(template_path, 'r') as f:
            html_template = f.read()

        news_html = '\n'.join(news_items)
        html_output = html_template.replace('</body>', f'<h1>{heading}</h1>\n{news_html}\n</body>')

        with open(output_path, 'a') as f:
            f.write(html_output)
|
||||
|
||||
class RequestsHTMLNewsScraper(NewsScraper):
    """NewsScraper variant that renders JavaScript (via requests_html) before parsing.

    Unlike the base class, the story body is located with an explicit CSS
    selector (*content_selector*), and items missing any of title/time/content
    are skipped entirely.

    The duplicate ``write_to_html`` override was removed: it was byte-identical
    to the base-class implementation, which is now inherited.
    """

    def __init__(self, url, title_selector, time_selector, item_selector, content_selector, show_content=True):
        super().__init__(url, title_selector, time_selector, item_selector, show_content)
        # CSS selector for the story body within each item.
        self.content_selector = content_selector

    def scrape(self):
        """Fetch and JS-render ``self.url``; return one HTML fragment per complete item."""
        session = HTMLSession()
        response = session.get(self.url)
        response.html.render(timeout=20)  # Render the JavaScript content (adjust the timeout as needed)

        soup = BeautifulSoup(response.html.html, 'html.parser')

        news_items = []
        for item in soup.select(self.item_selector):
            title_element = item.select_one(self.title_selector)
            time_element = item.select_one(self.time_selector)
            content_element = item.select_one(self.content_selector)
            # All three pieces are required for this scraper.
            if not (title_element and time_element and content_element):
                continue

            title = title_element.text.strip()
            time = time_element.text.strip()
            # Keep inner HTML (links, paragraphs), not just the text.
            content = content_element.decode_contents(formatter="html")
            news_items.append(
                '<div class="news-item">\n'
                f'  <h2 class="news-title">{title}</h2>\n'
                f'  <p class="news-time">{time}</p>\n'
                f'  <div class="news-content">{content}</div>\n'
                '</div>\n'
            )

        return news_items
|
||||
|
||||
# Define the scrapers for each news source.

almayadeen_scraper = NewsScraper(
    url="https://english.almayadeen.net/shortnews",
    title_selector="h4",
    time_selector="div.post-tag.day-time",
    item_selector="div.item",
    show_content=False
)

middleeasteye_scraper = NewsScraper(
    url="https://www.middleeasteye.net/live/israels-war-gaza-live-israel-pounds-rafah-overnight-strikes",
    title_selector=".views-field.views-field-title-1 span.field-content a",
    time_selector=".views-field.views-field-changed span.field-content",
    item_selector=".views-row",
    show_content=True
)

aljazeera_scraper = RequestsHTMLNewsScraper(
    url="https://www.aljazeera.com/news/liveblog/2024/4/20/israels-war-on-gaza-live-israelis-continue-intense-raid-on-nur-shams-camp",
    title_selector="h2",
    time_selector=".date-relative__time",
    item_selector=".card-live",
    content_selector=".wysiwyg-content",
    show_content=True
)


def main():
    """Scrape every configured source and append each section to ./index.html."""
    # Start from a clean slate: write_to_html appends, so a stale index.html
    # would otherwise keep growing across runs.
    if os.path.exists('./index.html'):
        os.remove('./index.html')

    sources = [
        (almayadeen_scraper, 'Al Mayadeen News'),
        (middleeasteye_scraper, 'Middle East Eye News'),
        (aljazeera_scraper, 'Al Jazeera News'),
    ]
    for scraper, heading in sources:
        news_items = scraper.scrape()
        scraper.write_to_html('./template.html', './index.html', news_items, heading)


# Guard so importing this module (e.g. for testing) does not trigger
# network requests and file writes.
if __name__ == '__main__':
    main()
|
119
main.py.old
Normal file
119
main.py.old
Normal file
|
@ -0,0 +1,119 @@
|
|||
import os
|
||||
import requests
|
||||
import requests_html
|
||||
from requests_html import HTMLSession
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
class NewsScraper:
    """Scrape title/time pairs from a static news page using CSS selectors.

    ``item_selector`` locates one story container; the title/time selectors
    are applied within it.
    """

    def __init__(self, url, title_selector, time_selector, item_selector):
        self.url = url
        self.title_selector = title_selector
        self.time_selector = time_selector
        self.item_selector = item_selector

    def scrape(self):
        """Fetch ``self.url`` and return a list of '<div class="news-item">…' fragments.

        Items missing either a title or a time element are skipped.
        Raises ``requests.HTTPError`` on a non-2xx response.
        """
        # timeout= keeps a stalled server from hanging the whole run;
        # raise_for_status() stops us from scraping an HTTP error page as news.
        response = requests.get(self.url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        news_items = []
        for item in soup.select(self.item_selector):
            title_element = item.select_one(self.title_selector)
            time_element = item.select_one(self.time_selector)
            if not (title_element and time_element):
                continue

            title = title_element.text.strip()
            time = time_element.text.strip()
            news_items.append(
                '<div class="news-item">\n'
                f'  <h2 class="news-title">{title}</h2>\n'
                f'  <p class="news-time">{time}</p>\n'
                '</div>\n'
            )

        return news_items

    def write_to_html(self, template_path, output_path, news_items, heading):
        """Inject *heading* and the item fragments before </body> and append to *output_path*.

        Opened in append mode so successive sources accumulate in one output file.
        """
        with open(template_path, 'r') as f:
            html_template = f.read()

        news_html = '\n'.join(news_items)
        html_output = html_template.replace('</body>', f'<h1>{heading}</h1>\n{news_html}\n</body>')

        with open(output_path, 'a') as f:
            f.write(html_output)
|
||||
|
||||
class RequestsHTMLNewsScraper(NewsScraper):
    """NewsScraper variant that renders JavaScript (via requests_html) before parsing."""

    def __init__(self, url, title_selector, time_selector, item_selector, content_selector):
        super().__init__(url, title_selector, time_selector, item_selector)
        # Selector for the story body; an item without it is dropped.
        self.content_selector = content_selector

    def scrape(self):
        """Fetch + JS-render the page; return HTML fragments for complete items."""
        session = HTMLSession()
        response = session.get(self.url)
        response.html.render(timeout=20)  # Render the JavaScript content (adjust the timeout as needed)

        soup = BeautifulSoup(response.html.html, 'html.parser')

        fragments = []
        for node in soup.select(self.item_selector):
            pieces = (
                node.select_one(self.title_selector),
                node.select_one(self.time_selector),
                node.select_one(self.content_selector),
            )
            if not all(pieces):
                # Skip partial items: title, time, and content are all required.
                continue

            title_el, time_el, content_el = pieces
            title = title_el.text.strip()
            time = time_el.text.strip()
            content = content_el.decode_contents(formatter="html")
            fragments.append(
                '<div class="news-item">\n'
                f'  <h2 class="news-title">{title}</h2>\n'
                f'  <p class="news-time">{time}</p>\n'
                f'  <div class="news-content">{content}</div>\n'
                '</div>\n'
            )

        return fragments
|
||||
|
||||
# Define the scrapers for each news source.

almayadeen_scraper = NewsScraper(
    url="https://english.almayadeen.net/shortnews",
    title_selector="h4",
    time_selector="div.post-tag.day-time",
    item_selector="div.item"
)

middleeasteye_scraper = NewsScraper(
    url="https://www.middleeasteye.net/live/israels-war-gaza-live-israel-pounds-rafah-overnight-strikes",
    title_selector=".views-field.views-field-title-1 span.field-content a",
    time_selector=".views-field.views-field-changed span.field-content",
    item_selector=".views-row"
)

aljazeera_scraper = RequestsHTMLNewsScraper(
    url="https://www.aljazeera.com/news/liveblog/2024/4/20/israels-war-on-gaza-live-israelis-continue-intense-raid-on-nur-shams-camp",
    title_selector="h2",
    time_selector=".date-relative__time",
    item_selector=".card-live",
    content_selector=".wysiwyg-content"
)


# Scrape and write to HTML for each news source.

# Start from a clean slate: write_to_html appends, so a stale index.html
# would otherwise keep growing across runs.
if os.path.exists('./index.html'):
    os.remove('./index.html')

news_items = almayadeen_scraper.scrape()
almayadeen_scraper.write_to_html('./template.html', './index.html', news_items, 'Al Mayadeen')

news_items = aljazeera_scraper.scrape()
aljazeera_scraper.write_to_html('./template.html', './index.html', news_items, 'Al Jazeera')

news_items = middleeasteye_scraper.scrape()
# BUG FIX: these are Middle East Eye items, so write them via the
# middleeasteye scraper (the original mistakenly reused aljazeera_scraper;
# the output was identical only because write_to_html is inherited unchanged).
middleeasteye_scraper.write_to_html('./template.html', './index.html', news_items, 'Middle East Eye')
|
44
template.html
Normal file
44
template.html
Normal file
|
@ -0,0 +1,44 @@
|
|||
<!DOCTYPE html>
<!-- Base page template: main.py reads this file and injects a heading plus
     .news-item fragments immediately before </body> for each news source. -->
<html>
<head>
    <title>Scraped News</title>
    <style>
        /* Theme palette, overridable in one place. */
        :root {
            --color-background: #FFD4D4;
            --color-text: #272727;
            --highlight: rgba(225,0,0,.2)
        }

        html, body {
            background-color: var(--color-background);
            color: var(--color-text);
        }


        /* Centered single-column reading layout. */
        body {
            margin: 1em auto;
            max-width: 40em;
            padding: 0.62em 3.24em;
            font: 0.8em sans-serif;
            transition: color 0.5s, font-size 0.5s;
        }

        /* One scraped story (matches the fragments main.py generates). */
        .news-item {
            padding: 20px;
            border-bottom: 1px solid #ccc;
            margin-bottom: 20px;
        }

        .news-title {
            text-decoration: none;
            color: var(--color-text);
        }
        .news-time {
            font-size: 14px;
            color: #666;
        }
    </style>
</head>
<!-- Body is intentionally empty: content is inserted by the scraper. -->
<body>
</body>
</html>
|
Loading…
Reference in a new issue