Add an enable_js toggle and merge RequestsHTMLNewsScraper (JS-rendered) and NewsScraper (non-JS) into a single NewsScraper class
This commit is contained in:
parent
f8548e32cc
commit
53609fbb90
76
main.py
76
main.py
|
@ -5,16 +5,25 @@ from requests_html import HTMLSession
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
class NewsScraper:
|
class NewsScraper:
|
||||||
def __init__(self, url, title_selector, time_selector, item_selector, show_content=False):
|
def __init__(self, url, title_selector, time_selector, item_selector, show_content=False, enable_js=False, content_selector=None):
|
||||||
self.url = url
|
self.url = url
|
||||||
self.title_selector = title_selector
|
self.title_selector = title_selector
|
||||||
self.time_selector = time_selector
|
self.time_selector = time_selector
|
||||||
self.item_selector = item_selector
|
self.item_selector = item_selector
|
||||||
self.show_content = show_content
|
self.show_content = show_content
|
||||||
|
self.enable_js = enable_js
|
||||||
|
self.content_selector = content_selector
|
||||||
|
|
||||||
def scrape(self):
|
def scrape(self):
|
||||||
response = requests.get(self.url)
|
if self.enable_js:
|
||||||
html = response.content
|
session = HTMLSession()
|
||||||
|
response = session.get(self.url)
|
||||||
|
response.html.render(timeout=20) # Render the JavaScript content (adjust the timeout as needed)
|
||||||
|
html = response.html.html
|
||||||
|
else:
|
||||||
|
response = requests.get(self.url)
|
||||||
|
html = response.content
|
||||||
|
|
||||||
soup = BeautifulSoup(html, 'html.parser')
|
soup = BeautifulSoup(html, 'html.parser')
|
||||||
|
|
||||||
news_items = []
|
news_items = []
|
||||||
|
@ -28,7 +37,11 @@ class NewsScraper:
|
||||||
time = time_element.text.strip()
|
time = time_element.text.strip()
|
||||||
|
|
||||||
if self.show_content:
|
if self.show_content:
|
||||||
content_element = item.find('div', class_='field-content')
|
if self.content_selector:
|
||||||
|
content_element = item.select_one(self.content_selector)
|
||||||
|
else:
|
||||||
|
content_element = item.find('div', class_='field-content')
|
||||||
|
|
||||||
if content_element:
|
if content_element:
|
||||||
content = content_element.decode_contents(formatter="html")
|
content = content_element.decode_contents(formatter="html")
|
||||||
news_item_html = """
|
news_item_html = """
|
||||||
|
@ -67,50 +80,6 @@ class NewsScraper:
|
||||||
with open(output_path, 'a') as f:
|
with open(output_path, 'a') as f:
|
||||||
f.write(html_output)
|
f.write(html_output)
|
||||||
|
|
||||||
class RequestsHTMLNewsScraper(NewsScraper):
|
|
||||||
def __init__(self, url, title_selector, time_selector, item_selector, content_selector, show_content=True):
|
|
||||||
super().__init__(url, title_selector, time_selector, item_selector, show_content)
|
|
||||||
self.content_selector = content_selector
|
|
||||||
|
|
||||||
def scrape(self):
|
|
||||||
session = HTMLSession()
|
|
||||||
response = session.get(self.url)
|
|
||||||
response.html.render(timeout=20) # Render the JavaScript content (adjust the timeout as needed)
|
|
||||||
|
|
||||||
soup = BeautifulSoup(response.html.html, 'html.parser')
|
|
||||||
|
|
||||||
news_items = []
|
|
||||||
|
|
||||||
for item in soup.select(self.item_selector):
|
|
||||||
title_element = item.select_one(self.title_selector)
|
|
||||||
time_element = item.select_one(self.time_selector)
|
|
||||||
content_element = item.select_one(self.content_selector)
|
|
||||||
|
|
||||||
if title_element and time_element and content_element:
|
|
||||||
title = title_element.text.strip()
|
|
||||||
time = time_element.text.strip()
|
|
||||||
content = content_element.decode_contents(formatter="html")
|
|
||||||
news_item_html = """
|
|
||||||
<div class="news-item">
|
|
||||||
<h2 class="news-title">{title}</h2>
|
|
||||||
<p class="news-time">{time}</p>
|
|
||||||
<div class="news-content">{content}</div>
|
|
||||||
</div>
|
|
||||||
""".format(title=title, time=time, content=content)
|
|
||||||
news_items.append(news_item_html)
|
|
||||||
|
|
||||||
return news_items
|
|
||||||
|
|
||||||
def write_to_html(self, template_path, output_path, news_items, heading):
|
|
||||||
with open(template_path, 'r') as f:
|
|
||||||
html_template = f.read()
|
|
||||||
|
|
||||||
news_html = '\n'.join(news_items)
|
|
||||||
html_output = html_template.replace('</body>', f'<h1>{heading}</h1>\n{news_html}\n</body>')
|
|
||||||
|
|
||||||
with open(output_path, 'a') as f:
|
|
||||||
f.write(html_output)
|
|
||||||
|
|
||||||
# Define the scrapers for each news source
|
# Define the scrapers for each news source
|
||||||
|
|
||||||
almayadeen_scraper = NewsScraper(
|
almayadeen_scraper = NewsScraper(
|
||||||
|
@ -118,7 +87,8 @@ almayadeen_scraper = NewsScraper(
|
||||||
title_selector="h4",
|
title_selector="h4",
|
||||||
time_selector="div.post-tag.day-time",
|
time_selector="div.post-tag.day-time",
|
||||||
item_selector="div.item",
|
item_selector="div.item",
|
||||||
show_content=False
|
show_content=False,
|
||||||
|
enable_js=False
|
||||||
)
|
)
|
||||||
|
|
||||||
middleeasteye_scraper = NewsScraper(
|
middleeasteye_scraper = NewsScraper(
|
||||||
|
@ -126,16 +96,18 @@ middleeasteye_scraper = NewsScraper(
|
||||||
title_selector=".views-field.views-field-title-1 span.field-content a",
|
title_selector=".views-field.views-field-title-1 span.field-content a",
|
||||||
time_selector=".views-field.views-field-changed span.field-content",
|
time_selector=".views-field.views-field-changed span.field-content",
|
||||||
item_selector=".views-row",
|
item_selector=".views-row",
|
||||||
show_content=True
|
show_content=True,
|
||||||
|
enable_js=False
|
||||||
)
|
)
|
||||||
|
|
||||||
aljazeera_scraper = RequestsHTMLNewsScraper(
|
aljazeera_scraper = NewsScraper(
|
||||||
url="https://www.aljazeera.com/news/liveblog/2024/4/20/israels-war-on-gaza-live-israelis-continue-intense-raid-on-nur-shams-camp",
|
url="https://www.aljazeera.com/news/liveblog/2024/4/20/israels-war-on-gaza-live-israelis-continue-intense-raid-on-nur-shams-camp",
|
||||||
title_selector="h2",
|
title_selector="h2",
|
||||||
time_selector=".date-relative__time",
|
time_selector=".date-relative__time",
|
||||||
item_selector=".card-live",
|
item_selector=".card-live",
|
||||||
content_selector=".wysiwyg-content",
|
content_selector=".wysiwyg-content",
|
||||||
show_content=True
|
show_content=True,
|
||||||
|
enable_js=True
|
||||||
)
|
)
|
||||||
|
|
||||||
# Scrape and write to HTML for each news source
|
# Scrape and write to HTML for each news source
|
||||||
|
|
Loading…
Reference in a new issue