From 53609fbb90e1c1d1e63fa9567b703767499d17c5 Mon Sep 17 00:00:00 2001 From: lelkins Date: Sat, 20 Apr 2024 11:34:06 +0300 Subject: [PATCH] add js toggle and merge RequestsHTMLNewsScraper(js) and NewsScraper(non js) into NewsScraper --- main.py | 76 ++++++++++++++++++--------------------------------------- 1 file changed, 24 insertions(+), 52 deletions(-) diff --git a/main.py b/main.py index f37e80b..fb403d7 100644 --- a/main.py +++ b/main.py @@ -5,16 +5,25 @@ from requests_html import HTMLSession from bs4 import BeautifulSoup class NewsScraper: - def __init__(self, url, title_selector, time_selector, item_selector, show_content=False): + def __init__(self, url, title_selector, time_selector, item_selector, show_content=False, enable_js=False, content_selector=None): self.url = url self.title_selector = title_selector self.time_selector = time_selector self.item_selector = item_selector self.show_content = show_content + self.enable_js = enable_js + self.content_selector = content_selector def scrape(self): - response = requests.get(self.url) - html = response.content + if self.enable_js: + session = HTMLSession() + response = session.get(self.url) + response.html.render(timeout=20) # Render the JavaScript content (adjust the timeout as needed) + html = response.html.html + else: + response = requests.get(self.url) + html = response.content + soup = BeautifulSoup(html, 'html.parser') news_items = [] @@ -28,7 +37,11 @@ class NewsScraper: time = time_element.text.strip() if self.show_content: - content_element = item.find('div', class_='field-content') + if self.content_selector: + content_element = item.select_one(self.content_selector) + else: + content_element = item.find('div', class_='field-content') + if content_element: content = content_element.decode_contents(formatter="html") news_item_html = """ @@ -67,50 +80,6 @@ class NewsScraper: with open(output_path, 'a') as f: f.write(html_output) -class RequestsHTMLNewsScraper(NewsScraper): - def __init__(self, url, title_selector, time_selector, item_selector, content_selector, show_content=True): - super().__init__(url, title_selector, time_selector, item_selector, show_content) - self.content_selector = content_selector - - def scrape(self): - session = HTMLSession() - response = session.get(self.url) - response.html.render(timeout=20) # Render the JavaScript content (adjust the timeout as needed) - - soup = BeautifulSoup(response.html.html, 'html.parser') - - news_items = [] - - for item in soup.select(self.item_selector): - title_element = item.select_one(self.title_selector) - time_element = item.select_one(self.time_selector) - content_element = item.select_one(self.content_selector) - - if title_element and time_element and content_element: - title = title_element.text.strip() - time = time_element.text.strip() - content = content_element.decode_contents(formatter="html") - news_item_html = """ -
-

{title}

-

{time}

-
{content}
-
- """.format(title=title, time=time, content=content) - news_items.append(news_item_html) - - return news_items - - def write_to_html(self, template_path, output_path, news_items, heading): - with open(template_path, 'r') as f: - html_template = f.read() - - news_html = '\n'.join(news_items) - html_output = html_template.replace('', f'

{heading}

\n{news_html}\n') - - with open(output_path, 'a') as f: - f.write(html_output) - # Define the scrapers for each news source almayadeen_scraper = NewsScraper( @@ -118,7 +87,8 @@ almayadeen_scraper = NewsScraper( title_selector="h4", time_selector="div.post-tag.day-time", item_selector="div.item", - show_content=False + show_content=False, + enable_js=False ) middleeasteye_scraper = NewsScraper( @@ -126,16 +96,18 @@ middleeasteye_scraper = NewsScraper( title_selector=".views-field.views-field-title-1 span.field-content a", time_selector=".views-field.views-field-changed span.field-content", item_selector=".views-row", - show_content=True + show_content=True, + enable_js=False ) -aljazeera_scraper = RequestsHTMLNewsScraper( +aljazeera_scraper = NewsScraper( url="https://www.aljazeera.com/news/liveblog/2024/4/20/israels-war-on-gaza-live-israelis-continue-intense-raid-on-nur-shams-camp", title_selector="h2", time_selector=".date-relative__time", item_selector=".card-live", content_selector=".wysiwyg-content", - show_content=True + show_content=True, + enable_js=True ) # Scrape and write to HTML for each news source