diff --git a/main.py b/main.py index f37e80b..fb403d7 100644 --- a/main.py +++ b/main.py @@ -5,16 +5,25 @@ from requests_html import HTMLSession from bs4 import BeautifulSoup class NewsScraper: - def __init__(self, url, title_selector, time_selector, item_selector, show_content=False): + def __init__(self, url, title_selector, time_selector, item_selector, show_content=False, enable_js=False, content_selector=None): self.url = url self.title_selector = title_selector self.time_selector = time_selector self.item_selector = item_selector self.show_content = show_content + self.enable_js = enable_js + self.content_selector = content_selector def scrape(self): - response = requests.get(self.url) - html = response.content + if self.enable_js: + session = HTMLSession() + response = session.get(self.url) + response.html.render(timeout=20) # Render the JavaScript content (adjust the timeout as needed) + html = response.html.html + else: + response = requests.get(self.url) + html = response.content + soup = BeautifulSoup(html, 'html.parser') news_items = [] @@ -28,7 +37,11 @@ class NewsScraper: time = time_element.text.strip() if self.show_content: - content_element = item.find('div', class_='field-content') + if self.content_selector: + content_element = item.select_one(self.content_selector) + else: + content_element = item.find('div', class_='field-content') + if content_element: content = content_element.decode_contents(formatter="html") news_item_html = """ @@ -67,50 +80,6 @@ class NewsScraper: with open(output_path, 'a') as f: f.write(html_output) -class RequestsHTMLNewsScraper(NewsScraper): - def __init__(self, url, title_selector, time_selector, item_selector, content_selector, show_content=True): - super().__init__(url, title_selector, time_selector, item_selector, show_content) - self.content_selector = content_selector - - def scrape(self): - session = HTMLSession() - response = session.get(self.url) - response.html.render(timeout=20) # Render the JavaScript content (adjust the timeout as needed) - - soup = BeautifulSoup(response.html.html, 'html.parser') - - news_items = [] - - for item in soup.select(self.item_selector): - title_element = item.select_one(self.title_selector) - time_element = item.select_one(self.time_selector) - content_element = item.select_one(self.content_selector) - - if title_element and time_element and content_element: - title = title_element.text.strip() - time = time_element.text.strip() - content = content_element.decode_contents(formatter="html") - news_item_html = """ -
{time}
-