add error handler by internal request
parent 53609fbb90
commit e42f68947e

main.py | 105
@@ -15,42 +15,50 @@ class NewsScraper:
         self.content_selector = content_selector

     def scrape(self):
-        if self.enable_js:
-            session = HTMLSession()
-            response = session.get(self.url)
-            response.html.render(timeout=20)  # Render the JavaScript content (adjust the timeout as needed)
-            html = response.html.html
-        else:
-            response = requests.get(self.url)
-            html = response.content
+        try:
+            if self.enable_js:
+                session = HTMLSession()
+                response = session.get(self.url)
+                response.html.render(timeout=20)  # Render the JavaScript content (adjust the timeout as needed)
+                html = response.html.html
+            else:
+                response = requests.get(self.url)
+                html = response.content

-        soup = BeautifulSoup(html, 'html.parser')
+            soup = BeautifulSoup(html, 'html.parser')

-        news_items = []
+            news_items = []

-        for item in soup.select(self.item_selector):
-            title_element = item.select_one(self.title_selector)
-            time_element = item.select_one(self.time_selector)
+            for item in soup.select(self.item_selector):
+                title_element = item.select_one(self.title_selector)
+                time_element = item.select_one(self.time_selector)

-            if title_element and time_element:
-                title = title_element.text.strip()
-                time = time_element.text.strip()
+                if title_element and time_element:
+                    title = title_element.text.strip()
+                    time = time_element.text.strip()

-                if self.show_content:
-                    if self.content_selector:
-                        content_element = item.select_one(self.content_selector)
-                    else:
-                        content_element = item.find('div', class_='field-content')
+                    if self.show_content:
+                        if self.content_selector:
+                            content_element = item.select_one(self.content_selector)
+                        else:
+                            content_element = item.find('div', class_='field-content')

-                    if content_element:
-                        content = content_element.decode_contents(formatter="html")
-                        news_item_html = """
-                        <div class="news-item">
-                            <h2 class="news-title">{title}</h2>
-                            <p class="news-time">{time}</p>
-                            <div class="news-content">{content}</div>
-                        </div>
-                        """.format(title=title, time=time, content=content)
+                        if content_element:
+                            content = content_element.decode_contents(formatter="html")
+                            news_item_html = """
+                            <div class="news-item">
+                                <h2 class="news-title">{title}</h2>
+                                <p class="news-time">{time}</p>
+                                <div class="news-content">{content}</div>
+                            </div>
+                            """.format(title=title, time=time, content=content)
-                    else:
-                        news_item_html = """
-                        <div class="news-item">
-                            <h2 class="news-title">{title}</h2>
-                            <p class="news-time">{time}</p>
-                        </div>
-                        """.format(title=title, time=time)
+                        else:
+                            news_item_html = """
+                            <div class="news-item">
@@ -58,27 +66,34 @@ class NewsScraper:
                             <p class="news-time">{time}</p>
                         </div>
                         """.format(title=title, time=time)
                 else:
                     news_item_html = """
                     <div class="news-item">
                         <h2 class="news-title">{title}</h2>
                         <p class="news-time">{time}</p>
                     </div>
                     """.format(title=title, time=time)

-                news_items.append(news_item_html)
+                    news_items.append(news_item_html)

-        return news_items
+            return news_items
+        except requests.exceptions.RequestException as e:
+            print(f"Error fetching URL: {e}")
+            return []
+        except AttributeError as e:
+            print(f"Error parsing HTML: {e}")
+            return []
+        except Exception as e:
+            print(f"Unexpected error: {e}")
+            return []

     def write_to_html(self, template_path, output_path, news_items, heading):
-        with open(template_path, 'r') as f:
-            html_template = f.read()
+        try:
+            with open(template_path, 'r') as f:
+                html_template = f.read()

-        news_html = '\n'.join(news_items)
-        html_output = html_template.replace('</body>', f'<h1>{heading}</h1>\n{news_html}\n</body>')
+            news_html = '\n'.join(news_items)
+            html_output = html_template.replace('</body>', f'<h1>{heading}</h1>\n{news_html}\n</body>')

-        with open(output_path, 'a') as f:
-            f.write(html_output)
+            with open(output_path, 'a') as f:
+                f.write(html_output)
+        except IOError as e:
+            print(f"Error writing to file: {e}")
+        except Exception as e:
+            print(f"Unexpected error: {e}")

 # Define the scrapers for each news source
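For context, a minimal usage sketch (not part of this commit): it assumes a NewsScraper constructor that accepts the URL and the CSS selectors referenced above, which the diff itself does not show. With the new handlers, scrape() returns an empty list on request or parsing errors instead of raising, and write_to_html() logs file errors rather than propagating them. The URL, selectors, and file paths below are hypothetical.

# Hypothetical usage sketch; the constructor signature, URL, selectors, and paths
# are assumptions for illustration, not taken from this commit.
scraper = NewsScraper(
    url="https://example.com/news",
    item_selector="div.views-row",
    title_selector="span.news-title",
    time_selector="span.news-date",
    content_selector="div.field-content",
    enable_js=False,
    show_content=True,
)

items = scraper.scrape()  # returns [] on RequestException/AttributeError instead of raising
if items:
    scraper.write_to_html("template.html", "output.html", items, "Latest news")
else:
    print("Nothing scraped; errors were already printed by the scraper.")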