1
0
Fork 0
forked from Chay/scrp
scrp/main.py

141 lines
5.4 KiB
Python

import html
import os

import requests
import requests_html
from bs4 import BeautifulSoup
from requests_html import HTMLSession
class NewsScraper:
    """Scrape title/time (and optionally body) entries from one news page.

    Each entry found on the page is rendered as a small ``<div class="news-item">``
    HTML fragment; ``write_to_html`` places those fragments into an HTML file.
    """

    def __init__(self, url, title_selector, time_selector, item_selector,
                 show_content=False, enable_js=False, content_selector=None,
                 timeout=20):
        """Store the scraping configuration.

        Args:
            url: Page to download.
            title_selector: CSS selector, relative to an item, for its title.
            time_selector: CSS selector, relative to an item, for its time.
            item_selector: CSS selector matching every news item on the page.
            show_content: When true, also include each item's body markup.
            enable_js: When true, fetch through ``HTMLSession`` and render
                JavaScript before parsing.
            content_selector: CSS selector, relative to an item, for its body.
                When omitted, the first ``div`` with class ``field-content``
                is used instead.
            timeout: Seconds allowed for the HTTP request and for JavaScript
                rendering.
        """
        self.url = url
        self.title_selector = title_selector
        self.time_selector = time_selector
        self.item_selector = item_selector
        self.show_content = show_content
        self.enable_js = enable_js
        self.content_selector = content_selector
        self.timeout = timeout

    def scrape(self):
        """Download the page and return one HTML fragment per news item.

        Items lacking either a title or a time element are skipped.

        Returns:
            A list of HTML strings. Any failure is reported with ``print``
            and yields an empty list instead of raising.
        """
        try:
            markup = self._fetch_markup()
            soup = BeautifulSoup(markup, 'html.parser')
            return self._extract_items(soup)
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL: {e}")
            return []
        except AttributeError as e:
            print(f"Error parsing HTML: {e}")
            return []
        except Exception as e:
            print(f"Unexpected error: {e}")
            return []

    def _fetch_markup(self):
        """Return the page markup, rendering JavaScript first if enabled."""
        if not self.enable_js:
            # Without a timeout a stalled server would block this call forever.
            response = requests.get(self.url, timeout=self.timeout)
            response.raise_for_status()
            return response.content

        session = HTMLSession()
        try:
            response = session.get(self.url, timeout=self.timeout)
            response.raise_for_status()
            response.html.render(timeout=self.timeout)
            return response.html.html
        finally:
            # Always release the session, even when rendering fails.
            session.close()

    def _extract_items(self, soup):
        """Render every item in *soup* that has both a title and a time."""
        fragments = []
        for item in soup.select(self.item_selector):
            title_element = item.select_one(self.title_selector)
            time_element = item.select_one(self.time_selector)
            if title_element is None or time_element is None:
                continue
            content = self._extract_content(item) if self.show_content else None
            fragments.append(
                self._render_item(
                    title_element.text.strip(),
                    time_element.text.strip(),
                    content,
                )
            )
        return fragments

    def _extract_content(self, item):
        """Return the item's body markup, or ``None`` when it has no body."""
        if self.content_selector:
            content_element = item.select_one(self.content_selector)
        else:
            content_element = item.find('div', class_='field-content')
        if content_element is None:
            return None
        return content_element.decode_contents(formatter="html")

    @staticmethod
    def _render_item(title, time, content=None):
        """Build the HTML fragment for a single news item.

        *title* and *time* are plain text and are HTML-escaped so that
        characters such as ``<`` or ``&`` cannot break or inject markup.
        *content* is already markup and is inserted unchanged; pass ``None``
        to leave the content ``<div>`` out entirely.
        """
        lines = [
            '',
            '<div class="news-item">',
            f'<h2 class="news-title">{html.escape(title)}</h2>',
            f'<p class="news-time">{html.escape(time)}</p>',
        ]
        if content is not None:
            lines.append(f'<div class="news-content">{content}</div>')
        lines.extend(['</div>', ''])
        return '\n'.join(lines)

    @staticmethod
    def _insert_before_body_end(document, section):
        """Insert *section* before the last ``</body>`` of *document*.

        When the document has no ``</body>`` tag the section is appended at
        the end so it is never silently dropped.
        """
        head, closing_tag, tail = document.rpartition('</body>')
        if not closing_tag:
            return f'{document}\n{section}\n'
        return f'{head}{section}\n{closing_tag}{tail}'

    def write_to_html(self, template_path, output_path, news_items, heading):
        """Add a headed section of news items to the HTML file at *output_path*.

        If the output file does not exist yet (or is empty) it is created from
        the template. Otherwise the section is inserted into the existing
        output, so repeated calls build up a single HTML document rather than
        several concatenated copies of the template.

        Args:
            template_path: HTML file used as the starting document.
            output_path: File that receives the resulting document.
            news_items: HTML fragments, as returned by ``scrape``.
            heading: Text placed in the ``<h1>`` above the items; it is
                inserted as-is, without escaping.

        File errors are reported with ``print`` and not raised.
        """
        section = f'<h1>{heading}</h1>\n' + '\n'.join(news_items)
        try:
            has_output = (
                os.path.isfile(output_path) and os.path.getsize(output_path) > 0
            )
            source_path = output_path if has_output else template_path
            # Explicit UTF-8 so non-ASCII headlines do not depend on the
            # platform's default encoding.
            with open(source_path, 'r', encoding='utf-8') as f:
                document = f.read()
            html_output = self._insert_before_body_end(document, section)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(html_output)
        except IOError as e:
            print(f"Error writing to file: {e}")
        except Exception as e:
            print(f"Unexpected error: {e}")
# Template and output locations shared by every source below.
TEMPLATE_PATH = './template.html'
OUTPUT_PATH = './index.html'

# One configured scraper per news source.
almayadeen_scraper = NewsScraper(
    "https://english.almayadeen.net/shortnews",
    item_selector="div.item",
    title_selector="h4",
    time_selector="div.post-tag.day-time",
    enable_js=False,
    show_content=False,
)
middleeasteye_scraper = NewsScraper(
    "https://www.middleeasteye.net/live/israels-war-gaza-live-israel-pounds-rafah-overnight-strikes",
    item_selector=".views-row",
    title_selector=".views-field.views-field-title-1 span.field-content a",
    time_selector=".views-field.views-field-changed span.field-content",
    enable_js=False,
    show_content=True,
)
aljazeera_scraper = NewsScraper(
    "https://www.aljazeera.com/news/liveblog/2024/4/20/israels-war-on-gaza-live-israelis-continue-intense-raid-on-nur-shams-camp",
    item_selector=".card-live",
    title_selector="h2",
    time_selector=".date-relative__time",
    content_selector=".wysiwyg-content",
    enable_js=True,
    show_content=True,
)

# Remove any output left over from a previous run before writing.
if os.path.exists(OUTPUT_PATH):
    os.remove(OUTPUT_PATH)

# Scrape each source in turn and write its section under its own heading.
for source, heading in (
    (almayadeen_scraper, 'Al Mayadeen News'),
    (middleeasteye_scraper, 'Middle East Eye News'),
    (aljazeera_scraper, 'Al Jazeera News'),
):
    news_items = source.scrape()
    source.write_to_html(TEMPLATE_PATH, OUTPUT_PATH, news_items, heading)