# scrp/news_scraper.py

import html
import logging
import os

import requests
import requests_html
from bs4 import BeautifulSoup
from requests_html import HTMLSession

# Configure the root logger at import time: INFO level and above, rendered as
# "LEVEL: message".
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

class NewsScraper:
    """Scrape news items from a web page and render them as HTML fragments.

    Each element matched by ``item_selector`` is expected to contain a title
    element and a time element; items missing either one are skipped.
    """

    # Seconds to wait for the HTTP response. Without a timeout a stalled
    # server would block the scraper indefinitely.
    REQUEST_TIMEOUT = 20
    # Seconds to wait for the headless browser to render JavaScript.
    RENDER_TIMEOUT = 20

    def __init__(self, url, title_selector, time_selector, item_selector, show_content=False, enable_js=False, content_selector=None):
        """Store the scraping configuration.

        Args:
            url: Address of the page to scrape.
            title_selector: CSS selector for the title, relative to an item.
            time_selector: CSS selector for the timestamp, relative to an item.
            item_selector: CSS selector matching every news item on the page.
            show_content: When true, include each item's body in the output.
            enable_js: When true, fetch through ``HTMLSession`` and render
                JavaScript before parsing.
            content_selector: CSS selector for the item body. When omitted,
                the first ``<div class="field-content">`` in the item is used.
        """
        self.url = url
        self.title_selector = title_selector
        self.time_selector = time_selector
        self.item_selector = item_selector
        self.show_content = show_content
        self.enable_js = enable_js
        self.content_selector = content_selector

    def scrape(self):
        """Fetch the page and return one HTML fragment per news item.

        Returns:
            A list of ``<div class="news-item">`` strings. The list is empty
            when nothing matches or when fetching/parsing fails; failures are
            logged rather than raised.
        """
        try:
            soup = BeautifulSoup(self._fetch_markup(), 'html.parser')

            news_items = []
            for item in soup.select(self.item_selector):
                title_element = item.select_one(self.title_selector)
                time_element = item.select_one(self.time_selector)
                if title_element is None or time_element is None:
                    continue

                content = self._extract_content(item) if self.show_content else None
                news_items.append(
                    self._format_item(
                        title_element.text.strip(),
                        time_element.text.strip(),
                        content,
                    )
                )
            return news_items
        except requests.exceptions.RequestException as e:
            logging.error("Error fetching URL: %s", e)
            return []
        except AttributeError as e:
            logging.error("Error parsing HTML: %s", e)
            return []
        except Exception as e:
            # Boundary handler: scraping is best-effort, so report and return
            # an empty result instead of propagating.
            logging.error("Unexpected error: %s", e)
            return []

    def _fetch_markup(self):
        """Download the page and return its markup (rendered when ``enable_js``)."""
        if self.enable_js:
            # Closing the session also shuts down the headless browser that
            # ``render`` starts; leaving it open leaks a browser process.
            with HTMLSession() as session:
                response = session.get(self.url, timeout=self.REQUEST_TIMEOUT)
                response.html.render(timeout=self.RENDER_TIMEOUT)
                return response.html.html

        response = requests.get(self.url, timeout=self.REQUEST_TIMEOUT)
        # Raw bytes are passed on so BeautifulSoup can detect the encoding.
        return response.content

    def _extract_content(self, item):
        """Return the inner HTML of the item's body element, or None if absent."""
        if self.content_selector:
            content_element = item.select_one(self.content_selector)
        else:
            content_element = item.find('div', class_='field-content')

        if content_element is None:
            return None
        return content_element.decode_contents(formatter="html")

    @staticmethod
    def _format_item(title, time_text, content=None):
        """Build one ``news-item`` fragment from a title, a time and optional body."""
        # Title and time are plain text taken from an external page, so they
        # are escaped before being placed into markup.
        lines = [
            '<div class="news-item">',
            f'    <h2 class="news-title">{html.escape(title, quote=False)}</h2>',
            f'    <p class="news-time">{html.escape(time_text, quote=False)}</p>',
        ]
        if content is not None:
            # NOTE(review): ``content`` is markup copied verbatim from the
            # scraped page. Sanitize it before serving the output to browsers
            # if the source site is not trusted.
            lines.append(f'    <div class="news-content">{content}</div>')
        lines.append('</div>')
        return '\n'.join(lines)

    def write_to_html(self, template_path, output_path, news_items, heading):
        """Append a heading followed by the given news fragments to a file.

        Args:
            template_path: Currently unused; kept so existing callers keep
                working.
            output_path: File to append to (created if it does not exist).
            news_items: HTML fragments, e.g. the list returned by ``scrape``.
            heading: Text placed in an ``<h2>`` above the items; inserted
                as-is, without escaping.

        Errors are logged rather than raised.
        """
        try:
            news_html = '\n'.join(news_items)
            html_output = f'<h2>{heading}</h2>\n{news_html}\n'
            # Explicit UTF-8: the platform default encoding may not be able to
            # represent the scraped text.
            with open(output_path, 'a', encoding='utf-8') as f:
                f.write(html_output)
        except OSError as e:
            logging.error("Error writing to file: %s", e)
        except Exception as e:
            logging.error("Unexpected error: %s", e)