import requests
from bs4 import BeautifulSoup
import csv
import time
import os
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser
import logging


class WebsiteCrawler:
    def __init__(self, base_url, csv_filename='website_data.csv', delay=1):
        self.base_url = base_url.rstrip('/')
        self.csv_filename = csv_filename
        self.delay = delay  # Delay between requests to be respectful
        self.visited_urls = set()
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

        # Set up logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

        # Initialize CSV file
        self._init_csv()

    def _init_csv(self):
        """Initialize the CSV file with headers if it doesn't exist."""
        if not os.path.exists(self.csv_filename):
            with open(self.csv_filename, 'w', newline='', encoding='utf-8') as f:
                writer = csv.writer(f)
                writer.writerow(['slug', 'title', 'description', 'url'])
            self.logger.info(f"Created new CSV file: {self.csv_filename}")
        else:
            # Load existing URLs to avoid duplicates on re-runs
            with open(self.csv_filename, 'r', encoding='utf-8') as f:
                reader = csv.DictReader(f)
                for row in reader:
                    self.visited_urls.add(row['url'])
            self.logger.info(f"Loaded {len(self.visited_urls)} existing URLs from CSV")

    def _extract_page_data(self, url, soup):
        """Extract title, description, and slug from an already-parsed page."""
        try:
            # Extract title
            title_tag = soup.find('title')
            title = title_tag.get_text().strip() if title_tag else 'No Title'

            # Extract description: meta description first, then Open Graph, then first paragraph
            description = ''
            meta_desc = soup.find('meta', attrs={'name': 'description'})
            if meta_desc and meta_desc.get('content'):
                description = meta_desc['content'].strip()
            else:
                # Try Open Graph description
                og_desc = soup.find('meta', attrs={'property': 'og:description'})
                if og_desc and og_desc.get('content'):
                    description = og_desc['content'].strip()
                else:
                    # Fallback to first paragraph
                    first_p = soup.find('p')
                    if first_p:
                        description = first_p.get_text().strip()[:200] + '...'

            # Extract slug from the URL path
            parsed_url = urlparse(url)
            slug = parsed_url.path.strip('/') or 'home'

            return {
                'slug': slug,
                'title': title,
                'description': description,
                'url': url
            }
        except Exception as e:
            self.logger.error(f"Error extracting data from {url}: {str(e)}")
            return None

    def _get_internal_links(self, url, soup):
        """Extract internal links from a page."""
        links = set()
        for link in soup.find_all('a', href=True):
            href = link['href']
            absolute_url = urljoin(url, href)

            # Check if it's an internal link (same domain as the base URL)
            if urlparse(absolute_url).netloc == urlparse(self.base_url).netloc:
                # Remove fragments and query parameters for cleaner URLs
                clean_url = absolute_url.split('#')[0].split('?')[0]
                if clean_url not in self.visited_urls:
                    links.add(clean_url)
        return links

    def _update_csv(self, page_data):
        """Append one row of page data to the CSV file."""
        with open(self.csv_filename, 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow([
                page_data['slug'],
                page_data['title'],
                page_data['description'],
                page_data['url']
            ])
        self.logger.info(f"Added to CSV: {page_data['title']} ({page_data['url']})")

    def crawl_website(self, max_pages=100):
        """Main crawling function: breadth-first crawl starting from the base URL."""
        urls_to_visit = [self.base_url]
        pages_crawled = 0

        self.logger.info(f"Starting crawl of {self.base_url}")
        self.logger.info(f"Maximum pages to crawl: {max_pages}")

        while urls_to_visit and pages_crawled < max_pages:
            current_url = urls_to_visit.pop(0)

            if current_url in self.visited_urls:
                continue

            # Mark as visited up front so failing pages are not re-queued endlessly
            self.visited_urls.add(current_url)
            self.logger.info(f"Crawling page {pages_crawled + 1}: {current_url}")

            try:
                # Fetch and parse the page once, then reuse the soup below
                response = self.session.get(current_url, timeout=10)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')

                # Extract and store page data
                page_data = self._extract_page_data(current_url, soup)
                if page_data:
                    self._update_csv(page_data)
                    pages_crawled += 1

                # Find new links to crawl
                new_links = self._get_internal_links(current_url, soup)
                urls_to_visit.extend(list(new_links))

                # Be respectful - add delay between requests
                time.sleep(self.delay)

            except Exception as e:
                self.logger.error(f"Error crawling {current_url}: {str(e)}")
                continue

        self.logger.info(f"Crawling completed. Total pages crawled: {pages_crawled}")
        self.logger.info(f"Data saved to: {self.csv_filename}")


def main():
    # Configuration
    WEBSITE_URL = "https://eos.com"
    if not WEBSITE_URL.startswith(('http://', 'https://')):
        WEBSITE_URL = 'https://' + WEBSITE_URL

    CSV_FILENAME = 'eos_website_data.csv'

    # try:
    #     MAX_PAGES = int(input("Enter maximum pages to crawl (default: 50): ") or "50")
    # except ValueError:
    MAX_PAGES = 50000

    # try:
    #     DELAY = float(input("Enter delay between requests in seconds (default: 1): ") or "1")
    # except ValueError:
    DELAY = 1

    # Create crawler instance
    crawler = WebsiteCrawler(
        base_url=WEBSITE_URL,
        csv_filename=CSV_FILENAME,
        delay=DELAY
    )

    # Start crawling
    try:
        crawler.crawl_website(max_pages=MAX_PAGES)
        print(f"\nCrawling completed! Check '{CSV_FILENAME}' for results.")
    except KeyboardInterrupt:
        print("\nCrawling interrupted by user.")
    except Exception as e:
        print(f"An error occurred: {str(e)}")


if __name__ == "__main__":
    main()
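

# Optional, unwired sketch: RobotFileParser is imported at the top of this file
# but never used by WebsiteCrawler. The helper below (hypothetical name
# `robots_allows`) shows one way robots.txt compliance could be added, assuming
# the target site serves its rules at /robots.txt. A crawler could call it
# before each fetch in crawl_website(); it is not part of the original script's
# behavior and relies on the module-level urlparse and RobotFileParser imports.
def robots_allows(url, user_agent='*'):
    """Return True if robots.txt appears to permit fetching `url`."""
    parsed = urlparse(url)
    parser = RobotFileParser()
    parser.set_url(f"{parsed.scheme}://{parsed.netloc}/robots.txt")
    try:
        # read() downloads and parses the robots.txt file
        parser.read()
    except Exception:
        # If robots.txt cannot be retrieved, fall back to allowing the fetch
        return True
    return parser.can_fetch(user_agent, url)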