import requests
from bs4 import BeautifulSoup
import csv
import time
import os
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser
import logging


class WebsiteCrawler:
    def __init__(self, base_url, csv_filename='website_data.csv', delay=1):
        self.base_url = base_url.rstrip('/')
        self.csv_filename = csv_filename
        self.delay = delay  # Delay between requests to be respectful
        self.visited_urls = set()
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

        # Set up logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

        # Initialize CSV file
        self._init_csv()

    def _init_csv(self):
        """Initialize the CSV file with headers if it doesn't exist."""
        if not os.path.exists(self.csv_filename):
            with open(self.csv_filename, 'w', newline='', encoding='utf-8') as f:
                writer = csv.writer(f)
                writer.writerow(['slug', 'title', 'description', 'url'])
            self.logger.info(f"Created new CSV file: {self.csv_filename}")
        else:
            # Load existing URLs to avoid duplicates on re-runs
            with open(self.csv_filename, 'r', encoding='utf-8') as f:
                reader = csv.DictReader(f)
                for row in reader:
                    self.visited_urls.add(row['url'])
            self.logger.info(f"Loaded {len(self.visited_urls)} existing URLs from CSV")

    def _extract_page_data(self, url, soup):
        """Extract title, description, and slug from an already-parsed page."""
        try:
            # Extract title
            title_tag = soup.find('title')
            title = title_tag.get_text().strip() if title_tag else 'No Title'

            # Extract description: meta description first, then Open Graph, then first paragraph
            description = ''
            meta_desc = soup.find('meta', attrs={'name': 'description'})
            if meta_desc and meta_desc.get('content'):
                description = meta_desc['content'].strip()
            else:
                # Try Open Graph description
                og_desc = soup.find('meta', attrs={'property': 'og:description'})
                if og_desc and og_desc.get('content'):
                    description = og_desc['content'].strip()
                else:
                    # Fallback to first paragraph
                    first_p = soup.find('p')
                    if first_p:
                        description = first_p.get_text().strip()[:200] + '...'

            # Extract slug from the URL path
            parsed_url = urlparse(url)
            slug = parsed_url.path.strip('/') or 'home'

            return {
                'slug': slug,
                'title': title,
                'description': description,
                'url': url
            }
        except Exception as e:
            self.logger.error(f"Error extracting data from {url}: {str(e)}")
            return None

    def _get_internal_links(self, url, soup):
        """Extract internal links from a page."""
        links = set()
        for link in soup.find_all('a', href=True):
            href = link['href']
            absolute_url = urljoin(url, href)

            # Check if it's an internal link (same domain as the base URL)
            if urlparse(absolute_url).netloc == urlparse(self.base_url).netloc:
                # Remove fragments and query parameters for cleaner URLs
                clean_url = absolute_url.split('#')[0].split('?')[0]
                if clean_url not in self.visited_urls:
                    links.add(clean_url)
        return links

    def _update_csv(self, page_data):
        """Append one row of page data to the CSV file."""
        with open(self.csv_filename, 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow([
                page_data['slug'],
                page_data['title'],
                page_data['description'],
                page_data['url']
            ])
        self.logger.info(f"Added to CSV: {page_data['title']} ({page_data['url']})")

    def crawl_website(self, max_pages=100):
        """Main crawling function: breadth-first crawl starting from the base URL."""
        urls_to_visit = [self.base_url]
        pages_crawled = 0

        self.logger.info(f"Starting crawl of {self.base_url}")
        self.logger.info(f"Maximum pages to crawl: {max_pages}")

        while urls_to_visit and pages_crawled < max_pages:
            current_url = urls_to_visit.pop(0)

            if current_url in self.visited_urls:
                continue

            # Mark as visited up front so failing pages are not re-queued endlessly
            self.visited_urls.add(current_url)
            self.logger.info(f"Crawling page {pages_crawled + 1}: {current_url}")

            try:
                # Fetch and parse the page once, then reuse the soup below
                response = self.session.get(current_url, timeout=10)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')

                # Extract and store page data
                page_data = self._extract_page_data(current_url, soup)
                if page_data:
                    self._update_csv(page_data)
                    pages_crawled += 1

                # Find new links to crawl
                new_links = self._get_internal_links(current_url, soup)
                urls_to_visit.extend(list(new_links))

                # Be respectful - add delay between requests
                time.sleep(self.delay)

            except Exception as e:
                self.logger.error(f"Error crawling {current_url}: {str(e)}")
                continue

        self.logger.info(f"Crawling completed. Total pages crawled: {pages_crawled}")
        self.logger.info(f"Data saved to: {self.csv_filename}")


def main():
    # Configuration
    WEBSITE_URL = "https://eos.com"
    if not WEBSITE_URL.startswith(('http://', 'https://')):
        WEBSITE_URL = 'https://' + WEBSITE_URL

    CSV_FILENAME = 'eos_website_data.csv'

    # try:
    #     MAX_PAGES = int(input("Enter maximum pages to crawl (default: 50): ") or "50")
    # except ValueError:
    MAX_PAGES = 50000

    # try:
    #     DELAY = float(input("Enter delay between requests in seconds (default: 1): ") or "1")
    # except ValueError:
    DELAY = 1

    # Create crawler instance
    crawler = WebsiteCrawler(
        base_url=WEBSITE_URL,
        csv_filename=CSV_FILENAME,
        delay=DELAY
    )

    # Start crawling
    try:
        crawler.crawl_website(max_pages=MAX_PAGES)
        print(f"\nCrawling completed! Check '{CSV_FILENAME}' for results.")
    except KeyboardInterrupt:
        print("\nCrawling interrupted by user.")
    except Exception as e:
        print(f"An error occurred: {str(e)}")


if __name__ == "__main__":
    main()
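

# Optional, unwired sketch: RobotFileParser is imported at the top of this file
# but never used by WebsiteCrawler. The helper below (hypothetical name
# `robots_allows`) shows one way robots.txt compliance could be added, assuming
# the target site serves its rules at /robots.txt. A crawler could call it
# before each fetch in crawl_website(); it is not part of the original script's
# behavior and relies on the module-level urlparse and RobotFileParser imports.
def robots_allows(url, user_agent='*'):
    """Return True if robots.txt appears to permit fetching `url`."""
    parsed = urlparse(url)
    parser = RobotFileParser()
    parser.set_url(f"{parsed.scheme}://{parsed.netloc}/robots.txt")
    try:
        # read() downloads and parses the robots.txt file
        parser.read()
    except Exception:
        # If robots.txt cannot be retrieved, fall back to allowing the fetch
        return True
    return parser.can_fetch(user_agent, url)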