"""Crawl a website and save the plain-text content of each page as JSON files."""

import json
import os
import random
import sys
import time
import traceback
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# Read the target domain and output subdirectory from the command line.
try:
    session_url = sys.argv[1]  # bare domain to crawl (the https:// scheme is added below)
    sub_dir = sys.argv[2]      # subdirectory under blogs/ for the scraped output
except IndexError:
    print(traceback.format_exc())
    session_url = None
    sub_dir = None


def get_page_content(url):
    """Fetch a URL and return its HTML parsed as a BeautifulSoup tree."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')


def scrape_website(base_url, output_dir):
    """Breadth-first crawl of base_url, saving each page's text into output_dir."""
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    visited = set()
    to_visit = [base_url]

    while to_visit:
        try:
            url = to_visit.pop(0)
            if url in visited:
                continue
            # Skip binary assets rather than trying to parse them as HTML.
            if urlparse(url).path.lower().endswith(('.png', '.jpg', '.jpeg', '.pdf')):
                continue

            print(f"Scraping: {url}")
            soup = get_page_content(url)
            slug = get_slug(url)
            content = extract_textual_content(soup)
            save_to_json(slug, content, output_dir)

            visited.add(url)
            new_links = get_all_links(soup, base_url)
            to_visit.extend(link for link in new_links if link not in visited)
        except Exception:
            print(traceback.format_exc())

        # Random delay between 10 and 45 seconds to avoid hammering the server.
        time.sleep(random.uniform(10, 45))


def get_all_links(soup, base_url):
    """Return the unique links on the page that stay within base_url."""
    links = []
    for a_tag in soup.find_all('a', href=True):
        full_url = urljoin(base_url, a_tag['href'])
        if full_url.startswith(base_url):
            links.append(full_url)
    return list(set(links))


def get_slug(url):
    """Turn a URL path into a filesystem-friendly slug."""
    path = urlparse(url).path
    return path.strip('/').replace('/', '_') or 'index'


def extract_textual_content(soup):
    """Strip scripts/styles and return the page's visible text, one chunk per line."""
    # Remove script and style elements
    for element in soup(["script", "style"]):
        element.decompose()

    # Get text
    text = soup.get_text()

    # Break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # Break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # Remove blank lines
    return '\n'.join(chunk for chunk in chunks if chunk)


def save_to_json(slug, content, output_dir):
    """Write one page's slug and extracted text to <output_dir>/<slug>.json."""
    data = {
        "slug": slug,
        "content": content
    }
    filepath = os.path.join(output_dir, f"{slug}.json")
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


if session_url is not None and sub_dir is not None:
    base_url = f"https://{session_url}"
    output_dir = f"blogs/{sub_dir}/scraped_pages_{session_url}"
    scrape_website(base_url, output_dir)
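
# Example invocation (the domain and subdirectory below are hypothetical, shown
# only to illustrate the two positional arguments read from sys.argv above):
#
#   python3 web_scrape.py example.com my_site
#
# This would crawl https://example.com and write one JSON file per page into
# blogs/my_site/scraped_pages_example.com/.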