import pandas as pd
import requests
import os
import time
import threading
import concurrent.futures
from requests.auth import HTTPBasicAuth
from datetime import datetime, timedelta, timezone
from urllib.parse import urlparse, unquote
from dateutil import parser as date_parser
from tqdm import tqdm  # For progress bar

# --- CONFIGURATION ---
CSV_INPUT_FILE = 'blogs_to_delete.csv'
CSV_OUTPUT_FILE = 'blogs_updated_status.csv'
DELETED_LOG_FILE = 'deleted_urls.log'  # Log of URLs that were actually deleted

# WordPress Configuration
WP_BASE_URL = 'https://farmonaut.com'
WP_USERNAME = 'ankuromar296'
WP_APP_PASSWORD = '2fZW 42ln jhSG qPfm Kx3b hacO'

# Google Search Console Configuration
GSC_SITE_URL = 'https://farmonaut.com'
SCOPES = ['https://www.googleapis.com/auth/webmasters.readonly']
CLIENT_SECRETS_FILE = 'credentials.json'
TOKEN_FILE = 'token.json'

# Logic Configuration
DAYS_OLD_THRESHOLD = 120
CLICKS_THRESHOLD = 1
EXCLUDED_COUNTRIES = ['PAK', 'TUR', 'AZE']

# --- PARALLELISM & RATE LIMITING CONFIGURATION ---
# Maximum number of concurrent requests for each service.
# Adjust these based on your server's capacity and each API's rate limits;
# a safe starting point is often between 2 and 5.
WP_MAX_CONCURRENT_REQUESTS = 4
GSC_MAX_CONCURRENT_REQUESTS = 2  # The GSC API has stricter quotas
# Max threads in the pool. Should be at least the sum of the two limits above
# so that neither semaphore starves the other.
MAX_WORKERS = WP_MAX_CONCURRENT_REQUESTS + GSC_MAX_CONCURRENT_REQUESTS


# --- GOOGLE SEARCH CONSOLE API AUTHENTICATION ---
def get_gsc_service():
    """Authenticates with the GSC API, caching credentials in TOKEN_FILE."""
    from google_auth_oauthlib.flow import InstalledAppFlow
    from google.auth.transport.requests import Request
    import pickle

    creds = None
    if os.path.exists(TOKEN_FILE):
        with open(TOKEN_FILE, 'rb') as token:
            creds = pickle.load(token)
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(CLIENT_SECRETS_FILE, SCOPES)
            creds = flow.run_local_server(port=0)
        with open(TOKEN_FILE, 'wb') as token:
            pickle.dump(creds, token)

    from googleapiclient.discovery import build
    return build('searchconsole', 'v1', credentials=creds)


# --- API FUNCTIONS (RATE LIMITED) ---
def get_wp_post_slug_from_url(url):
    """Extracts the slug from a WordPress URL."""
    try:
        path = urlparse(url).path
        slug = path.strip('/').split('/')[-1]
        return unquote(slug)
    except Exception:
        return None


def get_wordpress_post_details(slug, semaphore):
    """Fetches post ID and published date, respecting the semaphore."""
    api_url = f"{WP_BASE_URL}/wp-json/wp/v2/posts?slug={slug}"
    with semaphore:  # Acquire the semaphore before making the request
        try:
            # A small delay helps distribute requests over time
            time.sleep(0.1)
            response = requests.get(api_url, timeout=120)
            response.raise_for_status()
            posts = response.json()
            if posts:
                post = posts[0]
                published_date_str = post.get('date_gmt', post.get('date'))
                if published_date_str:
                    published_date = date_parser.isoparse(published_date_str)
                    # WordPress 'date_gmt' carries no timezone suffix, so treat a
                    # naive timestamp as UTC rather than local time.
                    if published_date.tzinfo is None:
                        published_date = published_date.replace(tzinfo=timezone.utc)
                    return post['id'], published_date.astimezone(timezone.utc)
                return post['id'], None
            return None, None
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"\n[Error] WP details for slug {slug}: {e}")
            return None, None


def delete_wordpress_post(post_id, semaphore):
    """Deletes a WordPress post, respecting the semaphore."""
    api_url = f"{WP_BASE_URL}/wp-json/wp/v2/posts/{post_id}?force=true"
    auth = HTTPBasicAuth(WP_USERNAME, WP_APP_PASSWORD)
    with semaphore:  # Acquire the semaphore before making the request
        try:
            time.sleep(0.1)
            response = requests.delete(api_url, auth=auth, timeout=120)
            response.raise_for_status()
            return True
        except requests.exceptions.RequestException as e:
            print(f"\n[Error] Deleting WP post ID {post_id}: {e}")
            return False
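
# Optional pre-flight credential check (illustrative sketch only; it is not
# called anywhere in the main flow). It assumes the standard WordPress REST
# endpoint /wp-json/wp/v2/users/me, which returns the authenticated user when
# the Application Password above is valid and a 401 otherwise. The function
# name verify_wp_credentials is ours, not part of the original script.
def verify_wp_credentials():
    """Returns True if the configured WordPress Application Password works."""
    api_url = f"{WP_BASE_URL}/wp-json/wp/v2/users/me"
    try:
        response = requests.get(
            api_url,
            auth=HTTPBasicAuth(WP_USERNAME, WP_APP_PASSWORD),
            timeout=30,
        )
        return response.status_code == 200
    except requests.exceptions.RequestException as e:
        print(f"[Error] WP credential check failed: {e}")
        return False
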
def get_gsc_clicks(service, page_url, semaphore):
    """Fetches GSC clicks, respecting the semaphore."""
    today = datetime.now(timezone.utc)
    twelve_months_ago = today - timedelta(days=365)
    request_body = {
        'startDate': twelve_months_ago.strftime('%Y-%m-%d'),
        'endDate': today.strftime('%Y-%m-%d'),
        'dimensions': ['page', 'country'],
        'type': 'web',
        'dimensionFilterGroups': [{
            'filters': [{
                'dimension': 'page',
                'operator': 'equals',
                'expression': page_url
            }]
        }],
        'rowLimit': 25000
    }
    with semaphore:  # Acquire the semaphore before making the request
        try:
            time.sleep(0.2)  # GSC is more sensitive, so use a slightly longer delay
            response = service.searchanalytics().query(siteUrl=GSC_SITE_URL, body=request_body).execute()
            total_clicks = 0
            if 'rows' in response:
                for row in response['rows']:
                    country = row['keys'][1].upper()
                    if country not in EXCLUDED_COUNTRIES:
                        total_clicks += row['clicks']
                return total_clicks
            return 0
        except Exception as e:
            print(f"\n[Error] GSC data for {page_url}: {e}")
            return None


# --- WORKER FUNCTION FOR PARALLEL PROCESSING ---
def process_single_blog(row, gsc_service, semaphores):
    """
    Processes a single blog URL: checks criteria and deletes if necessary.
    This function is executed by each thread in the pool.
    """
    url = row['url']
    status = "not processed"
    slug = get_wp_post_slug_from_url(url)
    if not slug:
        status = "invalid_url_for_slug"
        return {**row.to_dict(), 'status': status}

    wp_post_id, wp_published_date = get_wordpress_post_details(slug, semaphores['wp'])
    if not wp_post_id:
        status = "wp_post_not_found"
        return {**row.to_dict(), 'status': status}
    if not wp_published_date:
        status = "wp_post_found_no_date"
        return {**row.to_dict(), 'status': status}

    # Check age criteria
    age = datetime.now(timezone.utc) - wp_published_date
    if age <= timedelta(days=DAYS_OLD_THRESHOLD):
        status = f"not_deleted_too_new ({age.days} days old)"
        return {**row.to_dict(), 'status': status}

    # Check GSC clicks criteria
    clicks = get_gsc_clicks(gsc_service, url, semaphores['gsc'])
    if clicks is None:
        status = "not_deleted_gsc_api_error"
        return {**row.to_dict(), 'status': status}
    if clicks >= CLICKS_THRESHOLD:
        status = f"not_deleted_high_clicks ({clicks} clicks)"
        return {**row.to_dict(), 'status': status}

    # If all criteria are met, proceed with deletion
    print(f"\n[FLAGGED FOR DELETION] URL: {url} (Age: {age.days} days, Clicks: {clicks})")
    if delete_wordpress_post(wp_post_id, semaphores['wp']):
        status = "deleted"
        print(f"[SUCCESS] Deleted: {url}")
    else:
        status = "delete_failed"
        print(f"[FAILURE] Failed to delete: {url}")

    return {**row.to_dict(), 'status': status}


# --- MAIN SCRIPT LOGIC ---
def main():
    print("Starting parallel blog management script...")

    # Authenticate with Google Search Console first
    try:
        gsc_service = get_gsc_service()
        print("Successfully authenticated with Google Search Console.")
    except Exception as e:
        print(f"FATAL: Failed to authenticate with GSC: {e}")
        return

    # Read the input CSV
    try:
        df = pd.read_csv(CSV_INPUT_FILE)
        if 'url' not in df.columns:
            print(f"FATAL: CSV file '{CSV_INPUT_FILE}' must contain a 'url' column.")
            return
        urls_to_process = [row for _, row in df.iterrows()]
    except FileNotFoundError:
        print(f"FATAL: Input CSV file '{CSV_INPUT_FILE}' not found.")
        return

    # Create semaphores for rate limiting
    semaphores = {
        'wp': threading.Semaphore(WP_MAX_CONCURRENT_REQUESTS),
        'gsc': threading.Semaphore(GSC_MAX_CONCURRENT_REQUESTS)
    }
    results = []
    deleted_urls = []

    # Use a ThreadPoolExecutor to process URLs in parallel
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Create a future for each URL processing task
        future_to_url = {
            executor.submit(process_single_blog, row, gsc_service, semaphores): row['url']
            for row in urls_to_process
        }

        # Process results as they are completed, with a progress bar
        for future in tqdm(concurrent.futures.as_completed(future_to_url),
                           total=len(urls_to_process), desc="Processing Blogs"):
            try:
                result_data = future.result()
                results.append(result_data)
                if result_data.get('status') == 'deleted':
                    deleted_urls.append(result_data['url'])
            except Exception as exc:
                url = future_to_url[future]
                print(f"\n[FATAL_ERROR] URL '{url}' generated an exception: {exc}")
                # Record the failure so the URL still appears in the status report
                results.append({'url': url, 'status': f'thread_exception: {exc}'})

    # --- WRITE OUTPUT FILES ---
    # 1. Write the detailed status report CSV
    if results:
        output_df = pd.DataFrame(results)
        # Reorder columns to have 'url' and 'status' first for clarity
        if 'status' in output_df.columns and 'url' in output_df.columns:
            cols = ['url', 'status'] + [col for col in output_df.columns if col not in ['url', 'status']]
            output_df = output_df[cols]
        output_df.to_csv(CSV_OUTPUT_FILE, index=False)
        print(f"\nProcessing complete. Full status report saved to '{CSV_OUTPUT_FILE}'")

    # 2. Write the log of deleted URLs to a separate file
    if deleted_urls:
        with open(DELETED_LOG_FILE, 'w') as f:
            for url in deleted_urls:
                f.write(f"{url}\n")
        print(f"Successfully deleted {len(deleted_urls)} blogs. Log saved to '{DELETED_LOG_FILE}'")
    else:
        print("No blogs were deleted during this run.")


if __name__ == '__main__':
    main()
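
# --- USAGE (illustrative) ---
# The input CSV only needs a 'url' column; any extra columns are carried through
# unchanged into the status report. A minimal blogs_to_delete.csv might look like
# (the URLs below are hypothetical examples):
#
#   url
#   https://farmonaut.com/blog/some-old-post/
#   https://farmonaut.com/blog/another-old-post/
#
# Run the script directly (the filename is whatever you saved this file as):
#   python delete_old_blogs.py
# On the first run, the GSC OAuth flow opens a browser window and caches the
# resulting credentials in token.json; GSC_SITE_URL must match a property you
# have verified in Search Console.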