import requests from requests.auth import HTTPBasicAuth from datetime import datetime, timedelta, timezone import os import pandas as pd # For easier CSV handling initially from urllib.parse import urlparse, unquote from dateutil import parser as date_parser # For robust date parsing # --- CONFIGURATION --- CSV_INPUT_FILE = 'blogs_to_delete.csv' CSV_OUTPUT_FILE = 'blogs_updated.csv' # WordPress Configuration WP_BASE_URL = 'https://farmonaut.com' # Your WordPress site URL (e.g., https://example.com) WP_USERNAME = 'ankuromar296' # Your WordPress username WP_APP_PASSWORD = '2fZW 42ln jhSG qPfm Kx3b hacO' # The Application Password you generated # Google Search Console Configuration GSC_SITE_URL = 'https://farmonaut.com' # Or 'https://yourdomain.com/' if URL prefix property # If using a domain property, it's 'sc-domain:yourdomain.com' # If using a URL prefix property, it's 'https://www.yourdomain.com/' or 'http://...' # Make sure this matches exactly what's in your Search Console SCOPES = ['https://www.googleapis.com/auth/webmasters.readonly'] CLIENT_SECRETS_FILE = 'credentials.json' # Downloaded from Google Cloud Console TOKEN_FILE = 'token.json' # Logic Configuration DAYS_OLD_THRESHOLD = 30 CLICKS_THRESHOLD = 10 EXCLUDED_COUNTRIES = ['PAK', 'TUR', 'AZE'] # ISO 3166-1 alpha-3 country codes # --- GOOGLE SEARCH CONSOLE API AUTHENTICATION --- def get_gsc_service(): from google_auth_oauthlib.flow import InstalledAppFlow from google.auth.transport.requests import Request from google.oauth2.credentials import Credentials import pickle # Using pickle for token.json with google-auth, can also use json creds = None if os.path.exists(TOKEN_FILE): with open(TOKEN_FILE, 'rb') as token: creds = pickle.load(token) # Or json.load if you saved as json if not creds or not creds.valid: if creds and creds.expired and creds.refresh_token: creds.refresh(Request()) else: flow = InstalledAppFlow.from_client_secrets_file(CLIENT_SECRETS_FILE, SCOPES) creds = flow.run_local_server(port=0) with open(TOKEN_FILE, 'wb') as token: # Or 'w' for json pickle.dump(creds, token) # Or json.dump from googleapiclient.discovery import build return build('searchconsole', 'v1', credentials=creds) # --- WORDPRESS API FUNCTIONS --- def get_wp_post_slug_from_url(url): """Extracts the slug from a WordPress URL.""" path = urlparse(url).path # Remove leading/trailing slashes and get the last part slug = path.strip('/').split('/')[-1] return unquote(slug) # Handle URL encoded characters in slug def get_wordpress_post_details(slug): """Fetches post ID and published date using the slug.""" api_url = f"{WP_BASE_URL}/wp-json/wp/v2/posts?slug={slug}" try: response = requests.get(api_url, timeout=10) response.raise_for_status() posts = response.json() if posts: post = posts[0] # Assuming slug is unique # Use date_gmt for timezone consistency published_date_str = post.get('date_gmt') if not published_date_str: published_date_str = post.get('date') # Fallback if published_date_str: # WordPress dates are ISO 8601 format e.g. "2023-10-26T14:30:00" # Parse it and make it timezone-aware (UTC) published_date = date_parser.isoparse(published_date_str) if published_date.tzinfo is None: # if not timezone aware, assume UTC as per date_gmt published_date = published_date.replace(tzinfo=timezone.utc) else: # ensure it's converted to UTC published_date = published_date.astimezone(timezone.utc) return post['id'], published_date else: print(f"Warning: Could not find published date for slug {slug}.") return post['id'], None else: print(f"No WordPress post found for slug: {slug}") return None, None except requests.exceptions.RequestException as e: print(f"Error fetching WordPress post for slug {slug}: {e}") return None, None except ValueError as e: # JSON decoding error print(f"Error decoding JSON for slug {slug}: {e}") return None, None def delete_wordpress_post(post_id): """Deletes a WordPress post by its ID.""" api_url = f"{WP_BASE_URL}/wp-json/wp/v2/posts/{post_id}?force=true" # force=true bypasses trash auth = HTTPBasicAuth(WP_USERNAME, WP_APP_PASSWORD) response = None try: response = requests.delete(api_url, auth=auth, timeout=15) response.raise_for_status() print(f"Successfully deleted WordPress post ID: {post_id}") return True except requests.exceptions.RequestException as e: print(f"Error deleting WordPress post ID {post_id}: {e}") if response is not None: print(f"Response content: {response.text}") return False # --- GOOGLE SEARCH CONSOLE API FUNCTION --- def get_gsc_clicks(service, page_url): """Fetches total clicks for a specific URL in the last 12 months, excluding certain countries.""" today = datetime.now(timezone.utc) twelve_months_ago = today - timedelta(days=365) start_date_str = twelve_months_ago.strftime('%Y-%m-%d') end_date_str = today.strftime('%Y-%m-%d') # Filters for country exclusion country_filters = [] for country_code in EXCLUDED_COUNTRIES: country_filters.append({ 'dimension': 'country', 'operator': 'notContains', 'expression': country_code }) request_body = { 'startDate': start_date_str, 'endDate': end_date_str, 'dimensions': ['page'], # We are already filtering by page, so sum of clicks is fine 'type': 'web', 'dimensionFilterGroups': [ { # Group for page URL 'filters': [ { 'dimension': 'page', 'operator': 'equals', 'expression': page_url } ] }, { # Group for country exclusions (all must be true within this group) # This might not work as expected for multiple "notContains". # A better way might be to get global clicks and then subtract clicks from excluded countries, # or filter client-side if the API doesn't support multiple 'notContains' easily. # For now, let's try with multiple filters. # Actually, GSC API supports only one filter per dimension in a group. # So, we can't do country != A AND country != B AND country != C directly. # The API documentation for `dimensionFilterGroups` says: # "All filters in a filter group are connected by an AND operation." # "All filter groups are connected by an OR operation." # This means we CANNOT simply list multiple "notContains" for country. # # The most straightforward way is to fetch ALL clicks for the page, # then make separate queries for each excluded country and subtract. # OR, fetch data with 'country' dimension and sum up manually. # Let's try fetching with country dimension and filtering in Python. 'filters': country_filters # This will likely be treated as OR by API if in same group. # Let's revise this part. } ], 'aggregationType': 'auto' # Let GSC decide, or use 'byPage' } # Revised approach for country exclusion: Fetch data with 'country' dimension # and sum clicks from non-excluded countries. request_body_revised = { 'startDate': start_date_str, 'endDate': end_date_str, 'dimensions': ['page', 'country'], # Add country dimension 'type': 'web', 'dimensionFilterGroups': [ { 'filters': [ { 'dimension': 'page', 'operator': 'equals', 'expression': page_url } ] } ], 'rowLimit': 25000 # Max rows, ensure we get all countries for the page } try: response = service.searchanalytics().query( siteUrl=GSC_SITE_URL, body=request_body_revised ).execute() total_clicks = 0 if 'rows' in response: for row in response['rows']: # row['keys'] will be [page_url, country_code] country = row['keys'][1].upper() # GSC returns lowercase country codes e.g. 'usa' if country not in EXCLUDED_COUNTRIES: total_clicks += row['clicks'] print(f"GSC: Found {total_clicks} clicks for {page_url} (excluding {', '.join(EXCLUDED_COUNTRIES)})") return total_clicks else: print(f"GSC: No data found for {page_url} in the last 12 months.") return 0 except Exception as e: print(f"Error fetching GSC data for {page_url}: {e}") return None # Indicate an error occurred # --- MAIN SCRIPT LOGIC --- def main(): print("Starting blog management script...") # Authenticate with Google Search Console try: gsc_service = get_gsc_service() print("Successfully authenticated with Google Search Console.") except Exception as e: print(f"Failed to authenticate with GSC: {e}") print("Please ensure 'credentials.json' is set up correctly and you've authorized the app.") return # Read CSV try: df = pd.read_csv(CSV_INPUT_FILE) if 'url' not in df.columns: print(f"Error: CSV file '{CSV_INPUT_FILE}' must contain a 'url' column.") return except FileNotFoundError: print(f"Error: Input CSV file '{CSV_INPUT_FILE}' not found.") return except Exception as e: print(f"Error reading CSV file: {e}") return output_rows = [] # Process each URL for index, row in df.iterrows(): url = row['url'] print(f"\nProcessing URL: {url}") status = "not processed" original_row_dict = row.to_dict() slug = get_wp_post_slug_from_url(url) if not slug: print(f"Could not extract slug from URL: {url}") original_row_dict['status'] = "invalid URL for slug extraction" output_rows.append(original_row_dict) continue wp_post_id, wp_published_date_dt = get_wordpress_post_details(slug) if wp_post_id and wp_published_date_dt: # Check age now_utc = datetime.now(timezone.utc) age = now_utc - wp_published_date_dt is_older_than_threshold = age > timedelta(days=DAYS_OLD_THRESHOLD) print(f"WP Post ID: {wp_post_id}, Published: {wp_published_date_dt.strftime('%Y-%m-%d')}, Age: {age.days} days.") print(f"Is older than {DAYS_OLD_THRESHOLD} days? {is_older_than_threshold}") if is_older_than_threshold: clicks = get_gsc_clicks(gsc_service, url) if clicks is not None: # Clicks could be 0, None means API error print(f"Clicks (last 12m, filtered): {clicks}") if clicks < CLICKS_THRESHOLD: print(f"FLAGGED: Post is older than {DAYS_OLD_THRESHOLD} days AND has < {CLICKS_THRESHOLD} clicks.") user_confirm = 'y' #user_confirm = input(f"Delete post '{url}' (ID: {wp_post_id})? [y/N]: ").strip().lower() if user_confirm == 'y': if delete_wordpress_post(wp_post_id): status = "deleted" else: status = "delete_failed" else: print("Skipping deletion.") status = "not_deleted_user_skip" else: status = "not_deleted_criteria_not_met (clicks ok)" else: status = "not_deleted_gsc_error" # Error fetching GSC data else: status = "not_deleted_criteria_not_met (too new)" elif wp_post_id and not wp_published_date_dt: status = "wp_post_found_no_date" else: status = "wp_post_not_found" original_row_dict['status'] = status output_rows.append(original_row_dict) # Write updated CSV if output_rows: output_df = pd.DataFrame(output_rows) try: output_df.to_csv(CSV_OUTPUT_FILE, index=False) print(f"\nProcessing complete. Updated CSV saved to '{CSV_OUTPUT_FILE}'") except Exception as e: print(f"Error writing output CSV: {e}") else: print("No rows were processed to write to output.") if __name__ == '__main__': main()