import pandas as pd
import requests
import os
import time
import threading
import concurrent.futures
from requests.auth import HTTPBasicAuth
from datetime import datetime, timedelta, timezone
from urllib.parse import urlparse, unquote
from dateutil import parser as date_parser
from tqdm import tqdm  # For progress bar

# --- CONFIGURATION ---
CSV_INPUT_FILE = 'blogs_to_delete.csv'
CSV_OUTPUT_FILE = 'blogs_updated_status.csv'
DELETED_LOG_FILE = 'deleted_urls.log'  # Log of URLs that were actually deleted

# WordPress Configuration
WP_BASE_URL = 'https://farmonaut.com'
WP_USERNAME = 'ankuromar296'
WP_APP_PASSWORD = '2fZW 42ln jhSG qPfm Kx3b hacO'

# Google Search Console Configuration
GSC_SITE_URL = 'https://farmonaut.com'
SCOPES = ['https://www.googleapis.com/auth/webmasters.readonly']
CLIENT_SECRETS_FILE = 'credentials.json'
TOKEN_FILE = 'token.json'

# Logic Configuration
DAYS_OLD_THRESHOLD = 120
CLICKS_THRESHOLD = 1
EXCLUDED_COUNTRIES = ['PAK', 'TUR', 'AZE']

# --- PARALLELISM & RATE LIMITING CONFIGURATION ---
# Maximum number of concurrent requests for each service.
# Adjust these based on your server's capacity and each API's rate limits;
# a safe starting point is often between 2 and 5.
WP_MAX_CONCURRENT_REQUESTS = 4
GSC_MAX_CONCURRENT_REQUESTS = 2  # The GSC API has stricter quotas
# Max threads in the pool. Should be at least the sum of the two limits above
# so that neither semaphore starves the other.
MAX_WORKERS = WP_MAX_CONCURRENT_REQUESTS + GSC_MAX_CONCURRENT_REQUESTS


# --- GOOGLE SEARCH CONSOLE API AUTHENTICATION ---
def get_gsc_service():
    """Authenticates with the GSC API, caching credentials in TOKEN_FILE."""
    from google_auth_oauthlib.flow import InstalledAppFlow
    from google.auth.transport.requests import Request
    import pickle

    creds = None
    if os.path.exists(TOKEN_FILE):
        with open(TOKEN_FILE, 'rb') as token:
            creds = pickle.load(token)
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(CLIENT_SECRETS_FILE, SCOPES)
            creds = flow.run_local_server(port=0)
        with open(TOKEN_FILE, 'wb') as token:
            pickle.dump(creds, token)

    from googleapiclient.discovery import build
    return build('searchconsole', 'v1', credentials=creds)


# --- API FUNCTIONS (RATE LIMITED) ---
def get_wp_post_slug_from_url(url):
    """Extracts the slug from a WordPress URL."""
    try:
        path = urlparse(url).path
        slug = path.strip('/').split('/')[-1]
        return unquote(slug)
    except Exception:
        return None


def get_wordpress_post_details(slug, semaphore):
    """Fetches post ID and published date, respecting the semaphore."""
    api_url = f"{WP_BASE_URL}/wp-json/wp/v2/posts?slug={slug}"
    with semaphore:  # Acquire the semaphore before making the request
        try:
            # A small delay helps distribute requests over time
            time.sleep(0.1)
            response = requests.get(api_url, timeout=120)
            response.raise_for_status()
            posts = response.json()
            if posts:
                post = posts[0]
                published_date_str = post.get('date_gmt', post.get('date'))
                if published_date_str:
                    published_date = date_parser.isoparse(published_date_str)
                    # WordPress 'date_gmt' carries no timezone suffix, so treat a
                    # naive timestamp as UTC rather than local time.
                    if published_date.tzinfo is None:
                        published_date = published_date.replace(tzinfo=timezone.utc)
                    return post['id'], published_date.astimezone(timezone.utc)
                return post['id'], None
            return None, None
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"\n[Error] WP details for slug {slug}: {e}")
            return None, None


def delete_wordpress_post(post_id, semaphore):
    """Deletes a WordPress post, respecting the semaphore."""
    api_url = f"{WP_BASE_URL}/wp-json/wp/v2/posts/{post_id}?force=true"
    auth = HTTPBasicAuth(WP_USERNAME, WP_APP_PASSWORD)
    with semaphore:  # Acquire the semaphore before making the request
        try:
            time.sleep(0.1)
            response = requests.delete(api_url, auth=auth, timeout=120)
            response.raise_for_status()
            return True
        except requests.exceptions.RequestException as e:
            print(f"\n[Error] Deleting WP post ID {post_id}: {e}")
            return False
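
# Optional pre-flight credential check (illustrative sketch only; it is not
# called anywhere in the main flow). It assumes the standard WordPress REST
# endpoint /wp-json/wp/v2/users/me, which returns the authenticated user when
# the Application Password above is valid and a 401 otherwise. The function
# name verify_wp_credentials is ours, not part of the original script.
def verify_wp_credentials():
    """Returns True if the configured WordPress Application Password works."""
    api_url = f"{WP_BASE_URL}/wp-json/wp/v2/users/me"
    try:
        response = requests.get(
            api_url,
            auth=HTTPBasicAuth(WP_USERNAME, WP_APP_PASSWORD),
            timeout=30,
        )
        return response.status_code == 200
    except requests.exceptions.RequestException as e:
        print(f"[Error] WP credential check failed: {e}")
        return False
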
def get_gsc_clicks(service, page_url, semaphore):
    """Fetches GSC clicks, respecting the semaphore."""
    today = datetime.now(timezone.utc)
    twelve_months_ago = today - timedelta(days=365)
    request_body = {
        'startDate': twelve_months_ago.strftime('%Y-%m-%d'),
        'endDate': today.strftime('%Y-%m-%d'),
        'dimensions': ['page', 'country'],
        'type': 'web',
        'dimensionFilterGroups': [{
            'filters': [{
                'dimension': 'page',
                'operator': 'equals',
                'expression': page_url
            }]
        }],
        'rowLimit': 25000
    }
    with semaphore:  # Acquire the semaphore before making the request
        try:
            time.sleep(0.2)  # GSC is more sensitive, so use a slightly longer delay
            response = service.searchanalytics().query(siteUrl=GSC_SITE_URL, body=request_body).execute()
            total_clicks = 0
            if 'rows' in response:
                for row in response['rows']:
                    country = row['keys'][1].upper()
                    if country not in EXCLUDED_COUNTRIES:
                        total_clicks += row['clicks']
                return total_clicks
            return 0
        except Exception as e:
            print(f"\n[Error] GSC data for {page_url}: {e}")
            return None


# --- WORKER FUNCTION FOR PARALLEL PROCESSING ---
def process_single_blog(row, gsc_service, semaphores):
    """
    Processes a single blog URL: checks criteria and deletes if necessary.
    This function is executed by each thread in the pool.
    """
    url = row['url']
    status = "not processed"
    slug = get_wp_post_slug_from_url(url)
    if not slug:
        status = "invalid_url_for_slug"
        return {**row.to_dict(), 'status': status}

    wp_post_id, wp_published_date = get_wordpress_post_details(slug, semaphores['wp'])
    if not wp_post_id:
        status = "wp_post_not_found"
        return {**row.to_dict(), 'status': status}
    if not wp_published_date:
        status = "wp_post_found_no_date"
        return {**row.to_dict(), 'status': status}

    # Check age criteria
    age = datetime.now(timezone.utc) - wp_published_date
    if age <= timedelta(days=DAYS_OLD_THRESHOLD):
        status = f"not_deleted_too_new ({age.days} days old)"
        return {**row.to_dict(), 'status': status}

    # Check GSC clicks criteria
    clicks = get_gsc_clicks(gsc_service, url, semaphores['gsc'])
    if clicks is None:
        status = "not_deleted_gsc_api_error"
        return {**row.to_dict(), 'status': status}
    if clicks >= CLICKS_THRESHOLD:
        status = f"not_deleted_high_clicks ({clicks} clicks)"
        return {**row.to_dict(), 'status': status}

    # If all criteria are met, proceed with deletion
    print(f"\n[FLAGGED FOR DELETION] URL: {url} (Age: {age.days} days, Clicks: {clicks})")
    if delete_wordpress_post(wp_post_id, semaphores['wp']):
        status = "deleted"
        print(f"[SUCCESS] Deleted: {url}")
    else:
        status = "delete_failed"
        print(f"[FAILURE] Failed to delete: {url}")

    return {**row.to_dict(), 'status': status}


# --- MAIN SCRIPT LOGIC ---
def main():
    print("Starting parallel blog management script...")

    # Authenticate with Google Search Console first
    try:
        gsc_service = get_gsc_service()
        print("Successfully authenticated with Google Search Console.")
    except Exception as e:
        print(f"FATAL: Failed to authenticate with GSC: {e}")
        return

    # Read the input CSV
    try:
        df = pd.read_csv(CSV_INPUT_FILE)
        if 'url' not in df.columns:
            print(f"FATAL: CSV file '{CSV_INPUT_FILE}' must contain a 'url' column.")
            return
        urls_to_process = [row for _, row in df.iterrows()]
    except FileNotFoundError:
        print(f"FATAL: Input CSV file '{CSV_INPUT_FILE}' not found.")
        return

    # Create semaphores for rate limiting
    semaphores = {
        'wp': threading.Semaphore(WP_MAX_CONCURRENT_REQUESTS),
        'gsc': threading.Semaphore(GSC_MAX_CONCURRENT_REQUESTS)
    }
    results = []
    deleted_urls = []

    # Use a ThreadPoolExecutor to process URLs in parallel
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Create a future for each URL processing task
        future_to_url = {
            executor.submit(process_single_blog, row, gsc_service, semaphores): row['url']
            for row in urls_to_process
        }

        # Process results as they are completed, with a progress bar
        for future in tqdm(concurrent.futures.as_completed(future_to_url),
                           total=len(urls_to_process), desc="Processing Blogs"):
            try:
                result_data = future.result()
                results.append(result_data)
                if result_data.get('status') == 'deleted':
                    deleted_urls.append(result_data['url'])
            except Exception as exc:
                url = future_to_url[future]
                print(f"\n[FATAL_ERROR] URL '{url}' generated an exception: {exc}")
                # Record the failure so the URL still appears in the status report
                results.append({'url': url, 'status': f'thread_exception: {exc}'})

    # --- WRITE OUTPUT FILES ---
    # 1. Write the detailed status report CSV
    if results:
        output_df = pd.DataFrame(results)
        # Reorder columns to have 'url' and 'status' first for clarity
        if 'status' in output_df.columns and 'url' in output_df.columns:
            cols = ['url', 'status'] + [col for col in output_df.columns if col not in ['url', 'status']]
            output_df = output_df[cols]
        output_df.to_csv(CSV_OUTPUT_FILE, index=False)
        print(f"\nProcessing complete. Full status report saved to '{CSV_OUTPUT_FILE}'")

    # 2. Write the log of deleted URLs to a separate file
    if deleted_urls:
        with open(DELETED_LOG_FILE, 'w') as f:
            for url in deleted_urls:
                f.write(f"{url}\n")
        print(f"Successfully deleted {len(deleted_urls)} blogs. Log saved to '{DELETED_LOG_FILE}'")
    else:
        print("No blogs were deleted during this run.")


if __name__ == '__main__':
    main()
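
# --- USAGE (illustrative) ---
# The input CSV only needs a 'url' column; any extra columns are carried through
# unchanged into the status report. A minimal blogs_to_delete.csv might look like
# (the URLs below are hypothetical examples):
#
#   url
#   https://farmonaut.com/blog/some-old-post/
#   https://farmonaut.com/blog/another-old-post/
#
# Run the script directly (the filename is whatever you saved this file as):
#   python delete_old_blogs.py
# On the first run, the GSC OAuth flow opens a browser window and caches the
# resulting credentials in token.json; GSC_SITE_URL must match a property you
# have verified in Search Console.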