import requests
from requests.auth import HTTPBasicAuth
from datetime import datetime, timedelta, timezone
import os
import pandas as pd # For easier CSV handling initially
from urllib.parse import urlparse, unquote
from dateutil import parser as date_parser # For robust date parsing

# --- CONFIGURATION ---
CSV_INPUT_FILE = 'blogs_to_delete.csv'
CSV_OUTPUT_FILE = 'blogs_updated.csv'

# WordPress Configuration
WP_BASE_URL = 'https://farmonaut.com' # Your WordPress site URL (e.g., https://example.com)
WP_USERNAME = 'ankuromar296'      # Your WordPress username
WP_APP_PASSWORD = '2fZW 42ln jhSG qPfm Kx3b hacO' # The Application Password you generated

# Google Search Console Configuration
GSC_SITE_URL = 'https://farmonaut.com' # Or 'https://yourdomain.com/' if URL prefix property
# If using a domain property, it's 'sc-domain:yourdomain.com'
# If using a URL prefix property, it's 'https://www.yourdomain.com/' or 'http://...'
# Make sure this matches exactly what's in your Search Console
SCOPES = ['https://www.googleapis.com/auth/webmasters.readonly']
CLIENT_SECRETS_FILE = 'credentials.json' # Downloaded from Google Cloud Console
TOKEN_FILE = 'token.json'

# Logic Configuration
DAYS_OLD_THRESHOLD = 30
CLICKS_THRESHOLD = 10
EXCLUDED_COUNTRIES = ['PAK', 'TUR', 'AZE'] # ISO 3166-1 alpha-3 country codes

# --- GOOGLE SEARCH CONSOLE API AUTHENTICATION ---
def get_gsc_service():
    from google_auth_oauthlib.flow import InstalledAppFlow
    from google.auth.transport.requests import Request
    from google.oauth2.credentials import Credentials
    import pickle # Using pickle for token.json with google-auth, can also use json

    creds = None
    if os.path.exists(TOKEN_FILE):
        with open(TOKEN_FILE, 'rb') as token:
            creds = pickle.load(token) # Or json.load if you saved as json
    
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(CLIENT_SECRETS_FILE, SCOPES)
            creds = flow.run_local_server(port=0)
        with open(TOKEN_FILE, 'wb') as token: # Or 'w' for json
            pickle.dump(creds, token) # Or json.dump

    from googleapiclient.discovery import build
    return build('searchconsole', 'v1', credentials=creds)

# --- WORDPRESS API FUNCTIONS ---
def get_wp_post_slug_from_url(url):
    """Extracts the slug from a WordPress URL."""
    path = urlparse(url).path
    # Remove leading/trailing slashes and get the last part
    slug = path.strip('/').split('/')[-1]
    return unquote(slug) # Handle URL encoded characters in slug

def get_wordpress_post_details(slug):
    """Fetches post ID and published date using the slug."""
    api_url = f"{WP_BASE_URL}/wp-json/wp/v2/posts?slug={slug}"
    try:
        response = requests.get(api_url, timeout=10)
        response.raise_for_status()
        posts = response.json()
        if posts:
            post = posts[0] # Assuming slug is unique
            # Use date_gmt for timezone consistency
            published_date_str = post.get('date_gmt')
            if not published_date_str:
                 published_date_str = post.get('date') # Fallback
            
            if published_date_str:
                # WordPress dates are ISO 8601 format e.g. "2023-10-26T14:30:00"
                # Parse it and make it timezone-aware (UTC)
                published_date = date_parser.isoparse(published_date_str)
                if published_date.tzinfo is None: # if not timezone aware, assume UTC as per date_gmt
                    published_date = published_date.replace(tzinfo=timezone.utc)
                else: # ensure it's converted to UTC
                    published_date = published_date.astimezone(timezone.utc)

                return post['id'], published_date
            else:
                print(f"Warning: Could not find published date for slug {slug}.")
                return post['id'], None
        else:
            print(f"No WordPress post found for slug: {slug}")
            return None, None
    except requests.exceptions.RequestException as e:
        print(f"Error fetching WordPress post for slug {slug}: {e}")
        return None, None
    except ValueError as e: # JSON decoding error
        print(f"Error decoding JSON for slug {slug}: {e}")
        return None, None

def delete_wordpress_post(post_id):
    """Deletes a WordPress post by its ID."""
    api_url = f"{WP_BASE_URL}/wp-json/wp/v2/posts/{post_id}?force=true" # force=true bypasses trash
    auth = HTTPBasicAuth(WP_USERNAME, WP_APP_PASSWORD)
    response = None
    try:
        response = requests.delete(api_url, auth=auth, timeout=15)
        response.raise_for_status()
        print(f"Successfully deleted WordPress post ID: {post_id}")
        return True
    except requests.exceptions.RequestException as e:
        print(f"Error deleting WordPress post ID {post_id}: {e}")
        if response is not None:
            print(f"Response content: {response.text}")
        return False

# --- GOOGLE SEARCH CONSOLE API FUNCTION ---
def get_gsc_clicks(service, page_url):
    """Fetches total clicks for a specific URL in the last 12 months, excluding certain countries."""
    today = datetime.now(timezone.utc)
    twelve_months_ago = today - timedelta(days=365)
    
    start_date_str = twelve_months_ago.strftime('%Y-%m-%d')
    end_date_str = today.strftime('%Y-%m-%d')

    # Filters for country exclusion
    country_filters = []
    for country_code in EXCLUDED_COUNTRIES:
        country_filters.append({
            'dimension': 'country',
            'operator': 'notContains',
            'expression': country_code
        })
    
    request_body = {
        'startDate': start_date_str,
        'endDate': end_date_str,
        'dimensions': ['page'], # We are already filtering by page, so sum of clicks is fine
        'type': 'web',
        'dimensionFilterGroups': [
            { # Group for page URL
                'filters': [
                    {
                        'dimension': 'page',
                        'operator': 'equals',
                        'expression': page_url
                    }
                ]
            },
            { # Group for country exclusions (all must be true within this group)
              # This might not work as expected for multiple "notContains".
              # A better way might be to get global clicks and then subtract clicks from excluded countries,
              # or filter client-side if the API doesn't support multiple 'notContains' easily.
              # For now, let's try with multiple filters.
              # Actually, GSC API supports only one filter per dimension in a group.
              # So, we can't do country != A AND country != B AND country != C directly.
              # The API documentation for `dimensionFilterGroups` says:
              # "All filters in a filter group are connected by an AND operation."
              # "All filter groups are connected by an OR operation."
              # This means we CANNOT simply list multiple "notContains" for country.
              #
              # The most straightforward way is to fetch ALL clicks for the page,
              # then make separate queries for each excluded country and subtract.
              # OR, fetch data with 'country' dimension and sum up manually.
              # Let's try fetching with country dimension and filtering in Python.
              'filters': country_filters # This will likely be treated as OR by API if in same group.
                                          # Let's revise this part.
            }
        ],
        'aggregationType': 'auto' # Let GSC decide, or use 'byPage'
    }

    # Revised approach for country exclusion: Fetch data with 'country' dimension
    # and sum clicks from non-excluded countries.
    request_body_revised = {
        'startDate': start_date_str,
        'endDate': end_date_str,
        'dimensions': ['page', 'country'], # Add country dimension
        'type': 'web',
        'dimensionFilterGroups': [
            {
                'filters': [
                    {
                        'dimension': 'page',
                        'operator': 'equals',
                        'expression': page_url
                    }
                ]
            }
        ],
        'rowLimit': 25000 # Max rows, ensure we get all countries for the page
    }

    try:
        response = service.searchanalytics().query(
            siteUrl=GSC_SITE_URL,
            body=request_body_revised
        ).execute()

        total_clicks = 0
        if 'rows' in response:
            for row in response['rows']:
                # row['keys'] will be [page_url, country_code]
                country = row['keys'][1].upper() # GSC returns lowercase country codes e.g. 'usa'
                if country not in EXCLUDED_COUNTRIES:
                    total_clicks += row['clicks']
            print(f"GSC: Found {total_clicks} clicks for {page_url} (excluding {', '.join(EXCLUDED_COUNTRIES)})")
            return total_clicks
        else:
            print(f"GSC: No data found for {page_url} in the last 12 months.")
            return 0
    except Exception as e:
        print(f"Error fetching GSC data for {page_url}: {e}")
        return None # Indicate an error occurred

# --- MAIN SCRIPT LOGIC ---
def main():
    print("Starting blog management script...")
    
    # Authenticate with Google Search Console
    try:
        gsc_service = get_gsc_service()
        print("Successfully authenticated with Google Search Console.")
    except Exception as e:
        print(f"Failed to authenticate with GSC: {e}")
        print("Please ensure 'credentials.json' is set up correctly and you've authorized the app.")
        return

    # Read CSV
    try:
        df = pd.read_csv(CSV_INPUT_FILE)
        if 'url' not in df.columns:
            print(f"Error: CSV file '{CSV_INPUT_FILE}' must contain a 'url' column.")
            return
    except FileNotFoundError:
        print(f"Error: Input CSV file '{CSV_INPUT_FILE}' not found.")
        return
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return

    output_rows = []
    
    # Process each URL
    for index, row in df.iterrows():
        url = row['url']
        print(f"\nProcessing URL: {url}")
        status = "not processed"
        original_row_dict = row.to_dict()

        slug = get_wp_post_slug_from_url(url)
        if not slug:
            print(f"Could not extract slug from URL: {url}")
            original_row_dict['status'] = "invalid URL for slug extraction"
            output_rows.append(original_row_dict)
            continue

        wp_post_id, wp_published_date_dt = get_wordpress_post_details(slug)

        if wp_post_id and wp_published_date_dt:
            # Check age
            now_utc = datetime.now(timezone.utc)
            age = now_utc - wp_published_date_dt
            is_older_than_threshold = age > timedelta(days=DAYS_OLD_THRESHOLD)
            
            print(f"WP Post ID: {wp_post_id}, Published: {wp_published_date_dt.strftime('%Y-%m-%d')}, Age: {age.days} days.")
            print(f"Is older than {DAYS_OLD_THRESHOLD} days? {is_older_than_threshold}")

            if is_older_than_threshold:
                clicks = get_gsc_clicks(gsc_service, url)
                
                if clicks is not None: # Clicks could be 0, None means API error
                    print(f"Clicks (last 12m, filtered): {clicks}")
                    if clicks < CLICKS_THRESHOLD:
                        print(f"FLAGGED: Post is older than {DAYS_OLD_THRESHOLD} days AND has < {CLICKS_THRESHOLD} clicks.")
                        user_confirm = 'y'
                        #user_confirm = input(f"Delete post '{url}' (ID: {wp_post_id})? [y/N]: ").strip().lower()
                        if user_confirm == 'y':
                            if delete_wordpress_post(wp_post_id):
                                status = "deleted"
                            else:
                                status = "delete_failed"
                        else:
                            print("Skipping deletion.")
                            status = "not_deleted_user_skip"
                    else:
                        status = "not_deleted_criteria_not_met (clicks ok)"
                else:
                    status = "not_deleted_gsc_error" # Error fetching GSC data
            else:
                status = "not_deleted_criteria_not_met (too new)"
        elif wp_post_id and not wp_published_date_dt:
            status = "wp_post_found_no_date"
        else:
            status = "wp_post_not_found"

        original_row_dict['status'] = status
        output_rows.append(original_row_dict)

    # Write updated CSV
    if output_rows:
        output_df = pd.DataFrame(output_rows)
        try:
            output_df.to_csv(CSV_OUTPUT_FILE, index=False)
            print(f"\nProcessing complete. Updated CSV saved to '{CSV_OUTPUT_FILE}'")
        except Exception as e:
            print(f"Error writing output CSV: {e}")
    else:
        print("No rows were processed to write to output.")

if __name__ == '__main__':
    main()