#!/usr/bin/env python3
"""
WordPress SEO Updater for Rank Math using GPT-4o-mini

This script reads a CSV/Excel file containing page URLs and their ranking queries,
generates a new SEO title and meta description using OpenAI's API, and updates
the corresponding post in WordPress via its REST API, targeting Rank Math's fields.

It logs its progress to an Excel file to avoid re-processing posts if re-run.
Now includes tracking of failed updates and graceful handling of API timeouts.
"""

import os
import re
import json
import requests
import pandas as pd
from openai import OpenAI
from datetime import datetime
from urllib.parse import urlparse
import time

# --- CONFIGURATION ---
# --- Fill in these details before running the script ---

# WordPress Configuration
WP_BASE_URL = "https://farmonaut.com"  # e.g., "https://example.com"
WP_USERNAME = "ankuromar296"    # Your WordPress username
WP_APPLICATION_PASSWORD = "2fZW 42ln jhSG qPfm Kx3b hacO" # The Application Password you generated

# OpenAI Configuration
OPENAI_API_KEY = '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' # Your OpenAI API key

# File Configuration
INPUT_FILE = "gsc_analysis_report_20250610_024030.csv" # The file from the previous script
LOG_FILE = "updated_seo_log_rankmath.xlsx" # The file to store results and track progress
FAILED_LOG_FILE = "failed_updates_log.xlsx" # File to track failed updates

# --- END OF CONFIGURATION ---


def load_processed_urls(log_file):
    """Loads the URLs of posts that have already been processed."""
    if not os.path.exists(log_file):
        return set()
    try:
        df = pd.read_excel(log_file)
        return set(df['Page URL'].tolist())
    except Exception as e:
        print(f"⚠️ Warning: Could not read log file {log_file}. Starting fresh. Error: {e}")
        return set()

def load_failed_urls(failed_log_file):
    """Loads the URLs of posts that have failed to update and should be skipped."""
    if not os.path.exists(failed_log_file):
        return set()
    try:
        df = pd.read_excel(failed_log_file)
        return set(df['Page URL'].tolist())
    except Exception as e:
        print(f"⚠️ Warning: Could not read failed log file {failed_log_file}. Error: {e}")
        return set()

def save_failed_update(page_url, reason, failed_log_file):
    """Saves a failed update record to the failed log file."""
    try:
        # Load existing failed records
        if os.path.exists(failed_log_file):
            failed_df = pd.read_excel(failed_log_file)
        else:
            failed_df = pd.DataFrame(columns=['Page URL', 'Failure Reason', 'Date Failed', 'Attempts'])
        
        # Check if this URL already exists in failed records
        if page_url in failed_df['Page URL'].values:
            # Update existing record
            mask = failed_df['Page URL'] == page_url
            failed_df.loc[mask, 'Failure Reason'] = reason
            failed_df.loc[mask, 'Date Failed'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            failed_df.loc[mask, 'Attempts'] = failed_df.loc[mask, 'Attempts'] + 1
        else:
            # Add new failed record
            new_record = pd.DataFrame({
                'Page URL': [page_url],
                'Failure Reason': [reason],
                'Date Failed': [datetime.now().strftime("%Y-%m-%d %H:%M:%S")],
                'Attempts': [1]
            })
            failed_df = pd.concat([failed_df, new_record], ignore_index=True)
        
        # Save to file
        failed_df.to_excel(failed_log_file, index=False)
        print(f"  -> 📝 Failed update logged to '{failed_log_file}'.")
        
    except Exception as e:
        print(f"  -> ⚠️ Warning: Could not save failed update log. Error: {e}")

def get_slug_from_url(page_url):
    """
    Extracts the final slug from a full URL, intelligently ignoring categories 
    or parent paths in the permalink structure.
    """
    if not isinstance(page_url, str):
        return None
    try:
        path = urlparse(page_url).path
        
        # Split the path by '/' and filter out any empty strings that result
        # from leading/trailing slashes (e.g., '/blog/my-post/' -> ['', 'blog', 'my-post', ''])
        path_parts = [part for part in path.split('/') if part]
        
        # The actual slug is the very last non-empty part of the path.
        if path_parts:
            slug = path_parts[-1]
            return slug
        else:
            # This handles the case of the root URL "/" (the homepage)
            return None
    except Exception:
        # General catch-all for any other parsing errors
        return None

def get_post_id_from_slug(slug, page_url):
    """Finds the WordPress post ID for a given slug."""
    if not slug:
        save_failed_update(page_url, "Could not extract slug from URL", FAILED_LOG_FILE)
        return None
    
    try:
        # We ask for posts, but this can also find pages if the slug is unique
        api_url = f"{WP_BASE_URL}/wp-json/wp/v2/posts?slug={slug}"
        response = requests.get(api_url, timeout=15)
        response.raise_for_status()
        posts = response.json()
        
        if posts and isinstance(posts, list):
            return posts[0].get('id')
        else:
            # If not found in posts, check pages
            print(f"  -> Post not found for slug '{slug}'. Checking pages...")
            api_url_pages = f"{WP_BASE_URL}/wp-json/wp/v2/pages?slug={slug}"
            response_pages = requests.get(api_url_pages, timeout=15)
            response_pages.raise_for_status()
            pages = response_pages.json()
            
            if pages and isinstance(pages, list):
                return pages[0].get('id')

            print(f"  -> ❌ Post or Page not found for slug: {slug}")
            save_failed_update(page_url, f"Post/Page not found for slug: {slug}", FAILED_LOG_FILE)
            return None
            
    except requests.exceptions.Timeout:
        print(f"  -> ⏰ Timeout while fetching post/page ID for slug '{slug}'")
        save_failed_update(page_url, f"Timeout while fetching post/page ID for slug: {slug}", FAILED_LOG_FILE)
        return None
    except requests.exceptions.RequestException as e:
        print(f"  -> ❌ Error fetching post/page ID for slug '{slug}': {e}")
        save_failed_update(page_url, f"API error while fetching post/page ID: {str(e)}", FAILED_LOG_FILE)
        return None

def generate_seo_content(slug, queries, client, page_url):
    """Generates SEO title and meta description using OpenAI."""
    print(f"  -> 🤖 Generating SEO content for slug: {slug}...")
    
    prompt = f"""
You are an expert SEO copywriter for the Farmonaut - an ai spacetech agritech company. Your task is to write a compelling SEO title and meta description for a blog post.

**Context:**
- The post is already ranking for these unique user queries: `{queries}`
- Current Slug: {slug}
**Instructions:**
1.  **SEO Title:** Create a concise, click-worthy title. It MUST be under 55 characters. Incorporate the primary theme from the queries.
2.  **Meta Description:** Write an engaging meta description that encourages clicks. It MUST be under 160 characters. Summarize the content and include a call-to-action if appropriate.
3.  **Format:** Return your response as a single, valid JSON object with two keys: "title" and "description". Do not include any other text or explanations.
4. **Important:** Mandatorily ensure to fit in as many keywords/ queries within the title and description.
Example response:
{{
  "title": "Your New SEO Title Here",
  "description": "Your new, compelling meta description goes in this spot."
}}
"""
    try:
        response = client.chat.completions.create(
            model="gpt-4.1-mini-2025-04-14",
            messages=[{"role": "user", "content": prompt}],
            temperature=1,
            response_format={"type": "json_object"}
        )
        content = response.choices[0].message.content
        seo_data = json.loads(content)

        if "title" in seo_data and "description" in seo_data:
            print("  -> ✅ SEO content generated successfully.")
            return seo_data
        else:
            print("  -> ❌ Generated JSON is missing 'title' or 'description' key.")
            save_failed_update(page_url, "OpenAI generated invalid JSON format", FAILED_LOG_FILE)
            return None

    except Exception as e:
        print(f"  -> ❌ An error occurred with OpenAI API: {e}")
        save_failed_update(page_url, f"OpenAI API error: {str(e)}", FAILED_LOG_FILE)
        return None

# --- MODIFIED FUNCTION FOR RANK MATH ---
def update_rank_math_seo(post_id, title, description, page_url):
    """Updates the Rank Math SEO title and meta description for a given post ID."""
    api_url = f"{WP_BASE_URL}/wp-json/wp/v2/posts/{post_id}"
    
    # This payload is specific to Rank Math's REST API fields.
    # It updates the post's meta fields directly.
    # payload = {
    #     "meta": {
    #         "rank_math_title": title,
    #         "rank_math_description": description
    #     }
    # }
    payload = {
        "title": title,
        "excerpt": description
    }
    
    headers = {'Content-Type': 'application/json'}
    
    try:
        print(f"  -> ⬆️  Updating post ID: {post_id} with Rank Math data...")
        response = requests.post(
            api_url,
            auth=(WP_USERNAME, WP_APPLICATION_PASSWORD),
            headers=headers,
            json=payload,
            timeout=30  # Increased timeout
        )
        response.raise_for_status()
        print("  -> ✅ Post updated successfully on WordPress.")
        return True
        
    except requests.exceptions.Timeout:
        print(f"  -> ⏰ Timeout while updating post ID {post_id}")
        save_failed_update(page_url, f"Timeout while updating post ID: {post_id}", FAILED_LOG_FILE)
        return False
        
    except requests.exceptions.RequestException as e:
        error_message = str(e)
        if 'response' in locals() and response is not None and response.text:
            error_message += f"\n  -> Server response: {response.text}"
        
        # Check if the error is due to an invalid post type (e.g., trying to update a page via the /posts endpoint)
        if "rest_post_invalid_id" in error_message:
             print("  -> Retrying with /pages endpoint...")
             api_url = f"{WP_BASE_URL}/wp-json/wp/v2/pages/{post_id}"
             try:
                response = requests.post(
                    api_url,
                    auth=(WP_USERNAME, WP_APPLICATION_PASSWORD),
                    headers=headers,
                    json=payload,
                    timeout=30
                )
                response.raise_for_status()
                print("  -> ✅ Page updated successfully on WordPress.")
                return True
                
             except requests.exceptions.Timeout:
                print(f"  -> ⏰ Timeout while updating page ID {post_id}")
                save_failed_update(page_url, f"Timeout while updating page ID: {post_id}", FAILED_LOG_FILE)
                return False
                
             except requests.exceptions.RequestException as page_e:
                print(f"  -> ❌ Failed to update page ID {post_id} as well. Error: {page_e}")
                if 'response' in locals() and response is not None and response.text:
                    print(f"  -> Server response: {response.text}")
                save_failed_update(page_url, f"Failed to update both post and page ID {post_id}: {str(page_e)}", FAILED_LOG_FILE)
                return False
        else:
            print(f"  -> ❌ Failed to update WordPress post ID {post_id}. Error: {error_message}")
            save_failed_update(page_url, f"Failed to update post ID {post_id}: {error_message}", FAILED_LOG_FILE)
            return False


def main():
    """Main function to run the SEO update process."""
    print("🚀 Starting WordPress SEO Updater for Rank Math")

    # --- Initial Checks ---
    if "your-website.com" in WP_BASE_URL:
        print("❌ CRITICAL: Please update 'WP_BASE_URL' in the configuration.")
        return
    if not os.path.exists(INPUT_FILE):
        print(f"❌ CRITICAL: Input file '{INPUT_FILE}' not found. Please check the 'File Configuration'.")
        return

    # Initialize OpenAI client
    try:
        client = OpenAI(api_key=OPENAI_API_KEY)
    except Exception as e:
        print(f"❌ CRITICAL: Failed to initialize OpenAI client. Check your API key. Error: {e}")
        return

    # --- Load Data and Progress ---
    processed_urls = load_processed_urls(LOG_FILE)
    failed_urls = load_failed_urls(FAILED_LOG_FILE)
    print(f"📖 Found {len(processed_urls)} previously processed URLs in '{LOG_FILE}'.")
    print(f"⚠️ Found {len(failed_urls)} previously failed URLs in '{FAILED_LOG_FILE}' (will be skipped).")
    
    try:
        df = pd.read_csv(INPUT_FILE) if INPUT_FILE.endswith('.csv') else pd.read_excel(INPUT_FILE)
    except Exception as e:
        print(f"❌ CRITICAL: Could not read input file '{INPUT_FILE}'. Error: {e}")
        return

    # Check for required columns
    required_cols = ['Page URL', 'Unique Queries']
    if not all(col in df.columns for col in required_cols):
        print(f"❌ CRITICAL: Input file must contain the columns: {required_cols}")
        return

    # --- Main Processing Loop ---
    newly_processed_data = []
    total_processed = 0
    total_skipped_already_done = 0
    total_skipped_failed = 0
    total_successful = 0
    total_failed_new = 0
    
    for index, row in df.iterrows():
        page_url = row['Page URL']
        unique_queries = row['Unique Queries']

        print("-" * 50)
        print(f"Processing row {index + 1}/{len(df)}: {page_url}")
        total_processed += 1

        if page_url in processed_urls:
            print("  -> ⏭️  Skipping, already processed successfully.")
            total_skipped_already_done += 1
            continue
            
        if page_url in failed_urls:
            print("  -> ⏭️  Skipping, previously failed (check failed_updates_log.xlsx).")
            total_skipped_failed += 1
            continue
        
        if pd.isna(unique_queries) or not str(unique_queries).strip():
            print("  -> ⏭️  Skipping, no queries to process.")
            save_failed_update(page_url, "No queries provided", FAILED_LOG_FILE)
            continue

        slug = get_slug_from_url(page_url)
        if not slug:
            print("  -> ⏭️  Skipping, could not extract slug from URL.")
            continue

        seo_data = generate_seo_content(slug, unique_queries, client, page_url)
        if not seo_data:
            total_failed_new += 1
            continue

        new_title = seo_data['title']
        new_description = seo_data['description']
        print(f"  -> New Title: {new_title}")
        print(f"  -> New Description: {new_description}")

        post_id = get_post_id_from_slug(slug, page_url)
        if not post_id:
            total_failed_new += 1
            continue

        # Use the Rank Math specific update function
        update_successful = update_rank_math_seo(post_id, new_title, new_description, page_url)

        if update_successful:
            total_successful += 1
            newly_processed_data.append({
                "Page URL": page_url,
                "New Title": new_title,
                "New Description": new_description,
                "Date Updated": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            })

            # Save progress every 5 successful updates
            if len(newly_processed_data) >= 5:
                try:
                    log_df_new = pd.DataFrame(newly_processed_data)
                    log_df_old = pd.read_excel(LOG_FILE) if os.path.exists(LOG_FILE) else pd.DataFrame()
                    log_df_combined = pd.concat([log_df_old, log_df_new]).drop_duplicates(subset=['Page URL'], keep='last')
                    log_df_combined.to_excel(LOG_FILE, index=False)
                    print(f"  -> ✅ Progress saved to '{LOG_FILE}' (batch of {len(newly_processed_data)} updates).")
                    newly_processed_data = []  # Clear the list after saving
                except Exception as e:
                    print(f"  -> ⚠️ Warning: Could not save progress. Error: {e}")
        else:
            total_failed_new += 1

        # Add a small delay to avoid overwhelming the API
        time.sleep(1)

    # Save any remaining processed data
    if newly_processed_data:
        try:
            log_df_new = pd.DataFrame(newly_processed_data)
            log_df_old = pd.read_excel(LOG_FILE) if os.path.exists(LOG_FILE) else pd.DataFrame()
            log_df_combined = pd.concat([log_df_old, log_df_new]).drop_duplicates(subset=['Page URL'], keep='last')
            log_df_combined.to_excel(LOG_FILE, index=False)
            print(f"  -> ✅ Final progress saved to '{LOG_FILE}'.")
        except Exception as e:
            print(f"  -> ⚠️ Warning: Could not save final progress. Error: {e}")

    print("-" * 50)
    print("📊 SUMMARY:")
    print(f"  Total URLs processed: {total_processed}")
    print(f"  Already completed (skipped): {total_skipped_already_done}")
    print(f"  Previously failed (skipped): {total_skipped_failed}")
    print(f"  Successful updates: {total_successful}")
    print(f"  New failures: {total_failed_new}")
    print(f"  Success rate: {(total_successful / max(total_processed - total_skipped_already_done - total_skipped_failed, 1)) * 100:.1f}%")
    print("✅ Script finished. Check logs for details.")
    print(f"   - Success log: {LOG_FILE}")
    print(f"   - Failed updates log: {FAILED_LOG_FILE}")

if __name__ == "__main__":
    main()