import pandas as pd
import requests
from requests.auth import HTTPBasicAuth
import json
import openai
import re
import unicodedata
import time
from bs4 import BeautifulSoup
import csv
from urllib.parse import urlparse

# Replace these with your actual credentials
site_url = "https://www.farmonaut.com"
username = "ankuromar296"
app_password = "Tjat A2hz 9XMv pXJi YbV0 GR8o"

auth = HTTPBasicAuth(username, app_password)
headers = {"Content-Type": "application/json"}


def add_redirect(source, target):
    """Create a redirect rule via the site's custom redirects REST endpoint."""
    payload = {"source": source, "target": target}
    resp = requests.post(
        f"{site_url}/wp-json/my-redirects/v1/rule",
        json=payload,
        auth=auth,
        headers=headers,
    )
    resp.raise_for_status()
    return resp.json()


def update_blog_post(
    site_url,
    username,
    app_password,
    post_id,
    new_slug=None,
    new_title=None,
    new_excerpt=None,
    new_content=None,
):
    """Update a WordPress post via the REST API, sending only the fields provided."""
    url = f"{site_url}/wp-json/wp/v2/posts/{post_id}"
    auth = HTTPBasicAuth(username, app_password)

    # Build the payload only with non-None values
    payload = {}
    if new_slug is not None:
        payload["slug"] = new_slug
    if new_title is not None:
        payload["title"] = new_title
    if new_excerpt is not None:
        payload["excerpt"] = new_excerpt
    if new_content is not None:
        payload["content"] = new_content

    # If nothing to update, return early
    if not payload:
        return {"status": "skipped", "reason": "No data to update."}

    response = requests.post(url, auth=auth, json=payload)
    return response


def extract_slug_and_category(url):
    """Return the leading-slash slug and the first path segment (category) of a URL."""
    parsed_url = urlparse(url)
    slug = parsed_url.path.strip("/")
    parts = slug.split("/")
    slug = "/" + slug
    category = parts[0] if parts else ""
    return slug, category


def append_to_csv(file_path, row):
    """
    Appends a single row to a CSV file.

    Args:
        file_path (str): Path to the CSV file.
        row (list): The row to append.
    """
    with open(file_path, "a", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(row)


def getDictFromFile(path):
    """Load and return JSON data from a file."""
    with open(path, "r", encoding="utf-8") as f:
        return json.loads(f.read())


def saveJsonDataToFile(path, data):
    """Write data to a file as pretty-printed JSON."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


def is_valid_json(s):
    """
    Checks if the given string `s` is valid JSON.
    Returns (True, parsed_json) if valid, otherwise (False, error message).
    """
    try:
        parsed = json.loads(s)
        return True, parsed
    except json.JSONDecodeError as e:
        return False, str(e)
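
# Illustrative usage sketch (not part of the original flow): how the two WordPress
# helpers above might be combined when a post's slug changes. The post ID and slugs
# below are hypothetical placeholders, and the calls stay commented out so the
# script's behavior is unchanged.
#
# resp = update_blog_post(
#     site_url, username, app_password, post_id=12345, new_slug="example-new-slug"
# )
# if getattr(resp, "ok", False):
#     # Redirect the old path to the new one so existing links keep resolving
#     add_redirect("/blog/example-old-slug/", "/blog/example-new-slug/")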
""" try: parsed = json.loads(s) return True, parsed except json.JSONDecodeError as e: return False, str(e) def call_llm(prompt): try: response = client.chat.completions.create( # model="gpt-4.1-nano-2025-04-14", model="gpt-4.1-mini-2025-04-14", messages=[ { "role": "system", "content": "You are a helpful assistant that returns structured JSON for relevance classification.", }, {"role": "user", "content": prompt}, ], temperature=0, ) text = response.choices[0].message.content is_json, json_output = is_valid_json(text) return {"json_output": json_output, "is_json": is_json} except Exception as e: print("Error calling OpenAI LLM:", e) return {"relevant": "Unknown", "reason": f"Error: {e}"} def count_words_in_slug(slug): if "_" in slug: return len(slug.split("_")) elif "-" in slug: return len(slug.split("-")) else: return 1 if slug else 0 def title_to_slug(title): # Normalize to remove accents (e.g., á -> a) title = ( unicodedata.normalize("NFKD", title).encode("ascii", "ignore").decode("utf-8") ) # Lowercase title = title.lower() # Replace any non-alphanumeric characters with hyphens title = re.sub(r"[^a-z0-9]+", "-", title) # Remove leading/trailing hyphens title = title.strip("-") return title def count_path_segments(url): parsed = urlparse(url) # Remove leading/trailing slashes and split by "/" path_segments = [seg for seg in parsed.path.strip("/").split("/") if seg] return len(path_segments) df_new = pd.read_csv("all_pages_by_date_GSC.csv") df_all_urls = df_new.groupby("clean_url")["clicks"].sum().reset_index() df_all_urls["clean_url"] = df_all_urls["clean_url"].astype(str) url_to_clicks = dict(zip(df_all_urls["clean_url"], df_all_urls["clicks"])) dict_all_blogs = getDictFromFile("blog_posts.json") df_all_blogs = pd.DataFrame.from_dict(dict_all_blogs, orient="index") df_all_blogs["slug_length"] = df_all_blogs["slug"].str.len() df_all_blogs["title_length"] = df_all_blogs["title"].str.len() df_all_blogs["meta_description_length"] = df_all_blogs["meta_description"].str.len() # Apply to create the 'clicks' column df_all_blogs["clicks"] = df_all_blogs["url"].apply( lambda url: url_to_clicks.get(url, 0) ) df_all_blogs["path_segments"] = df_all_blogs["url"].apply(count_path_segments) df_all_blogs = df_all_blogs.drop_duplicates(subset="blog_id") filtered_df = df_all_blogs[ (df_all_blogs["path_segments"] > 1) & ( df_all_blogs["title"].notna() | (df_all_blogs["slug_length"] > 75) | (df_all_blogs["title_length"] > 70) | (df_all_blogs["meta_description_length"] > 150) ) ] # print(filtered_df.head(5)) filtered_index_list = filtered_df.index.tolist() print("Blog Posts to Update", len(filtered_index_list), filtered_index_list) try: processed_df = pd.read_csv("processed_indices.csv", header=None) first_column_name = processed_df.columns[0] processed_list = processed_df[first_column_name].tolist() except: processed_list = list() try: unprocessed_df = pd.read_csv("unprocessed_indices.csv", header=None) first_column_name = unprocessed_df.columns[0] unprocessed_indices = unprocessed_df[first_column_name].tolist() except: unprocessed_indices = list() print("Already Processed", processed_list) print("Already UnProcessed", unprocessed_indices) # Set your OpenAI API key API_KEY = "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" # Replace with your actual OpenAI API key client = openai.OpenAI(api_key=API_KEY) # for key, blog_post_data in dict_all_blogs.items(): for index in filtered_index_list: # 
index = "5078" print("Processing Index", index) if (int(index) not in processed_list) and (int(index) not in unprocessed_indices): blog_post_data = df_all_blogs.loc[index] url_blog = blog_post_data["url"] slug_blog = blog_post_data.get("slug", "") title_blog = blog_post_data.get("title", "") meta_description_blog = blog_post_data.get("meta_description", "") id_blog = blog_post_data.get("blog_id", None) content_blog = blog_post_data.get("content", "") clicks = blog_post_data.get("clicks", 0) path_segments = count_path_segments(url_blog) if ( url_blog and slug_blog and content_blog and count_words_in_slug(slug_blog) > 3 and path_segments > 1 ): prompt = f""" You are an expert SEO assistant. I will provide you with a blog's **title** and **meta_description**. Your task is to **rewrite all elements to maximize SEO effectiveness** based on best practices. SEO Guidelines to follow: **Title Optimization:** - Must be under 53 characters. - Place the **primary focus keyword** within the **first 50%** of the title. - Use **power words** and **emotional triggers** to make it more engaging. - Include **numbers** if relevant to increase specificity and clickability. - Avoid clickbait but make it compelling enough to boost CTR. **Slug Optimization:** - Keep it under 63 characters. - Ensure it includes the focus keyword and is **lowercase**, **hyphen-separated**, and **free from stop words** (like “and”, “the”, etc.) - Make it concise, descriptive, and easily readable. **Meta Description Optimization:** - Between 75 and 135 characters. - Include the **primary focus keyword** naturally. - Write it in an informative and persuasive tone to encourage users to click. Here is the input: title: {title_blog} slug : {slug_blog} meta_description: {meta_description_blog} Return the optimized result in the following JSON format with keys: "title": "...", "meta_description": "..." """ innerloopcount = 0 while True: # Call your LLM here using the prompt json_output_llm = call_llm(prompt) # json_output_llm = { # "json_output": { # "title": "New Title", # "meta_description": "New Meta Description", # }, # "is_json": True, # } if ( json_output_llm.get("is_json") is True and isinstance(json_output_llm.get("json_output"), dict) and all( key in json_output_llm["json_output"] and json_output_llm["json_output"][key] for key in ["title", "meta_description"] ) ): break else: ## TODO :- Should not get stuck here print( f"Retrying for URL after 3 sec: {index},{url_blog}", json_output_llm, ) time.sleep( 3 ) # Optional delay between retries to avoid hammering the API innerloopcount = innerloopcount + 1 if innerloopcount > 3: append_to_csv("unprocessed_indices.csv", [index]) unprocessed_indices.append(index) break new_title = json_output_llm["json_output"]["title"] new_meta_description = json_output_llm["json_output"]["meta_description"] # if clicks > 10: # new_slug = None # else: new_slug = title_to_slug(new_title) ######## Remove H1 tags # Find all