import requests
from requests.auth import HTTPBasicAuth
import json
from urllib.parse import urlparse
from bs4 import BeautifulSoup

# Replace these with your actual details
site_url = "https://www.farmonaut.com"
username = "ankuromar296"
app_password = "Tjat A2hz 9XMv pXJi YbV0 GR8o"


def getDictFromFile(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.loads(f.read())


def saveJsonDataToFile(path, data):
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


def count_words_in_slug(slug):
    # Slugs are usually hyphen-separated; handle underscores as a fallback.
    if "_" in slug:
        return len(slug.split("_"))
    elif "-" in slug:
        return len(slug.split("-"))
    else:
        return 1 if slug else 0


def extract_image_alt_text(content):
    """
    Extracts the alt text and source URL of every image tag in the given HTML content.

    Args:
        content (str): The HTML content to parse.

    Returns:
        list: A list of dicts, each with "alt" and "url" keys.
              Returns an empty list if no images are found.
    """
    excluded_alts = {
        # "Farmonaut Web App",
        # "Farmonaut Android App",
        # "Farmonaut iOS App",
        # "Satellite Imagery",
        # "pinterest",
        # "tumblr",
    }
    results = []
    soup = BeautifulSoup(content, "html.parser")  # Parse the HTML
    img_tags = soup.find_all("img")  # Find all <img> tags
    for img in img_tags:
        alt_text = img.get("alt")  # Get the 'alt' attribute
        src = img.get("src")
        if alt_text not in excluded_alts:  # Skip alt texts on the exclusion list
            results.append({"alt": alt_text, "url": src})
    return results


#### Get all posts from WordPress
json_filepath = "blog_posts.json"
all_posts = getDictFromFile(json_filepath)

for i, blog_post_data in all_posts.items():
    post_data = {}
    url = blog_post_data["url"]
    slug = blog_post_data.get("slug", "")
    # print(slug, count_words_in_slug(slug))

    # Skip entries that already carry a multi-word slug.
    if slug != "" and count_words_in_slug(slug) > 2:
        continue

    slug_from_url = urlparse(url).path.strip("/").split("/")[-1]

    if slug_from_url != "" and count_words_in_slug(slug_from_url) > 2:
        # The slug may belong to either a post or a page; try both endpoints.
        post = None
        for post_type in ["posts", "pages"]:
            api_url = f"{site_url}/wp-json/wp/v2/{post_type}?slug={slug_from_url}"
            response = requests.get(api_url, auth=HTTPBasicAuth(username, app_password))
            data = response.json()
            if isinstance(data, list) and len(data) > 0:
                post = data[0]
                break

        if post is None:
            post_data = {
                "url": url,
                "slug": "No-Post-Found",
            }
        elif post["status"] == "publish":
            title = post["title"]["rendered"]
            slug_from_url = post["slug"]
            meta_description = ""
            if "excerpt" in post and "rendered" in post["excerpt"]:
                # Basic extraction from the excerpt - might need a more
                # sophisticated method (str.strip removes a character set,
                # not the literal "<p>"/"</p>" substrings).
                meta_description = (
                    post["excerpt"]["rendered"].strip("<p>").strip("</p>\n")
                )

").strip("

\n") ) content = post["content"]["rendered"] images = [] if content: images = extract_image_alt_text(content) image_alt_text = "" if "featured_media" in post and post["featured_media"] != 0: media_api_url = ( f"{site_url}/wp-json/wp/v2/media/{post['featured_media']}" ) media_response = requests.get( api_url, auth=HTTPBasicAuth(username, app_password) ) if media_response.status_code == 200: media_data = media_response.json() if "alt_text" in media_data: image_alt_text = media_data["alt_text"] # print(media_api_url) post_data = { "published_date": post.get("date_gmt", ""), "url": url, "title": title, "meta_description": meta_description, "slug": slug_from_url, "images": images, "blog_id": post["id"], "content": content, "categories": post.get("categories", []), } else: post_data = { "url": url, "slug": "Post-NotPublished", } else: post_data = { "url": url, "slug": "No-Slug-Found", } if count_words_in_slug(slug_from_url) > 1: print(i, url, slug_from_url) all_posts[i] = post_data if int(i) % 50 == 0: saveJsonDataToFile(json_filepath, all_posts) ## TODO:- Remove the break to fetch all blogs # break