import requests
from requests.auth import HTTPBasicAuth
import json
from urllib.parse import urlparse
from bs4 import BeautifulSoup
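# Audit script: loads a local index of blog posts (blog_posts.json), resolves
# each entry against the WordPress REST API by slug, and writes the post's
# title, meta description, image alt texts, and related fields back into the
# same JSON file.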
# Replace these with your actual details (shown here as placeholders --
# never commit a real WordPress Application Password to source control)
site_url = "https://www.farmonaut.com"
username = "your_username"
app_password = "xxxx xxxx xxxx xxxx xxxx xxxx"
def getDictFromFile(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.loads(f.read())


def saveJsonDataToFile(path, data):
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
def count_words_in_slug(slug):
    # Slugs use underscores or hyphens as word separators; underscores
    # take precedence if both appear.
    if "_" in slug:
        return len(slug.split("_"))
    elif "-" in slug:
        return len(slug.split("-"))
    else:
        return 1 if slug else 0
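# Worked examples (illustrative inputs, not from the dataset):
# count_words_in_slug("satellite-crop-monitoring") -> 3
# count_words_in_slug("about") -> 1
# count_words_in_slug("") -> 0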
def extract_image_alt_text(content):
    """
    Extracts the alt text and source URL of every image tag in the given
    HTML content.

    Args:
        content (str): The HTML content to parse.

    Returns:
        list: A list of dicts of the form {"alt": ..., "url": ...}, one per
            image. Returns an empty list if no images are found.
    """
    excluded_alts = {
        # "Farmonaut Web App",
        # "Farmonaut Android App",
        # "Farmonaut iOS App",
        # "Satellite Imagery",
        # "pinterest",
        # "tumblr",
    }
    results = []
    soup = BeautifulSoup(content, "html.parser")  # Parse the HTML
    img_tags = soup.find_all("img")  # Find all <img> tags
    for img in img_tags:
        alt_text = img.get("alt")  # The 'alt' attribute, or None if missing
        src = img.get("src")
        if alt_text not in excluded_alts:  # Skip alt texts on the exclusion list
            results.append({"alt": alt_text, "url": src})
    return results
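# Quick sanity check (sample HTML made up for illustration):
# extract_image_alt_text('<img src="a.png" alt="Satellite Imagery">')
# -> [{"alt": "Satellite Imagery", "url": "a.png"}]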
#### Get all posts from the WordPress site
json_filepath = "blog_posts.json"
all_posts = getDictFromFile(json_filepath)
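# Assumed shape of blog_posts.json, inferred from the fields used below:
# { "<index>": {"url": "https://www.farmonaut.com/.../post-slug/", "slug": "..."}, ... }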
for i, blog_post_data in all_posts.items():
    post_data = {}
    url = blog_post_data["url"]
    slug = blog_post_data.get("slug", "")
    # print(slug, count_words_in_slug(slug))
    if slug != "" and count_words_in_slug(slug) > 2:
        continue  # Stored slug already has more than two words; skip it
    slug_from_url = urlparse(url).path.strip("/").split("/")[-1]
    if slug_from_url != "" and count_words_in_slug(slug_from_url) > 2:
        post = None
        # Try the posts endpoint first, then fall back to pages
        for post_type in ["posts", "pages"]:
            api_url = f"{site_url}/wp-json/wp/v2/{post_type}?slug={slug_from_url}"
            response = requests.get(api_url, auth=HTTPBasicAuth(username, app_password))
            data = response.json()
            if isinstance(data, list) and len(data) > 0:
                post = data[0]
                break
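        # The REST API returns a JSON list of matches, e.g.
        # [{"id": 123, "slug": "...", "status": "publish", "title": {...}, ...}];
        # an empty list means neither endpoint knows this slug.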
        if post is None:
            post_data = {
                "url": url,
                "slug": "No-Post-Found",
            }
        elif post["status"] == "publish":
            title = post["title"]["rendered"]
            slug_from_url = post["slug"]
            meta_description = ""
            if "excerpt" in post and "rendered" in post["excerpt"]:
                # Basic extraction from excerpt - might need a more sophisticated method
                meta_description = (
                    post["excerpt"]["rendered"].strip("<p>").strip("</p>\n")
                )
            content = post["content"]["rendered"]
            images = []
            if content:
                images = extract_image_alt_text(content)
            # Fetch the featured image's alt text from the media endpoint
            # (currently collected but not stored in post_data)
            image_alt_text = ""
            if "featured_media" in post and post["featured_media"] != 0:
                media_api_url = (
                    f"{site_url}/wp-json/wp/v2/media/{post['featured_media']}"
                )
                media_response = requests.get(
                    media_api_url, auth=HTTPBasicAuth(username, app_password)
                )
                if media_response.status_code == 200:
                    media_data = media_response.json()
                    if "alt_text" in media_data:
                        image_alt_text = media_data["alt_text"]
                # print(media_api_url)
            post_data = {
                "published_date": post.get("date_gmt", ""),
                "url": url,
                "title": title,
                "meta_description": meta_description,
                "slug": slug_from_url,
                "images": images,
                "blog_id": post["id"],
                "content": content,
                "categories": post.get("categories", []),
            }
        else:
            post_data = {
                "url": url,
                "slug": "Post-NotPublished",
            }
    else:
        post_data = {
            "url": url,
            "slug": "No-Slug-Found",
        }
    if count_words_in_slug(slug_from_url) > 1:
        print(i, url, slug_from_url)
    all_posts[i] = post_data
    if int(i) % 50 == 0:
        saveJsonDataToFile(json_filepath, all_posts)  # Periodic checkpoint
    ## TODO:- Remove the break to fetch all blogs
    # break

# Save any entries processed after the last periodic checkpoint
saveJsonDataToFile(json_filepath, all_posts)