import requests
from requests.auth import HTTPBasicAuth
import json
from urllib.parse import urlparse
from bs4 import BeautifulSoup


# Replace these with your actual details
site_url = "https://www.farmonaut.com"
username = "ankuromar296"
app_password = "Tjat A2hz 9XMv pXJi YbV0 GR8o"


def getDictFromFile(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.loads(f.read())


def saveJsonDataToFile(path, data):
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


def count_words_in_slug(slug):
    if "_" in slug:
        return len(slug.split("_"))
    elif "-" in slug:
        return len(slug.split("-"))
    else:
        return 1 if slug else 0


def extract_image_alt_text(content):
    """
    Extracts the alt text from all image tags in the given HTML content.

    Args:
        content (str): The HTML content to parse.

    Returns:
        list: A list of strings, where each string is the alt text of an image.
                Returns an empty list if no images are found.
    """
    excluded_alts = {
        # "Farmonaut Web App",
        # "Farmonaut Android App",
        # "Farmonaut iOS App",
        # "Satellite Imagery",
        # "pinterest",
        # "tumblr",
    }
    results = []

    soup = BeautifulSoup(content, "html.parser")  # Use BeautifulSoup to parse the HTML
    img_tags = soup.find_all("img")  # Find all <img> tags

    for img in img_tags:
        alt_text = img.get("alt")  # Get the 'alt' attribute
        src = img.get("src")

        if alt_text not in excluded_alts:  # Check if alt attribute exists
            results.append({"alt": alt_text, "url": src})
    return results


#### get all posts from wordpress press
json_filepath = "blog_posts.json"
all_posts = getDictFromFile(json_filepath)


for i, blog_post_data in all_posts.items():
    post_data = {}
    url = blog_post_data["url"]

    slug = blog_post_data.get("slug", "")

    # print(slug, count_words_in_slug(slug))

    if slug != "" and count_words_in_slug(slug) > 2:
        continue

    slug_from_url = urlparse(url).path.strip("/").split("/")[-1]

    if slug_from_url != "" and count_words_in_slug(slug_from_url) > 2:

        post = None
        for post_type in ["posts", "pages"]:
            api_url = f"{site_url}/wp-json/wp/v2/{post_type}?slug={slug_from_url}"
            response = requests.get(api_url, auth=HTTPBasicAuth(username, app_password))
            data = response.json()
            if isinstance(data, list) and len(data) > 0:
                post = data[0]
                break

        if post is None:
            post_data = {
                "url": url,
                "slug": "No-Post-Found",
            }

        elif post and post["status"] == "publish":
            title = post["title"]["rendered"]
            slug_from_url = post["slug"]

            meta_description = ""
            if "excerpt" in post and "rendered" in post["excerpt"]:
                # Basic extraction from excerpt - might need more sophisticated method
                meta_description = (
                    post["excerpt"]["rendered"].strip("<p>").strip("</p>\n")
                )

            content = post["content"]["rendered"]
            images = []
            if content:
                images = extract_image_alt_text(content)

            image_alt_text = ""
            if "featured_media" in post and post["featured_media"] != 0:
                media_api_url = (
                    f"{site_url}/wp-json/wp/v2/media/{post['featured_media']}"
                )
                media_response = requests.get(
                    api_url, auth=HTTPBasicAuth(username, app_password)
                )
                if media_response.status_code == 200:
                    media_data = media_response.json()
                    if "alt_text" in media_data:
                        image_alt_text = media_data["alt_text"]

                # print(media_api_url)

            post_data = {
                "published_date": post.get("date_gmt", ""),
                "url": url,
                "title": title,
                "meta_description": meta_description,
                "slug": slug_from_url,
                "images": images,
                "blog_id": post["id"],
                "content": content,
                "categories": post.get("categories", []),
            }

        else:
            post_data = {
                "url": url,
                "slug": "Post-NotPublished",
            }

    else:
        post_data = {
            "url": url,
            "slug": "No-Slug-Found",
        }

    if count_words_in_slug(slug_from_url) > 1:
        print(i, url, slug_from_url)

    all_posts[i] = post_data

    if int(i) % 50 == 0:
        saveJsonDataToFile(json_filepath, all_posts)
        ## TODO:- Remove the break to fetch all blogs
        # break