from openai import OpenAI
import anthropic
from tqdm import tqdm
from urllib.parse import urlparse
from requests.auth import HTTPBasicAuth
import mimetypes
import ast
from PIL import Image
import re
import unicodedata
from eventregistry import *
import json
from fuzzywuzzy import fuzz
from datetime import datetime, timedelta
import time


def get_news_articles(er, date_start, date_end, geography, lang, max_items=100):
    """
    Fetch news articles based on location and date range, filtering out similar titles.

    Args:
        er (EventRegistry): An authenticated EventRegistry client.
        date_start (str): The start date for fetching articles (format: YYYY-MM-DD).
        date_end (str): The end date for fetching articles (format: YYYY-MM-DD).
        geography (str): The URI of the location (e.g., a country URI).
        lang (str): Language filter; "en" restricts results to English-language articles.
        max_items (int): Maximum number of articles to fetch. Default is 100.

    Returns:
        list: A list of unique or sufficiently distinct articles fetched based on the query.
    """
    # Define the query with input parameters
    query = {
        "$query": {
            "$and": [
                {
                    "$or": [
                        {"conceptUri": "http://en.wikipedia.org/wiki/Soil"},
                        {"conceptUri": "http://en.wikipedia.org/wiki/Agriculture"},
                        {"conceptUri": "http://en.wikipedia.org/wiki/Farming"},
                        {"conceptUri": "http://en.wikipedia.org/wiki/Forestry"},
                        {"conceptUri": "http://en.wikipedia.org/wiki/Agribusiness"},
                    ]
                },
                {"locationUri": geography},
                {
                    "dateStart": date_start,  # Use the input dateStart
                    "dateEnd": date_end,  # Use the input dateEnd
                },
            ]
        },
        "$filter": {
            "dataType": ["blog"],
            "isDuplicate": "skipDuplicates",
        },
        "articlesSortBy": "rel",
    }

    # Restrict results to English-language articles when requested
    if lang == "en":
        query["$query"]["$and"].append({"$or": [{"lang": "eng"}]})

    print(query)

    # Initialize query
    q = QueryArticlesIter.initWithComplexQuery(query)

    # List to store unique/sufficiently distinct articles
    articles_list = []
    seen_titles = []  # List to track seen titles (for fuzzy matching)

    # Fetch and append unique articles to the list
    for article in q.execQuery(er, maxItems=max_items):
        title = article.get("title", None)
        if title:
            # Check for duplicates or highly similar titles using fuzzy matching
            if not any(is_similar_title(title, seen_title) for seen_title in seen_titles):
                articles_list.append(article)
                seen_titles.append(title)  # Track the title for similarity comparison

    return articles_list


def is_similar_title(title1, title2, threshold=60):
    """
    Check if two titles are similar based on a similarity threshold.

    Args:
        title1 (str): First title.
        title2 (str): Second title.
        threshold (int): Similarity threshold (default 60).

    Returns:
        bool: True if titles are similar, False otherwise.
    """
    return fuzz.token_sort_ratio(title1, title2) >= threshold
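

# Minimal usage sketch for get_news_articles. The API key, location URI, and
# date range below are placeholder assumptions for illustration only; they are
# not values defined elsewhere in this script.
if __name__ == "__main__":
    er = EventRegistry(apiKey="YOUR_EVENTREGISTRY_API_KEY")  # placeholder key

    articles = get_news_articles(
        er,
        date_start="2024-01-01",
        date_end="2024-01-31",
        geography="http://en.wikipedia.org/wiki/Kenya",  # example location URI
        lang="en",
        max_items=50,
    )

    # Print the deduplicated article titles that survived fuzzy-title filtering
    for article in articles:
        print(article.get("title"))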