from openai import OpenAI
import anthropic
from tqdm import tqdm
from urllib.parse import urlparse
from requests.auth import HTTPBasicAuth
import mimetypes
import ast
from PIL import Image
import re
import unicodedata
from eventregistry import *
import json
from fuzzywuzzy import fuzz
from datetime import datetime, timedelta
import time


def get_news_articles(er, date_start, date_end, geography, lang, max_items=100):
    """
    Fetch news articles based on location and date range, filtering out similar titles.

    Args:
        er (EventRegistry): An authenticated EventRegistry client.
        date_start (str): The start date for fetching articles (format: YYYY-MM-DD).
        date_end (str): The end date for fetching articles (format: YYYY-MM-DD).
        geography (str): The URI of the location (e.g., a country URI).
        lang (str): Language filter; "en" restricts results to English-language articles.
        max_items (int): Maximum number of articles to fetch. Default is 100.

    Returns:
        list: A list of unique or sufficiently distinct articles fetched based on the query.
    """
    # Define the query with input parameters
    query = {
        "$query": {
            "$and": [
                {
                    "$or": [
                        {"conceptUri": "http://en.wikipedia.org/wiki/Soil"},
                        {"conceptUri": "http://en.wikipedia.org/wiki/Agriculture"},
                        {"conceptUri": "http://en.wikipedia.org/wiki/Farming"},
                        {"conceptUri": "http://en.wikipedia.org/wiki/Forestry"},
                        {"conceptUri": "http://en.wikipedia.org/wiki/Agribusiness"},
                    ]
                },
                {"locationUri": geography},
                {
                    "dateStart": date_start,  # Use the input dateStart
                    "dateEnd": date_end,  # Use the input dateEnd
                },
            ]
        },
        "$filter": {
            "dataType": ["blog"],
            "isDuplicate": "skipDuplicates",
        },
        "articlesSortBy": "rel",
    }

    # Restrict results to English-language articles when requested
    if lang == "en":
        query["$query"]["$and"].append({"$or": [{"lang": "eng"}]})

    print(query)

    # Initialize query
    q = QueryArticlesIter.initWithComplexQuery(query)

    # List to store unique/sufficiently distinct articles
    articles_list = []
    seen_titles = []  # List to track seen titles (for fuzzy matching)

    # Fetch and append unique articles to the list
    for article in q.execQuery(er, maxItems=max_items):
        title = article.get("title", None)
        if title:
            # Check for duplicates or highly similar titles using fuzzy matching
            if not any(is_similar_title(title, seen_title) for seen_title in seen_titles):
                articles_list.append(article)
                seen_titles.append(title)  # Track the title for similarity comparison

    return articles_list


def is_similar_title(title1, title2, threshold=60):
    """
    Check if two titles are similar based on a similarity threshold.

    Args:
        title1 (str): First title.
        title2 (str): Second title.
        threshold (int): Similarity threshold (default 60).

    Returns:
        bool: True if titles are similar, False otherwise.
    """
    return fuzz.token_sort_ratio(title1, title2) >= threshold
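

# Minimal usage sketch for get_news_articles. The API key, location URI, and
# date range below are placeholder assumptions for illustration only; they are
# not values defined elsewhere in this script.
if __name__ == "__main__":
    er = EventRegistry(apiKey="YOUR_EVENTREGISTRY_API_KEY")  # placeholder key

    articles = get_news_articles(
        er,
        date_start="2024-01-01",
        date_end="2024-01-31",
        geography="http://en.wikipedia.org/wiki/Kenya",  # example location URI
        lang="en",
        max_items=50,
    )

    # Print the deduplicated article titles that survived fuzzy-title filtering
    for article in articles:
        print(article.get("title"))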