import json
import os
import ssl
import sys
from collections import Counter

import nltk
import requests
from bs4 import BeautifulSoup

# Fall back to an unverified HTTPS context so NLTK downloads work on
# machines whose certificate store is misconfigured.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Disable SSL warnings (requests is called with verify=False below).
requests.packages.urllib3.disable_warnings(
    requests.packages.urllib3.exceptions.InsecureRequestWarning
)


def download_nltk_data():
    """Download required NLTK data, handling potential SSL issues."""
    try:
        nltk.download('punkt')
        nltk.download('stopwords')
        nltk.download('punkt_tab')
    except Exception as e:
        print(f"Error downloading NLTK data: {e}")
        print("Please download the required NLTK data manually:")
        print("1. Open a Python console")
        print("2. Run the following commands:")
        print("   import nltk")
        print("   nltk.download('punkt')")
        print("   nltk.download('stopwords')")
        print("   nltk.download('punkt_tab')")
        sys.exit(1)


download_nltk_data()

# Imported after the download so the tokenizer and stopword data exist.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


def extract_text_from_url(url):
    """Extract text content from a given URL."""
    try:
        response = requests.get(url, verify=False)  # Disable SSL verification
        response.raise_for_status()  # Raise an exception for bad status codes
        soup = BeautifulSoup(response.text, 'html.parser')
        # Remove script and style elements
        for script in soup(["script", "style"]):
            script.decompose()
        return soup.get_text()
    except requests.RequestException as e:
        print(f"Error fetching URL: {e}")
        return None


def preprocess_text(text):
    """Tokenize and remove stopwords from the text."""
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    return [token for token in tokens if token.isalnum() and token not in stop_words]


def extract_keywords(text, num_keywords=10):
    """Extract the top keywords from the given text by frequency."""
    tokens = preprocess_text(text)
    keyword_freq = Counter(tokens)
    return keyword_freq.most_common(num_keywords)


def read_json_files(root_folder):
    """Print the slug and top keywords for every JSON post under root_folder."""
    print(root_folder)
    for folder_name in os.listdir(root_folder):
        folder_path = os.path.join(root_folder, folder_name)
        if not os.path.isdir(folder_path):
            continue
        for file_name in os.listdir(folder_path):
            print(file_name)
            if not file_name.endswith('.json'):
                continue
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, 'r') as file:
                try:
                    data = json.load(file)
                    slug = data.get('slug', 'No slug found')
                    content = data.get('content', 'No content found')
                    keywords = extract_keywords(content, 20)
                    keywords_arr = [keyword for keyword, frequency in keywords]
                    print(slug, keywords_arr)
                except json.JSONDecodeError:
                    print(f"Error reading JSON from {file_path}")


# def main(blog_url, num_keywords=10):
#     """Main function to extract top keywords from a blog post."""
#     text = extract_text_from_url(blog_url)
#     if text is None:
#         return
#     keywords = extract_keywords(text, num_keywords)
#     print(f"Top {num_keywords} keywords for the blog post:")
#     for keyword, frequency in keywords:
#         print(f"{keyword}: {frequency}")

# Usage
# root_folder = 'scraped_pages'
# read_json_files(root_folder)
# blog_url = "https://eos.com/blog/cotton-yield-estimation-in-texas-proof-of-concept/"
# num_keywords = 20
# main(blog_url, num_keywords)
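
# A minimal entry-point sketch wiring up the commented-out usage above. It
# assumes the 'scraped_pages' layout implied there: one subfolder per post,
# each holding JSON files with 'slug' and 'content' keys; adjust the path
# for your own data.
if __name__ == "__main__":
    root_folder = 'scraped_pages'
    if os.path.isdir(root_folder):
        read_json_files(root_folder)
    else:
        print(f"Folder not found: {root_folder}")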