U
    ÈöfÍ  ã                   @   sÐ   d dl Z d dlmZ d dlmZ d dlZd dlZd dlZz
ejZ	W n e
k
rV   Y nX e	e_d dlZd dlZe jj e jjjj¡ dd„ Zdd„ Zeƒ  d dlmZ d d	lmZ d
d„ Zdd„ Zddd„ZdS )é    N)ÚBeautifulSoup)ÚCounterc                 C   sø   t | ƒ t | ¡D ]à}tj | |¡}tj |¡rt |¡D ]¶}t |ƒ | d¡r:tj ||¡}t|dƒ‚}zVt 	|¡}| 
dd¡}| 
dd¡}t|dƒ}	g }
|	D ]\}}|
 |¡ q t ||
ƒ W n$ tjk
rä   t d|› ƒ Y nX W 5 Q R X q:qd S )	Nz.jsonÚrÚslugzNo slug foundÚcontentzNo content foundé   zError reading JSON from )ÚprintÚosÚlistdirÚpathÚjoinÚisdirÚendswithÚopenÚjsonÚloadÚgetÚextract_keywordsÚappendÚJSONDecodeError)Zroot_folderÚfolder_nameÚfolder_pathÚ	file_nameÚ	file_pathÚfileÚdatar   r   ÚkeywordsZkeywords_arrÚkeywordÚ	frequency© r   ú>/home/ankuromar296_gmail_com/publish_blogs_ai/gsc/kw_search.pyÚread_json_files   s(    


r!   c               
   C   s’   z"t  d¡ t  d¡ t  d¡ W nj tk
rŒ }  zLtd| › ƒ tdƒ tdƒ tdƒ tdƒ td	ƒ td
ƒ t d¡ W 5 d} ~ X Y nX dS )z;Download required NLTK data, handling potential SSL issues.ZpunktÚ	stopwordsZ	punkt_tabzError downloading NLTK data: z0Please download the required NLTK data manually:z1. Open a Python consolez2. Run the following commands:z   import nltkz   nltk.download('punkt')z   nltk.download('stopwords')é   N)ÚnltkÚdownloadÚ	Exceptionr   ÚsysÚexit)Úer   r   r    Údownload_nltk_data1   s    

r*   )r"   )Úword_tokenizec              
   C   s„   zFt j| dd}| ¡  t|jdƒ}|ddgƒD ]}| ¡  q0| ¡ W S  t jk
r~ } ztd|› ƒ W Y ¢dS d}~X Y nX dS )z&Extract text content from a given URL.F)Úverifyzhtml.parserÚscriptÚstylezError fetching URL: N)	Úrequestsr   Úraise_for_statusr   ÚtextÚ	decomposeÚget_textÚRequestExceptionr   )ÚurlÚresponseÚsoupr-   r)   r   r   r    Úextract_text_from_urlF   s    

r8   c                    s,   t t d¡ƒ‰ t|  ¡ ƒ}‡ fdd„|D ƒS )z,Tokenize and remove stopwords from the text.Úenglishc                    s    g | ]}|  ¡ r|ˆ kr|‘qS r   )Úisalnum)Ú.0Útoken©Ú
stop_wordsr   r    Ú
<listcomp>Z   s       z#preprocess_text.<locals>.<listcomp>)Úsetr"   Úwordsr+   Úlower)r1   Útokensr   r=   r    Úpreprocess_textV   s    rD   é
   c                 C   s   t | ƒ}t|ƒ}| |¡S )z)Extract top keywords from the given text.)rD   r   Úmost_common)r1   Únum_keywordsrC   Zkeyword_freqr   r   r    r   \   s    r   )rE   )r/   Úbs4r   Úcollectionsr   r$   Ússlr'   Ú_create_unverified_contextZ _create_unverified_https_contextÚAttributeErrorÚ_create_default_https_contextr	   r   ÚpackagesÚurllib3Údisable_warningsÚ
exceptionsÚInsecureRequestWarningr!   r*   Znltk.corpusr"   Znltk.tokenizer+   r8   rD   r   r   r   r   r    Ú<module>   s*   
