a
    dG(bX                     @   s  d Z ddlZddlZddlZddlZddlmZ ddlm	Z	m
Z
 eeZedkrejdejd edd	ej ed
d e Zejdddd ejddd ejddedd ejddedd ejddedd ejddedddgd  ejd!d"edd ejd#d$ed%d ejd&d'edd ejd(d)edd ejd*d+ed, ejd-d.edddgd  ejd/d0edddgd  ejd1d2d e ZejdkrdZejsd3e_ndZejsd4e_e
ejZe	eejej ej!ej"ej#ejeej$ej%dej&d5Z'ej(r$ej(Z)e'j*j+e)ej,d6 nTej-d7d Z)e'.e)d8  ej,dkrde'j*j+e)d9 dd6 ne'j*j+e)d: d;d6 ej/rej/Z0e'/e0 ed<ej12ejd  dS )=a  
USAGE: %(program)s -train CORPUS -output VECTORS -size SIZE -window WINDOW
-cbow CBOW -sample SAMPLE -hs HS -negative NEGATIVE -threads THREADS -iter ITER
-min_count MIN-COUNT -alpha ALPHA -binary BINARY -accuracy FILE

Trains a neural embedding model on text file CORPUS.
Parameters essentially reproduce those used by the original C tool
(see https://code.google.com/archive/p/word2vec/).

Parameters for training:
        -train <file>
                Use text data from <file> to train the model
        -output <file>
                Use <file> to save the resulting word vectors / word clusters
        -size <int>
                Set size of word vectors; default is 100
        -window <int>
                Set max skip length between words; default is 5
        -sample <float>
                Set threshold for occurrence of words. Those that appear with higher frequency in the training data
                will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)
        -hs <int>
                Use Hierarchical Softmax; default is 0 (not used)
        -negative <int>
                Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)
        -threads <int>
                Use <int> threads (default 3)
        -iter <int>
                Run more training iterations (default 5)
        -min_count <int>
                This will discard words that appear less than <int> times; default is 5
        -alpha <float>
                Set the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW
        -binary <int>
                Save the resulting vectors in binary moded; default is 0 (off)
        -cbow <int>
                Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)
        -accuracy <file>
                Compute accuracy of the resulting model analogical inference power on questions file <file>
                See an example of questions file
                at https://code.google.com/p/word2vec/source/browse/trunk/questions-words.txt

Example: python -m gensim.scripts.word2vec_standalone -train data.txt          -output vec.txt -size 200 -sample 1e-4 -binary 0 -iter 3
    N)seterr)Word2VecLineSentence__main__z:%(asctime)s : %(threadName)s : %(levelname)s : %(message)s)formatlevelz
running %s raise)allz-trainz0Use text data from file TRAIN to train the modelT)helpZrequiredz-outputz2Use file OUTPUT to save the resulting word vectors)r   z-windowz6Set max skip length WINDOW between words; default is 5   )r   typedefaultz-sizez(Set size of word vectors; default is 100d   z-samplezSet threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)gMbP?z-hsz1Use Hierarchical Softmax; default is 0 (not used)   )r   r   r   choicesz	-negativezRNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)z-threadszUse THREADS threads (default 3)   z-iterz(Run more training iterations (default 5)z
-min_countzKThis will discard words that appear less than MIN_COUNT times; default is 5z-alphazPSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW)r   r   z-cbowzOUse the continuous bag of words model; default is 1 (use 0 for skip-gram model)z-binaryz=Save the resulting vectors in binary mode; default is 0 (off)z	-accuracyz6Use questions from file ACCURACY to evaluate the modelg?g?)Zvector_size	min_countZworkerswindowsamplealphaZsghsnegativeZ	cbow_meanZepochs)binary.z.modelz
.model.binz
.model.txtFzfinished running %s)3__doc__Zloggingos.pathossysargparseZnumpyr   Zgensim.models.word2vecr   r   Z	getLogger__name__ZloggerZbasicConfigINFOinfojoinargvArgumentParserparseradd_argumentintfloat
parse_argsargsZcbowZskipgramr   ZtrainZcorpussizer   Zthreadsr   r   r   r   iterZmodeloutputZoutfileZwvZsave_word2vec_formatr   splitZsaveZaccuracyZquestions_filepathbasename r2   r2   Alib/python3.9/site-packages/gensim/scripts/word2vec_standalone.py<module>   s   /







