a
    dG(bP                     @   s   d Z ddlZddlZddlZddlZddlZddlZddlm	Z	m
Z
mZ eeZdZzddlZW n eyt   Y n0 G dd de
jZdd Zd	d
 ZG dd de	jZG dd de	jZG dd de	jZG dd de	jZG dd de	jZdS )a|  Compute similarities across a collection of documents in the Vector Space Model.

The main class is :class:`~gensim.similarities.docsim.Similarity`, which builds an index for a given set of documents.

Once the index is built, you can perform efficient queries like "Tell me how similar is this query document to each
document in the index?". The result is a vector of numbers as large as the size of the initial set of documents,
that is, one float for each index document. Alternatively, you can also request only the top-N most
similar index documents to the query.


How It Works
------------
The :class:`~gensim.similarities.docsim.Similarity` class splits the index into several smaller sub-indexes ("shards"),
which are disk-based. If your entire index fits in memory (~one million documents per 1GB of RAM),
you can also use the :class:`~gensim.similarities.docsim.MatrixSimilarity`
or :class:`~gensim.similarities.docsim.SparseMatrixSimilarity` classes directly.
These are more simple but do not scale as well: they keep the entire index in RAM, no sharding. They also do not
support adding new document to the index dynamically.

Once the index has been initialized, you can query for document similarity simply by

.. sourcecode:: pycon

    >>> from gensim.similarities import Similarity
    >>> from gensim.test.utils import common_corpus, common_dictionary, get_tmpfile
    >>>
    >>> index_tmpfile = get_tmpfile("index")
    >>> query = [(1, 2), (6, 1), (7, 2)]
    >>>
    >>> index = Similarity(index_tmpfile, common_corpus, num_features=len(common_dictionary))  # build the index
    >>> similarities = index[query]  # get similarities between the query and all index documents

If you have more query documents, you can submit them all at once, in a batch

.. sourcecode:: pycon

    >>> from gensim.test.utils import common_corpus, common_dictionary, get_tmpfile
    >>>
    >>> index_tmpfile = get_tmpfile("index")
    >>> batch_of_documents = common_corpus[:]  # only as example
    >>> index = Similarity(index_tmpfile, common_corpus, num_features=len(common_dictionary))  # build the index
    >>>
    >>> # the batch is simply an iterable of documents, aka gensim corpus:
    >>> for similarities in index[batch_of_documents]:
    ...     pass

The benefit of this batch (aka "chunked") querying is a much better performance.
To see the speed-up on your machine, run ``python -m gensim.test.simspeed``
(compare to my results `here <http://groups.google.com/group/gensim/msg/4f6f171a869e4fca?>`_).

There is also a special syntax for when you need similarity of documents in the index
to the index itself (i.e. queries = the indexed documents themselves). This special syntax
uses the faster, batch queries internally and **is ideal for all-vs-all pairwise similarities**:

.. sourcecode:: pycon

    >>> from gensim.test.utils import common_corpus, common_dictionary, get_tmpfile
    >>>
    >>> index_tmpfile = get_tmpfile("index")
    >>> index = Similarity(index_tmpfile, common_corpus, num_features=len(common_dictionary))  # build the index
    >>>
    >>> for similarities in index:  # yield similarities of the 1st indexed document, then 2nd...
    ...     pass

    N)
interfacesutilsmatutilsFc                   @   sP   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dd ZdS )Sharda7  A proxy that represents a single shard instance within :class:`~gensim.similarity.docsim.Similarity` index.

    Basically just wraps :class:`~gensim.similarities.docsim.MatrixSimilarity`,
    :class:`~gensim.similarities.docsim.SparseMatrixSimilarity`, etc, so that it mmaps from disk on request (query).

    c                 C   sR   t j|\| _| _t|| _|j| _t	
d|   ||   |  | _dS )z

        Parameters
        ----------
        fname : str
            Path to top-level directory (file) to traverse for corpus documents.
        index : :class:`~gensim.interfaces.SimilarityABC`
            Index object.

        zsaving index shard to %sN)ospathsplitdirnamefnamelenlength	__class__clsloggerinfofullnamesave	get_indexindex)selfr
   r    r   9lib/python3.9/site-packages/gensim/similarities/docsim.py__init__e   s    
zShard.__init__c                 C   s   t j| j| jS )zuGet full path to shard file.

        Return
        ------
        str
            Path to shard instance.

        )r   r   joinr	   r
   r   r   r   r   r   w   s    	zShard.fullnamec                 C   s   | j S )zGet length.)r   r   r   r   r   __len__   s    zShard.__len__c                 C   s   | j  }d|v r|d= |S )zSpecial handler for pickle.

        Returns
        -------
        dict
            Object that contains state of current instance without `index`.

        r   )__dict__copy)r   resultr   r   r   __getstate__   s    	
zShard.__getstate__c                 C   s   d| j jt| |  f S )Nz%s Shard(%i documents in %s))r   __name__r   r   r   r   r   r   __str__   s    zShard.__str__c                 C   s6   t | ds0td|   | jj|  dd| _| jS )zLoad & get index.

        Returns
        -------
        :class:`~gensim.interfaces.SimilarityABC`
            Index instance.

        r   zmmaping index from %sr)Zmmap)hasattrr   debugr   r   loadr   r   r   r   r   r      s    	
zShard.get_indexc                 C   s0   d|  krt | k s"n J d|  j| S )a%  Get index vector at position `pos`.

        Parameters
        ----------
        pos : int
            Vector position.

        Return
        ------
        {:class:`scipy.sparse.csr_matrix`, :class:`numpy.ndarray`}
            Index vector. Type depends on underlying index.

        Notes
        -----
        The vector is of the same type as the underlying index (ie., dense for
        :class:`~gensim.similarities.docsim.MatrixSimilarity`
        and scipy.sparse for :class:`~gensim.similarities.docsim.SparseMatrixSimilarity`.

        r   zrequested position out of range)r   r   r   )r   posr   r   r   get_document_id   s    "zShard.get_document_idc                 C   s@   |   }z| j|_| j|_W n ty6   tdY n0 || S )a*  Get similarities of document (or corpus) `query` to all documents in the corpus.

        Parameters
        ----------
        query : {iterable of list of (int, number) , list of (int, number))}
            Document or corpus.

        Returns
        -------
        :class:`numpy.ndarray`
            Similarities of document/corpus if index is :class:`~gensim.similarities.docsim.MatrixSimilarity` **or**
        :class:`scipy.sparse.csr_matrix`
            for case if index is :class:`~gensim.similarities.docsim.SparseMatrixSimilarity`.

        zJnum_best and normalize have to be set before querying a proxy Shard object)r   num_best	normalize	Exception
ValueError)r   queryr   r   r   r   __getitem__   s    zShard.__getitem__N)r    
__module____qualname____doc__r   r   r   r   r!   r   r'   r-   r   r   r   r   r   ^   s   r   c                 C   s<   | \}}t d||jt  || }t d|t  |S )aq  Helper for request query from shard, same as shard[query].

    Parameters
    ---------
    args : (list of (int, number), :class:`~gensim.interfaces.SimilarityABC`)
        Query and Shard instances

    Returns
    -------
    :class:`numpy.ndarray` or :class:`scipy.sparse.csr_matrix`
        Similarities of the query against documents indexed in this shard.

    z+querying shard %s num_best=%s in process %sz(finished querying shard %s in process %s)r   r$   r(   r   getpid)argsr,   shardr   r   r   r   query_shard   s
    r4   c                 C   s   t j| tj| dd dS )a(  Helper for extracting n documents with maximum similarity.

    Parameters
    ----------
    n : int
        Number of elements to be extracted
    iterable : iterable of list of (int, float)
        Iterable containing documents with computed similarities

    Returns
    -------
    :class:`list`
        List with the n largest elements from the dataset defined by iterable.

    Notes
    -----
    Elements are compared by the absolute value of similarity, because negative value of similarity
    does not mean some form of dissimilarity.

    c                 S   s   t | d S )N   )abs)itemr   r   r   <lambda>       z_nlargest.<locals>.<lambda>)key)heapqnlargest	itertoolschain)niterabler   r   r   	_nlargest   s    rA   c                       s   e Zd ZdZd&ddZdd	 Zd
d Zdd Zdd Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zd'ddZd d! Zd( fd"d#	Zd$d% Z  ZS ))
Similarityag  Compute cosine similarity of a dynamic query against a corpus of documents ('the index').

    The index supports adding new documents dynamically.

    Notes
    -----
    Scalability is achieved by sharding the index into smaller pieces, each of which fits into core memory
    The shards themselves are simply stored as files to disk and mmap'ed back as needed.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.corpora.textcorpus import TextCorpus
        >>> from gensim.test.utils import datapath, get_tmpfile
        >>> from gensim.similarities import Similarity
        >>>
        >>> corpus = TextCorpus(datapath('testcorpus.mm'))
        >>> index_temp = get_tmpfile("index")
        >>> index = Similarity(index_temp, corpus, num_features=400)  # create index
        >>>
        >>> query = next(iter(corpus))
        >>> result = index[query]  # search similar to `query` in index
        >>>
        >>> for sims in index[corpus]:  # if you have more query documents, you can submit them all at once, in a batch
        ...     pass
        >>>
        >>> # There is also a special syntax for when you need similarity of documents in the index
        >>> # to the index itself (i.e. queries=indexed documents themselves). This special syntax
        >>> # uses the faster, batch queries internally and **is ideal for all-vs-all pairwise similarities**:
        >>> for similarities in index:  # yield similarities of the 1st indexed document, then 2nd...
        ...     pass

    See Also
    --------
    :class:`~gensim.similarities.docsim.MatrixSimilarity`
        Index similarity (dense with cosine distance).
    :class:`~gensim.similarities.docsim.SparseMatrixSimilarity`
        Index similarity (sparse with cosine distance).
    :class:`~gensim.similarities.docsim.WmdSimilarity`
        Index similarity (with word-mover distance).

    N      l2c                 C   sx   |du rt jdd| _n|| _td| j || _|| _|| _t|| _	|| _
g | _g d | _| _|durt| | dS )a  

        Parameters
        ----------
        output_prefix : str
            Prefix for shard filename. If None, a random filename in temp will be used.
        corpus : iterable of list of (int, number)
            Corpus in streamed Gensim bag-of-words format.
        num_features : int
            Size of the dictionary (number of features).
        num_best : int, optional
            If set, return only the `num_best` most similar documents, always leaving out documents with similarity = 0.
            Otherwise, return a full vector with one float for every document in the index.
        chunksize : int, optional
            Size of query chunks. Used internally when the query is an entire corpus.
        shardsize : int, optional
            Maximum shard size, in documents. Choose a value so that a `shardsize x chunksize` matrix of floats fits
            comfortably into your RAM.
        norm : {'l1', 'l2'}, optional
            Normalization to use.

        Notes
        -----
        Documents are split (internally, transparently) into shards of `shardsize` documents each, and each shard
        converted to a matrix, for faster BLAS calls. Each shard is stored to disk under `output_prefix.shard_number`.

        If you don't specify an output prefix, a random filename in temp will be used.

        If your entire index fits in memory (~1 million documents per 1GB of RAM), you can also use the
        :class:`~gensim.similarities.docsim.MatrixSimilarity` or
        :class:`~gensim.similarities.docsim.SparseMatrixSimilarity` classes directly.
        These are more simple but do not scale as well (they keep the entire index in RAM, no sharding).
        They also do not support adding new document dynamically.

        NZ	simserver)prefixz"starting similarity index under %sr   )r   Z	randfnameoutput_prefixr   r   num_featuresr(   normint	chunksize	shardsizeshards
fresh_docs	fresh_nnzadd_documents)r   rG   corpusrH   r(   rK   rL   rI   r   r   r   r   0  s    $
zSimilarity.__init__c                 C   s   t | jtdd | jD  S )zGet length of index.c                 s   s   | ]}t |V  qd S Nr   .0r3   r   r   r   	<genexpr>g  r9   z%Similarity.__len__.<locals>.<genexpr>)r   rN   sumrM   r   r   r   r   r   e  s    zSimilarity.__len__c                 C   s   dt | t | j| jf S )NzASimilarity index with %i documents in %i shards (stored under %s))r   rM   rG   r   r   r   r   r!   i  s    zSimilarity.__str__c                 C   s  d}| j r*t| j d || j k r*|   |D ]}t|tjrHt|}n`tj	|r\|j
}nLt|}|d| j k rtt|g| jj| j}ntt|| j| j}| j| |  j|7  _t| j| jkr|   t| jd dkr.tdt| j q.dS )a  Extend the index with new documents.

        Parameters
        ----------
        corpus : iterable of list of (int, number)
            Corpus in BoW format.

        Notes
        -----
        Internally, documents are buffered and then spilled to disk when there's `self.shardsize` of them
        (or when a query is issued).

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora.textcorpus import TextCorpus
            >>> from gensim.test.utils import datapath, get_tmpfile
            >>> from gensim.similarities import Similarity
            >>>
            >>> corpus = TextCorpus(datapath('testcorpus.mm'))
            >>> index_temp = get_tmpfile("index")
            >>> index = Similarity(index_temp, corpus, num_features=400)  # create index
            >>>
            >>> one_more_corpus = TextCorpus(datapath('testcorpus.txt'))
            >>> index.add_documents(one_more_corpus)  # add more documents in corpus

              ?333333?'  r   zPROGRESS: fresh_shard size=%iN)rM   r   rL   reopen_shard
isinstancenumpyndarrayscipysparseissparseZnnzrH   r   unitvec
corpus2cscTrI   sparse2fullrN   appendrO   close_shardr   r   )r   rQ   Z	min_ratiodocZdoclenr   r   r   rP   n  s$    
zSimilarity.add_documentsc                 C   s,   | j drd| j |f S d| j |f S dS )zGet shard file by `shardid`.

        Parameters
        ----------
        shardid : int
            Shard index.

        Return
        ------
        str
            Path to shard file.

        .z%s%sz%s.%sN)rG   endswith)r   shardidr   r   r   shardid2filename  s    zSimilarity.shardid2filenamec                 C   s   | j s
dS t| j}dd| j t| j | j  k}|rTt| j | jt| j | jd}nt| j | jd}td|rrdnd| t	| 
||}| j|_| j|_| j| g d	 | _ | _dS )
a  Force the latest shard to close (be converted to a matrix and stored to disk).
         Do nothing if no new documents added since last call.

        Notes
        -----
        The shard is closed even if it is not full yet (its size is smaller than `self.shardsize`).
        If documents are added later via :meth:`~gensim.similarities.docsim.MatrixSimilarity.add_documents`
        this incomplete shard will be loaded again and completed.

        NrZ   rX   )	num_termsnum_docsnum_nnz)rH   zcreating %s shard #%sra   Zdenser   )rN   r   rM   rO   rH   SparseMatrixSimilarityMatrixSimilarityr   r   r   rm   r(   rp   rg   )r   rl   rb   r   r3   r   r   r   rh     s    
zSimilarity.close_shardc                 C   sd   | j s
J | jrtd| j d }| }tdt| t|j| _|j	| _
| j d= td dS )zReopen an incomplete shard.z3cannot reopen a shard with fresh documents in indexrY   z-reopening an incomplete shard of %i documentszreopen completeN)rM   rN   r+   r   r   r   r   listr   rp   rO   r$   )r   Z
last_shardZ
last_indexr   r   r   r\     s    

zSimilarity.reopen_shardc                 C   sp   t |gt| j | j}trZtdkrZtdt tt}|jt	|dt| jt  d}nd}t
t	|}||fS )ao  Apply shard[query] to each shard in `self.shards`. Used internally.

        Parameters
        ----------
        query : {iterable of list of (int, number) , list of (int, number))}
            Document in BoW format or corpus of documents.

        Returns
        -------
        (None, list of individual shard query results)
            Query results.

        r5   zspawning %i query processes)rK   N)zipr   rM   PARALLEL_SHARDSr   r$   multiprocessingZPoolZimapr4   map)r   r,   r2   poolr   r   r   r   query_shards  s    
 
zSimilarity.query_shardsc                    s@  |    | jD ]}| j|_| j|_q| |\}}| jdu rLtt|}nt	dgdd | jD  fdd t
|\}}|pt|do|jdko|jd dk}|sЇ fd	d
t|D }t| j|}n^g }t|D ]&\} fdd|D }|| qg }t| D ]}	t| j|	}
||
 q|r<|  |S )a  Get similarities of the document (or corpus) `query` to all documents in the corpus.

        Parameters
        ----------
        query : {iterable of list of (int, number) , list of (int, number))}
            A single document in bag-of-words format, or a corpus (iterable) of such documents.

        Return
        ------
        :class:`numpy.ndarray` or :class:`scipy.sparse.csr_matrix`
            Similarities of the query against this index.

        Notes
        -----
        If `query` is a corpus (iterable of documents), return a matrix of similarities of
        all query documents vs. all corpus document. This batch query is more efficient than computing the similarities
        one document after another.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora.textcorpus import TextCorpus
            >>> from gensim.test.utils import datapath
            >>> from gensim.similarities import Similarity
            >>>
            >>> corpus = TextCorpus(datapath('testcorpus.txt'))
            >>> index = Similarity('temp', corpus, num_features=400)
            >>> result = index[corpus]  # pairwise similarities of each document against each document

        Nr   c                 S   s   g | ]}t |qS r   rS   rT   r   r   r   
<listcomp>+  r9   z*Similarity.__getitem__.<locals>.<listcomp>c                    s    fdd|D S )Nc                    s    g | ]\}}|   |fqS r   r   )rU   Z	doc_indexsim)offsetsshard_nor   r   rz   .  r9   z;Similarity.__getitem__.<locals>.convert.<locals>.<listcomp>r   )r}   ri   )r|   )r}   r   convert-  s    z'Similarity.__getitem__.<locals>.convertndimr5   c                 3   s   | ]\}} ||V  qd S rR   r   )rU   r}   r   )r~   r   r   rV   4  r9   z)Similarity.__getitem__.<locals>.<genexpr>c                    s   g | ]} |qS r   r   )rU   ri   )r~   r}   r   r   rz   :  r9   )rh   rM   r(   rI   r)   ry   r^   Zhstackrs   Zcumsumr   	is_corpusr#   r   shape	enumeraterA   rg   rt   Z	terminate)r   r,   r3   rx   Zshard_resultsr   r   ZresultsZshard_resultpartsZmergedr   )r~   r|   r}   r   r-     s2     


&zSimilarity.__getitem__c                 C   st   |    d}| jD ]}|t|7 }||k r q0q| jrF|dk sF||krZtd|t| f ||| t| }|S )a  Get the indexed vector corresponding to the document at position `docpos`.

        Parameters
        ----------
        docpos : int
            Document position

        Return
        ------
        :class:`scipy.sparse.csr_matrix`
            Indexed vector.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora.textcorpus import TextCorpus
            >>> from gensim.test.utils import datapath
            >>> from gensim.similarities import Similarity
            >>>
            >>> # Create index:
            >>> corpus = TextCorpus(datapath('testcorpus.txt'))
            >>> index = Similarity('temp', corpus, num_features=400)
            >>> vector = index.vector_by_id(1)

        r   z3invalid document position: %s (must be 0 <= x < %s))rh   rM   r   r+   r'   )r   docposr&   r3   r   r   r   r   vector_by_idG  s    
zSimilarity.vector_by_idc                 C   s*   |  |}| jd }| _| | }|| _|S )a   Get similarity of a document specified by its index position `docpos`.

        Parameters
        ----------
        docpos : int
            Document position in the index.

        Return
        ------
        :class:`numpy.ndarray` or :class:`scipy.sparse.csr_matrix`
            Similarities of the given document against this index.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora.textcorpus import TextCorpus
            >>> from gensim.test.utils import datapath
            >>> from gensim.similarities import Similarity
            >>>
            >>> corpus = TextCorpus(datapath('testcorpus.txt'))
            >>> index = Similarity('temp', corpus, num_features=400)
            >>> similarities = index.similarity_by_id(1)

        F)r   rI   )r   r   r,   rI   r   r   r   r   similarity_by_idm  s
    
zSimilarity.similarity_by_idc                 c   sT   | j d }| _ |  D ]2}|jd dkr>| | D ]
}|V  q0q| | V  q|| _ dS )aj  For each index document in index, compute cosine similarity against all other documents in the index.
        Uses :meth:`~gensim.similarities.docsim.Similarity.iter_chunks` internally.

        Yields
        ------
        :class:`numpy.ndarray` or :class:`scipy.sparse.csr_matrix`
            Similarities of each document in turn against the index.

        Fr   r5   N)rI   iter_chunksr   )r   rI   chunkr{   r   r   r   __iter__  s    
zSimilarity.__iter__c                 c   sn   |    |du r| j}| jD ]L}| j}td|jd |D ]*}t|jd || }||| }|V  q<qdS )a  Iteratively yield the index as chunks of document vectors, each of size <= chunksize.

        Parameters
        ----------
        chunksize : int, optional
            Size of chunk,, if None - `self.chunksize` will be used.

        Yields
        ------
        :class:`numpy.ndarray` or :class:`scipy.sparse.csr_matrix`
            Chunks of the index as 2D arrays. The arrays are either dense or sparse, depending on
            whether the shard was storing dense or sparse vectors.

        Nr   )rh   rK   rM   r   r   ranger   min)r   rK   r3   r,   Zchunk_startZ	chunk_endr   r   r   r   r     s    

zSimilarity.iter_chunksc                 C   s$   t j| j}| jD ]
}||_qdS )z\Update shard locations, for case where the server prefix location changed on the filesystem.N)r   r   r	   rG   rM   )r   r	   r3   r   r   r   check_moved  s    
zSimilarity.check_movedc                    s8   |    |du r| j}tt| j|g|R i | dS )a  Save the index object via pickling under `fname`. See also :meth:`~gensim.docsim.Similarity.load()`.

        Parameters
        ----------
        fname : str, optional
            Path for save index, if not provided - will be saved to `self.output_prefix`.
        *args : object
            Arguments, see :meth:`gensim.utils.SaveLoad.save`.
        **kwargs : object
            Keyword arguments, see :meth:`gensim.utils.SaveLoad.save`.

        Notes
        -----
        Will call :meth:`~gensim.similarities.Similarity.close_shard` internally to spill
        any unfinished shards to disk first.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora.textcorpus import TextCorpus
            >>> from gensim.test.utils import datapath, get_tmpfile
            >>> from gensim.similarities import Similarity
            >>>
            >>> temp_fname = get_tmpfile("index")
            >>> output_fname = get_tmpfile("saved_index")
            >>>
            >>> corpus = TextCorpus(datapath('testcorpus.txt'))
            >>> index = Similarity(output_fname, corpus, num_features=400)
            >>>
            >>> index.save(output_fname)
            >>> loaded_index = index.load(output_fname)

        N)rh   rG   superrB   r   )r   r
   r2   kwargsr   r   r   r     s    #zSimilarity.savec                 C   s8   ddl }| | jd D ]}td| t| qdS )ua   Delete all files under self.output_prefix Index is not usable anymore after calling this method.r   N*zdeleting %s)globrG   r   r   r   remove)r   r   r
   r   r   r   destroy  s    zSimilarity.destroy)NrC   rD   rE   )N)N)r    r.   r/   r0   r   r   r!   rP   rm   rh   r\   ry   r-   r   r   r   r   r   r   r   __classcell__r   r   r   r   rB     s"   ,
53O& 
(rB   c                   @   s>   e Zd ZdZdejdddfddZdd Zdd	 Zd
d Z	dS )rr   a>  Compute cosine similarity against a corpus of documents by storing the index matrix in memory.

    Unless the entire matrix fits into main memory, use :class:`~gensim.similarities.docsim.Similarity` instead.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.test.utils import common_corpus, common_dictionary
        >>> from gensim.similarities import MatrixSimilarity
        >>>
        >>> query = [(1, 2), (5, 4)]
        >>> index = MatrixSimilarity(common_corpus, num_features=len(common_dictionary))
        >>> sims = index[query]

    NrC   c           	      C   s   |du r t d dt| }|| _|| _d| _|| _|du rHt|}|dur| jdkrbt	dt 
d|| tj||f|d| _t|D ]f\}}|d	 dkrt d
|| t|tjrn,tj|r|  }ntt||}|| j|< qdS )aR  

        Parameters
        ----------
        corpus : iterable of list of (int, number)
            Corpus in streamed Gensim bag-of-words format.
        num_best : int, optional
            If set, return only the `num_best` most similar documents, always leaving out documents with similarity = 0.
            Otherwise, return a full vector with one float for every document in the index.
        num_features : int
            Size of the dictionary (number of features).
        corpus_len : int, optional
            Number of documents in `corpus`. If not specified, will scan the corpus to determine the matrix size.
        chunksize : int, optional
            Size of query chunks. Used internally when the query is an entire corpus.
        dtype : numpy.dtype, optional
            Datatype to store the internal matrix in.

        Nz`scanning corpus to determine the number of features (consider setting `num_features` explicitly)r5   Tr   zzcannot index a corpus with zero features (you must specify either `num_features` or a non-empty corpus in the constructor)z1creating matrix with %i documents and %i features)r   dtypei  zPROGRESS: at document #%i/%i)r   Zwarningr   Z
get_max_idrH   r(   r)   rK   r   r+   r   r^   emptyr   r   r$   r]   r_   r`   ra   rb   toarrayflattenr   rc   rf   )	r   rQ   r(   r   rH   rK   Z
corpus_lenZdocnoZvectorr   r   r   r   	  s6    
zMatrixSimilarity.__init__c                 C   s   | j jd S )Nr   r   r   r   r   r   r   r   C  s    zMatrixSimilarity.__len__c                    s   t |\}}|r4tj fdd|D  jjd}nDtj|rJ|	 }nt
|tjrXnt| j}tj| jjd}t j|jj}|S )a  Get similarity between `query` and this index.

        Warnings
        --------
        Do not use this function directly, use the :class:`~gensim.similarities.docsim.MatrixSimilarity.__getitem__`
        instead.

        Parameters
        ----------
        query : {list of (int, number), iterable of list of (int, number), :class:`scipy.sparse.csr_matrix`}
            Document or collection of documents.

        Return
        ------
        :class:`numpy.ndarray`
            Similarity matrix.

        c                    s   g | ]}t | jqS r   )r   rf   rH   )rU   Zvecr   r   r   rz   \  r9   z5MatrixSimilarity.get_similarities.<locals>.<listcomp>r   )r   r   r^   asarrayr   r   r`   ra   rb   r   r]   r_   r   rf   rH   dotre   r   r,   r   r   r   r   r   get_similaritiesF  s    
z!MatrixSimilarity.get_similaritiesc                 C   s   d| j jt| | jjd f S N%s<%i docs, %i features>r5   )r   r    r   r   r   r   r   r   r   r!   n  s    zMatrixSimilarity.__str__)
r    r.   r/   r0   r^   float32r   r   r   r!   r   r   r   r   rr     s
   :(rr   c                   @   s2   e Zd ZdZdddZdd Zd	d
 Zdd ZdS )SoftCosineSimilaritya&  Compute soft cosine similarity against a corpus of documents by storing the index matrix in memory.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.test.utils import common_texts
        >>> from gensim.corpora import Dictionary
        >>> from gensim.models import Word2Vec
        >>> from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix
        >>> from gensim.similarities import WordEmbeddingSimilarityIndex
        >>>
        >>> model = Word2Vec(common_texts, vector_size=20, min_count=1)  # train word-vectors
        >>> termsim_index = WordEmbeddingSimilarityIndex(model.wv)
        >>> dictionary = Dictionary(common_texts)
        >>> bow_corpus = [dictionary.doc2bow(document) for document in common_texts]
        >>> similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)  # construct similarity matrix
        >>> docsim_index = SoftCosineSimilarity(bow_corpus, similarity_matrix, num_best=10)
        >>>
        >>> query = 'graph trees computer'.split()  # make a query
        >>> sims = docsim_index[dictionary.doc2bow(query)]  # calculate similarity of query to each doc from bow_corpus

    Check out `the Gallery <https://radimrehurek.com/gensim/auto_examples/tutorials/run_scm.html>`__
    for more examples.

    NrC   TTc                 C   s<   || _ t|| _|| _|| _|| _d| _tt	|| _
dS )a  

        Parameters
        ----------
        corpus: iterable of list of (int, float)
            A list of documents in the BoW format.
        similarity_matrix : :class:`gensim.similarities.SparseTermSimilarityMatrix`
            A term similarity matrix.
        num_best : int, optional
            The number of results to retrieve for a query, if None - return similarities with all elements from corpus.
        chunksize: int, optional
            Size of one corpus chunk.
        normalized : tuple of {True, False, 'maintain'}, optional
            First/second value specifies whether the query/document vectors in the inner product
            will be L2-normalized (True; corresponds to the soft cosine similarity measure; default),
            maintain their L2-norm during change of basis ('maintain'; corresponds to query
            expansion with partial membership), or kept as-is (False;
            corresponds to query expansion).

        See Also
        --------
        :class:`~gensim.similarities.termsim.SparseTermSimilarityMatrix`
            A sparse term similarity matrix built using a term similarity index.
        :class:`~gensim.similarities.termsim.LevenshteinSimilarityIndex`
            A term similarity index that computes Levenshtein similarities between terms.
        :class:`~gensim.similarities.termsim.WordEmbeddingSimilarityIndex`
            A term similarity index that computes cosine similarities between word embeddings.

        FN)similarity_matrixrs   rQ   r(   rK   
normalizedr)   r^   aranger   r   )r   rQ   r   r(   rK   r   r   r   r   r     s    
zSoftCosineSimilarity.__init__c                 C   s
   t | jS rR   r   rQ   r   r   r   r   r     s    zSoftCosineSimilarity.__len__c                    s    j st S t|\}}|s>t|tjr> fdd|D } jj| j  j	d}t
j|rnt| S t|rt|S t|d S )a  Get similarity between `query` and this index.

        Warnings
        --------
        Do not use this function directly; use the `self[query]` syntax instead.

        Parameters
        ----------
        query : {list of (int, number), iterable of list of (int, number)}
            Document or collection of documents.

        Return
        ------
        :class:`numpy.ndarray`
            Similarity matrix.

        c                    s   g | ]} j | qS r   rQ   rU   ir   r   r   rz     r9   z9SoftCosineSimilarity.get_similarities.<locals>.<listcomp>)r   r   )rQ   r^   arrayr   r   r]   r_   r   Zinner_productr   r`   ra   rb   r   ZtodenseZisscalarr   r   r   r   r     s    

z%SoftCosineSimilarity.get_similaritiesc                 C   s   d| j jt| | jjd f S )Nr   r   )r   r    r   r   r   r   r   r   r   r!     s    zSoftCosineSimilarity.__str__)NrC   r   r    r.   r/   r0   r   r   r   r!   r   r   r   r   r   r  s
   
- r   c                   @   s2   e Zd ZdZdddZdd Zdd	 Zd
d ZdS )WmdSimilaritya  Compute negative WMD similarity against a corpus of documents.

    Check out `the Gallery <https://radimrehurek.com/gensim/auto_examples/tutorials/run_wmd.html>`__
    for more examples.

    When using this code, please consider citing the following papers:

    * `Ofir Pele and Michael Werman, "A linear time histogram metric for improved SIFT matching"
      <http://www.cs.huji.ac.il/~werman/Papers/ECCV2008.pdf>`_
    * `Ofir Pele and Michael Werman, "Fast and robust earth mover's distances"
      <http://www.cs.huji.ac.il/~werman/Papers/ICCV2009.pdf>`_
    * `Matt Kusner et al. "From Word Embeddings To Document Distances"
      <http://proceedings.mlr.press/v37/kusnerb15.pdf>`_

    Example
    -------
    .. sourcecode:: pycon

        >>> from gensim.test.utils import common_texts
        >>> from gensim.models import Word2Vec
        >>> from gensim.similarities import WmdSimilarity
        >>>
        >>> model = Word2Vec(common_texts, vector_size=20, min_count=1)  # train word-vectors
        >>>
        >>> index = WmdSimilarity(common_texts, model)
        >>> # Make query.
        >>> query = ['trees']
        >>> sims = index[query]

    NrC   c                 C   s2   || _ || _|| _|| _d| _tt|| _dS )a  

        Parameters
        ----------
        corpus: iterable of list of str
            A list of documents, each of which is a list of tokens.
        kv_model: :class:`~gensim.models.keyedvectors.KeyedVectors`
            A set of KeyedVectors
        num_best: int, optional
            Number of results to retrieve.
        chunksize : int, optional
            Size of chunk.

        FN)	rQ   wvr(   rK   r)   r^   r   r   r   )r   rQ   Zkv_modelr(   rK   r   r   r   r      s    zWmdSimilarity.__init__c                 C   s
   t | jS )zGet size of corpus.r   r   r   r   r   r     s    zWmdSimilarity.__len__c                    s   t tjrfddD r0t d ts6gt}g }t|D ]<  fddjD }t|}dd|  }|| qJt|dkr|d }n
t|}|S )a  Get similarity between `query` and this index.

        Warnings
        --------
        Do not use this function directly; use the `self[query]` syntax instead.

        Parameters
        ----------
        query : {list of str, iterable of list of str}
            Document or collection of documents.

        Return
        ------
        :class:`numpy.ndarray`
            Similarity matrix.

        c                    s   g | ]} j | qS r   r   r   r   r   r   rz   2  r9   z2WmdSimilarity.get_similarities.<locals>.<listcomp>r   c                    s   g | ]}j |  qS r   )r   Z
wmdistance)rU   ZdocumentZqidxr,   r   r   r   rz   ;  r9   rX   r5   )	r]   r^   r_   rs   r   r   rQ   r   rg   )r   r,   Z	n_queriesr   Zqresultr   r   r   r     s    


zWmdSimilarity.get_similaritiesc                 C   s"   d| j jt| | jjjjd f S r   )r   r    r   Z	w2v_modelr   Zsyn0r   r   r   r   r   r!   J  s    zWmdSimilarity.__str__)NrC   r   r   r   r   r   r     s
   
,r   c                   @   s<   e Zd ZdZddddddejdfddZdd Zd	d
 ZdS )rq   a  Compute cosine similarity against a corpus of documents by storing the index matrix in memory.

    Notes
    -----
    Use this if your input corpus contains sparse vectors (such as TF-IDF documents) and fits into RAM.

    The matrix is internally stored as a :class:`scipy.sparse.csr_matrix` matrix. Unless the entire
    matrix fits into main memory, use :class:`~gensim.similarities.docsim.Similarity` instead.

    Takes an optional `maintain_sparsity` argument, setting this to True
    causes `get_similarities` to return a sparse matrix instead of a
    dense representation if possible.

    See also
    --------
    :class:`~gensim.similarities.docsim.Similarity`
        Index similarity (wrapper for other inheritors of :class:`~gensim.interfaces.SimilarityABC`).
    :class:`~gensim.similarities.docsim.MatrixSimilarity`
        Index similarity (dense with cosine distance).

    Ni  Fc
           
      C   s   || _ d| _|| _|	| _|durtd z$|j|j|j  }}}t	d W n t
y`   Y n0 |durn|}|du r~tddd |D }tj|||||dd	j| _| j | _td
| j dS )a  

        Parameters
        ----------
        corpus: iterable of list of (int, float)
            A list of documents in the BoW format.
        num_features : int, optional
            Size of the dictionary. Must be either specified, or present in `corpus.num_terms`.
        num_terms : int, optional
            Alias for `num_features`, you can use either.
        num_docs : int, optional
            Number of documents in `corpus`. Will be calculated if not provided.
        num_nnz : int, optional
            Number of non-zero elements in `corpus`. Will be calculated if not provided.
        num_best : int, optional
            If set, return only the `num_best` most similar documents, always leaving out documents with similarity = 0.
            Otherwise, return a full vector with one float for every document in the index.
        chunksize : int, optional
            Size of query chunks. Used internally when the query is an entire corpus.
        dtype : numpy.dtype, optional
            Data type of the internal matrix.
        maintain_sparsity : bool, optional
            Return sparse arrays from :meth:`~gensim.similarities.docsim.SparseMatrixSimilarity.get_similarities`?

        TNzcreating sparse indexz%using efficient sparse index creationzPrefusing to guess the number of sparse features: specify num_features explicitlyc                 s   sD   | ]<}t j|rt|nt|tjr2t|nt	|V  qd S rR   )
r`   ra   rb   r   Zscipy2sparser]   r^   r_   Zfull2sparserc   )rU   vr   r   r   rV     s   z2SparseMatrixSimilarity.__init__.<locals>.<genexpr>r[   )rn   ro   rp   r   Zprintprogressz
created %r)r(   r)   rK   maintain_sparsityr   r   rn   ro   rp   r$   AttributeErrorr+   r   rd   re   r   Ztocsr)
r   rQ   rH   rn   ro   rp   r(   rK   r   r   r   r   r   r   d  s0    

zSparseMatrixSimilarity.__init__c                 C   s   | j jd S )zGet size of index.r   r   r   r   r   r   r     s    zSparseMatrixSimilarity.__len__c                 C   s   t |\}}|r0tj|| jjd | jjd}nntj	|rD|j
}nZt|tjr|jdkrhdt|f|_tjj|| jjdj
}ntj|g| jjd | jjd}| j|  }|jd dkr|s|  }n| jr|j
}n
| j
}|S )a)  Get similarity between `query` and this index.

        Warnings
        --------
        Do not use this function directly; use the `self[query]` syntax instead.

        Parameters
        ----------
        query : {list of (int, number), iterable of list of (int, number), :class:`scipy.sparse.csr_matrix`}
            Document or collection of documents.

        Return
        ------
        :class:`numpy.ndarray`
            Similarity matrix (if maintain_sparsity=False) **OR**
        :class:`scipy.sparse.csc`
            otherwise

        r5   r   )r   r   r   rd   r   r   r   r`   ra   rb   re   r]   r^   r_   r   r   Z
csr_matrixZtocscr   r   r   r   r   r   r   r     s"    

z'SparseMatrixSimilarity.get_similarities)	r    r.   r/   r0   r^   r   r   r   r   r   r   r   r   rq   N  s   

>rq   )r0   Zloggingr=   r   r;   r^   Zscipy.sparser`   Zgensimr   r   r   Z	getLoggerr    r   ru   rv   ImportErrorZSaveLoadr   r4   rA   ZSimilarityABCrB   rr   r   r   rq   r   r   r   r   <module>   s0   A
x   xzom