#!/usr/bin/env python # -*- coding: utf-8 -*- # # Copyright (C) 2018 Vit Novotny # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html """ This module allows fast fuzzy search between strings, using kNN queries with Levenshtein similarity. """ import logging from gensim.similarities.termsim import TermSimilarityIndex from gensim import utils try: from gensim.similarities.fastss import FastSS, editdist # noqa:F401 except ImportError: raise utils.NO_CYTHON logger = logging.getLogger(__name__) class LevenshteinSimilarityIndex(TermSimilarityIndex): r""" Retrieve the most similar terms from a static set of terms ("dictionary") given a query term, using Levenshtein similarity. "Levenshtein similarity" is a modification of the Levenshtein (edit) distance, defined in [charletetal17]_. This implementation uses the FastSS neighbourhood algorithm for fast kNN nearest-neighbor retrieval. Parameters ---------- dictionary : :class:`~gensim.corpora.dictionary.Dictionary` A dictionary that specifies the considered terms. alpha : float, optional Multiplicative factor `alpha` for the Levenshtein similarity. See [charletetal17]_. beta : float, optional The exponential factor `beta` for the Levenshtein similarity. See [charletetal17]_. max_distance : int, optional Do not consider terms with Levenshtein distance larger than this as "similar". This is done for performance reasons: keep this value below 3 for reasonable retrieval performance. Default is 1. See Also -------- :class:`~gensim.similarities.termsim.WordEmbeddingSimilarityIndex` Retrieve most similar terms for a given term using the cosine similarity over word embeddings. :class:`~gensim.similarities.termsim.SparseTermSimilarityMatrix` Build a term similarity matrix and compute the Soft Cosine Measure. References ---------- .. [charletetal17] Delphine Charlet and Geraldine Damnati, "SimBow at SemEval-2017 Task 3: Soft-Cosine Semantic Similarity between Questions for Community Question Answering", 2017, https://www.aclweb.org/anthology/S17-2051/. """ def __init__(self, dictionary, alpha=1.8, beta=5.0, max_distance=2): self.dictionary = dictionary self.alpha = alpha self.beta = beta self.max_distance = max_distance logger.info("creating FastSS index from %s", dictionary) self.index = FastSS(words=self.dictionary.values(), max_dist=max_distance) super(LevenshteinSimilarityIndex, self).__init__() def levsim(self, t1, t2, distance): """Calculate the Levenshtein similarity between two terms given their Levenshtein distance.""" max_lengths = max(len(t1), len(t2)) or 1 return self.alpha * (1.0 - distance * 1.0 / max_lengths)**self.beta def most_similar(self, t1, topn=10): """kNN fuzzy search: find the `topn` most similar terms from `self.dictionary` to `t1`.""" result = {} # map of {dictionary term => its levenshtein similarity to t1} if self.max_distance > 0: effective_topn = topn + 1 if t1 in self.dictionary.token2id else topn effective_topn = min(len(self.dictionary), effective_topn) # Implement a "distance backoff" algorithm: # Start with max_distance=1, for performance. And if that doesn't return enough results, # continue with max_distance=2 etc, all the way until self.max_distance which # is a hard cutoff. # At that point stop searching, even if we don't have topn results yet. # # We use the backoff algo to speed up queries for short terms. These return enough results already # with max_distance=1. # # See the discussion at https://github.com/RaRe-Technologies/gensim/pull/3146 for distance in range(1, self.max_distance + 1): for t2 in self.index.query(t1, distance).get(distance, []): if t1 == t2: continue similarity = self.levsim(t1, t2, distance) if similarity > 0: result[t2] = similarity if len(result) >= effective_topn: break return sorted(result.items(), key=lambda x: (-x[1], x[0]))[:topn]