#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2011 Radim Rehurek
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Automated tests for indirect confirmation measures in the
indirect_confirmation_measure module.
"""

import logging
import unittest

import numpy as np

from gensim.corpora.dictionary import Dictionary
from gensim.topic_coherence import indirect_confirmation_measure
from gensim.topic_coherence import text_analysis


class TestIndirectConfirmation(unittest.TestCase):
    def setUp(self):
        # Set up a toy example for better understanding and testing of this
        # module. See the module docstrings for the mathematical formulas.
        self.topics = [np.array([1, 2])]
        # Result from s_one_set segmentation:
        self.segmentation = [[(1, np.array([1, 2])), (2, np.array([1, 2]))]]
        self.gamma = 1
        self.measure = 'nlr'

        self.dictionary = Dictionary()
        self.dictionary.id2token = {1: 'fake', 2: 'tokens'}

    def test_cosine_similarity(self):
        """Test cosine_similarity()."""
        accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, self.dictionary)
        accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}}
        accumulator._num_docs = 5

        obtained = indirect_confirmation_measure.cosine_similarity(
            self.segmentation, accumulator, self.topics, self.measure, self.gamma)

        # The steps involved in this calculation are as follows:
        # 1. Take the first pair (1, array([1, 2])). Take w', which is 1.
        # 2. Calculate nlr(1, 1), nlr(1, 2). This is our first context vector.
        # 3. Take w*, which is array([1, 2]).
        # 4. Calculate nlr(1, 1) + nlr(2, 1) and nlr(1, 2) + nlr(2, 2). This is our second context vector.
        # 5. Compute the cosine similarity between these two vectors.
        # 6. Do the same for the second pair of the segmentation, then average the two similarities.
        # (A hand-computed sketch of this arithmetic is given at the bottom of this file.)
        expected = (0.6230 + 0.6230) / 2.  # To account for EPSILON approximation
        self.assertAlmostEqual(expected, obtained[0], 4)

        mean, std = indirect_confirmation_measure.cosine_similarity(
            self.segmentation, accumulator, self.topics, self.measure, self.gamma,
            with_std=True)[0]
        self.assertAlmostEqual(expected, mean, 4)
        self.assertAlmostEqual(0.0, std, 1)

    def test_word2vec_similarity(self):
        """Sanity check word2vec_similarity()."""
        accumulator = text_analysis.WordVectorsAccumulator({1, 2}, self.dictionary)
        accumulator.accumulate([
            ['fake', 'tokens'],
            ['tokens', 'fake']
        ], 5)

        mean, std = indirect_confirmation_measure.word2vec_similarity(
            self.segmentation, accumulator, with_std=True)[0]
        self.assertNotEqual(0.0, mean)
        self.assertNotEqual(0.0, std)


if __name__ == '__main__':
    logging.root.setLevel(logging.WARNING)
    unittest.main()
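

# The helper below is an illustrative sketch, not part of gensim's API, and is
# never invoked by the test suite (its name is hypothetical). It reproduces the
# expected value in ``test_cosine_similarity`` with plain numpy, assuming the
# 'nlr' measure is the usual normalized log ratio
# log(P(a, b) / (P(a) * P(b))) / -log(P(a, b)) and ignoring gensim's EPSILON
# smoothing (hence the small approximation noted in the test). With gamma = 1,
# the nlr values enter the context vectors unchanged.
def _manual_cosine_similarity_sketch():
    """Hand-compute the cosine similarity for the toy inverted index above.

    Word 1 appears in docs {2, 3, 4}, word 2 in docs {3, 5}, out of 5 docs,
    so the marginal probabilities are 0.6 and 0.4 and the joint is 0.2.
    """
    num_docs = 5.0
    p1, p2, p12 = 3 / num_docs, 2 / num_docs, 1 / num_docs

    def nlr(p_joint, p_a, p_b):
        # Normalized log ratio, without EPSILON smoothing.
        return np.log(p_joint / (p_a * p_b)) / -np.log(p_joint)

    # Context vector for w' = 1 over the topic words {1, 2}.
    v_prime = np.array([nlr(p1, p1, p1), nlr(p12, p1, p2)])
    # Context vector for w* = {1, 2}: elementwise sum of both words' vectors.
    v_star = np.array([
        nlr(p1, p1, p1) + nlr(p12, p2, p1),
        nlr(p12, p1, p2) + nlr(p2, p2, p2),
    ])

    cosine = v_prime.dot(v_star) / (np.linalg.norm(v_prime) * np.linalg.norm(v_star))
    return cosine  # ~0.6230, matching the expected value in the test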