#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Unit tests for the `corpora.Dictionary` class.
"""

from collections.abc import Mapping
from itertools import chain
import logging
import unittest
import codecs
import os
import os.path

import scipy
import gensim
from gensim.corpora import Dictionary
from gensim.utils import to_utf8
from gensim.test.utils import get_tmpfile, common_texts


class TestDictionary(unittest.TestCase):
    def setUp(self):
        self.texts = common_texts

    def test_doc_freq_one_doc(self):
        texts = [['human', 'interface', 'computer']]
        d = Dictionary(texts)
        expected = {0: 1, 1: 1, 2: 1}
        self.assertEqual(d.dfs, expected)

    def test_doc_freq_and_token2id_for_several_docs_with_one_word(self):
        # two docs
        texts = [['human'], ['human']]
        d = Dictionary(texts)
        expected = {0: 2}
        self.assertEqual(d.dfs, expected)
        # only one token (human) should exist
        expected = {'human': 0}
        self.assertEqual(d.token2id, expected)

        # three docs
        texts = [['human'], ['human'], ['human']]
        d = Dictionary(texts)
        expected = {0: 3}
        self.assertEqual(d.dfs, expected)
        # only one token (human) should exist
        expected = {'human': 0}
        self.assertEqual(d.token2id, expected)

        # four docs
        texts = [['human'], ['human'], ['human'], ['human']]
        d = Dictionary(texts)
        expected = {0: 4}
        self.assertEqual(d.dfs, expected)
        # only one token (human) should exist
        expected = {'human': 0}
        self.assertEqual(d.token2id, expected)

    def test_doc_freq_for_one_doc_with_several_word(self):
        # two words
        texts = [['human', 'cat']]
        d = Dictionary(texts)
        expected = {0: 1, 1: 1}
        self.assertEqual(d.dfs, expected)

        # three words
        texts = [['human', 'cat', 'minors']]
        d = Dictionary(texts)
        expected = {0: 1, 1: 1, 2: 1}
        self.assertEqual(d.dfs, expected)

    def test_doc_freq_and_collection_freq(self):
        # one doc
        texts = [['human', 'human', 'human']]
        d = Dictionary(texts)
        self.assertEqual(d.cfs, {0: 3})
        self.assertEqual(d.dfs, {0: 1})

        # two docs
        texts = [['human', 'human'], ['human']]
        d = Dictionary(texts)
        self.assertEqual(d.cfs, {0: 3})
        self.assertEqual(d.dfs, {0: 2})

        # three docs
        texts = [['human'], ['human'], ['human']]
        d = Dictionary(texts)
        self.assertEqual(d.cfs, {0: 3})
        self.assertEqual(d.dfs, {0: 3})

    def test_build(self):
        d = Dictionary(self.texts)

        # Since we don't specify the order in which dictionaries are built,
        # we cannot reliably test for the mapping; only the keys and values.
        expected_keys = list(range(12))
        expected_values = [2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3]
        self.assertEqual(sorted(d.dfs.keys()), expected_keys)
        self.assertEqual(sorted(d.dfs.values()), expected_values)

        expected_keys = sorted([
            'computer', 'eps', 'graph', 'human', 'interface',
            'minors', 'response', 'survey', 'system', 'time', 'trees', 'user'
        ])
        expected_values = list(range(12))
        self.assertEqual(sorted(d.token2id.keys()), expected_keys)
        self.assertEqual(sorted(d.token2id.values()), expected_values)

    def test_merge(self):
        d = Dictionary(self.texts)
        f = Dictionary(self.texts[:3])
        g = Dictionary(self.texts[3:])

        f.merge_with(g)
        self.assertEqual(sorted(d.token2id.keys()), sorted(f.token2id.keys()))

    def test_filter(self):
        d = Dictionary(self.texts)
        d.filter_extremes(no_below=2, no_above=1.0, keep_n=4)
        dfs_expected = {0: 3, 1: 3, 2: 3, 3: 3}
        cfs_expected = {0: 4, 1: 3, 2: 3, 3: 3}
        self.assertEqual(d.dfs, dfs_expected)
        self.assertEqual(d.cfs, cfs_expected)

    def testFilterKeepTokens_keepTokens(self):
        # provide keep_tokens argument, keep the tokens given
        d = Dictionary(self.texts)
        d.filter_extremes(no_below=3, no_above=1.0, keep_tokens=['human', 'survey'])
        expected = {'graph', 'trees', 'human', 'system', 'user', 'survey'}
        self.assertEqual(set(d.token2id.keys()), expected)

    def testFilterKeepTokens_unchangedFunctionality(self):
        # do not provide keep_tokens argument, filter_extremes functionality is unchanged
        d = Dictionary(self.texts)
        d.filter_extremes(no_below=3, no_above=1.0)
        expected = {'graph', 'trees', 'system', 'user'}
        self.assertEqual(set(d.token2id.keys()), expected)

    def testFilterKeepTokens_unseenToken(self):
        # do provide keep_tokens argument with unseen tokens, filter_extremes functionality is unchanged
        d = Dictionary(self.texts)
        d.filter_extremes(no_below=3, no_above=1.0, keep_tokens=['unknown_token'])
        expected = {'graph', 'trees', 'system', 'user'}
        self.assertEqual(set(d.token2id.keys()), expected)

    def testFilterKeepTokens_keepn(self):
        # keep_tokens should also work if the keep_n parameter is used, but only
        # to keep a maximum of n (so if keep_n < len(keep_n) the tokens to keep are
        # still getting removed to reduce the size to keep_n!)
        d = Dictionary(self.texts)
        # Note: there are four tokens with freq 3, all the others have frequence 2
        # in self.texts. In order to make the test result deterministic, we add
        # 2 tokens of frequency one
        d.add_documents([['worda'], ['wordb']])
        # this should keep the 3 tokens with freq 3 and the one we want to keep
        d.filter_extremes(keep_n=5, no_below=0, no_above=1.0, keep_tokens=['worda'])
        expected = {'graph', 'trees', 'system', 'user', 'worda'}
        self.assertEqual(set(d.token2id.keys()), expected)

    def test_filter_most_frequent(self):
        d = Dictionary(self.texts)
        d.filter_n_most_frequent(4)
        expected = {0: 2, 1: 2, 2: 2, 3: 2, 4: 2, 5: 2, 6: 2, 7: 2}
        self.assertEqual(d.dfs, expected)

    def test_filter_tokens(self):
        self.maxDiff = 10000
        d = Dictionary(self.texts)

        removed_word = d[0]
        d.filter_tokens([0])

        expected = {
            'computer': 0, 'eps': 8, 'graph': 10, 'human': 1,
            'interface': 2, 'minors': 11, 'response': 3, 'survey': 4,
            'system': 5, 'time': 6, 'trees': 9, 'user': 7
        }
        del expected[removed_word]
        self.assertEqual(sorted(d.token2id.keys()), sorted(expected.keys()))

        expected[removed_word] = len(expected)
        d.add_documents([[removed_word]])
        self.assertEqual(sorted(d.token2id.keys()), sorted(expected.keys()))

    def test_doc2bow(self):
        d = Dictionary([["žluťoučký"], ["žluťoučký"]])

        # pass a utf8 string
        self.assertEqual(d.doc2bow(["žluťoučký"]), [(0, 1)])

        # doc2bow must raise a TypeError if passed a string instead of array of strings by accident
        self.assertRaises(TypeError, d.doc2bow, "žluťoučký")

        # unicode must be converted to utf8
        self.assertEqual(d.doc2bow([u'\u017elu\u0165ou\u010dk\xfd']), [(0, 1)])

    def test_saveAsText(self):
        """`Dictionary` can be saved as textfile. """
        tmpf = get_tmpfile('save_dict_test.txt')
        small_text = [
            ["prvé", "slovo"],
            ["slovo", "druhé"],
            ["druhé", "slovo"]
        ]

        d = Dictionary(small_text)

        d.save_as_text(tmpf)
        with codecs.open(tmpf, 'r', encoding='utf-8') as file:
            serialized_lines = file.readlines()
            self.assertEqual(serialized_lines[0], u"3\n")
            self.assertEqual(len(serialized_lines), 4)
            # We do not know, which word will have which index
            self.assertEqual(serialized_lines[1][1:], u"\tdruhé\t2\n")
            self.assertEqual(serialized_lines[2][1:], u"\tprvé\t1\n")
            self.assertEqual(serialized_lines[3][1:], u"\tslovo\t3\n")

        d.save_as_text(tmpf, sort_by_word=False)
        with codecs.open(tmpf, 'r', encoding='utf-8') as file:
            serialized_lines = file.readlines()
            self.assertEqual(serialized_lines[0], u"3\n")
            self.assertEqual(len(serialized_lines), 4)
            self.assertEqual(serialized_lines[1][1:], u"\tslovo\t3\n")
            self.assertEqual(serialized_lines[2][1:], u"\tdruhé\t2\n")
            self.assertEqual(serialized_lines[3][1:], u"\tprvé\t1\n")

    def test_loadFromText_legacy(self):
        """
        `Dictionary` can be loaded from textfile in legacy format.
        Legacy format does not have num_docs on the first line.
        """
        tmpf = get_tmpfile('load_dict_test_legacy.txt')
        no_num_docs_serialization = to_utf8("1\tprvé\t1\n2\tslovo\t2\n")
        with open(tmpf, "wb") as file:
            file.write(no_num_docs_serialization)

        d = Dictionary.load_from_text(tmpf)
        self.assertEqual(d.token2id[u"prvé"], 1)
        self.assertEqual(d.token2id[u"slovo"], 2)
        self.assertEqual(d.dfs[1], 1)
        self.assertEqual(d.dfs[2], 2)
        self.assertEqual(d.num_docs, 0)

    def test_loadFromText(self):
        """`Dictionary` can be loaded from textfile."""
        tmpf = get_tmpfile('load_dict_test.txt')
        no_num_docs_serialization = to_utf8("2\n1\tprvé\t1\n2\tslovo\t2\n")
        with open(tmpf, "wb") as file:
            file.write(no_num_docs_serialization)

        d = Dictionary.load_from_text(tmpf)
        self.assertEqual(d.token2id[u"prvé"], 1)
        self.assertEqual(d.token2id[u"slovo"], 2)
        self.assertEqual(d.dfs[1], 1)
        self.assertEqual(d.dfs[2], 2)
        self.assertEqual(d.num_docs, 2)

    def test_saveAsText_and_loadFromText(self):
        """`Dictionary` can be saved as textfile and loaded again from textfile. """
        tmpf = get_tmpfile('dict_test.txt')
        for sort_by_word in [True, False]:
            d = Dictionary(self.texts)
            d.save_as_text(tmpf, sort_by_word=sort_by_word)
            self.assertTrue(os.path.exists(tmpf))

            d_loaded = Dictionary.load_from_text(tmpf)
            self.assertNotEqual(d_loaded, None)
            self.assertEqual(d_loaded.token2id, d.token2id)

    def test_from_corpus(self):
        """build `Dictionary` from an existing corpus"""

        documents = [
            "Human machine interface for lab abc computer applications",
            "A survey of user opinion of computer system response time",
            "The EPS user interface management system",
            "System and human system engineering testing of EPS",
            "Relation of user perceived response time to error measurement",
            "The generation of random binary unordered trees",
            "The intersection graph of paths in trees",
            "Graph minors IV Widths of trees and well quasi ordering",
            "Graph minors A survey"
        ]
        stoplist = set('for a of the and to in'.split())
        texts = [
            [word for word in document.lower().split() if word not in stoplist]
            for document in documents]

        # remove words that appear only once
        all_tokens = list(chain.from_iterable(texts))
        tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
        texts = [[word for word in text if word not in tokens_once] for text in texts]

        dictionary = Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]

        # Create dictionary from corpus without a token map
        dictionary_from_corpus = Dictionary.from_corpus(corpus)

        dict_token2id_vals = sorted(dictionary.token2id.values())
        dict_from_corpus_vals = sorted(dictionary_from_corpus.token2id.values())
        self.assertEqual(dict_token2id_vals, dict_from_corpus_vals)
        self.assertEqual(dictionary.dfs, dictionary_from_corpus.dfs)
        self.assertEqual(dictionary.num_docs, dictionary_from_corpus.num_docs)
        self.assertEqual(dictionary.num_pos, dictionary_from_corpus.num_pos)
        self.assertEqual(dictionary.num_nnz, dictionary_from_corpus.num_nnz)

        # Create dictionary from corpus with an id=>token map
        dictionary_from_corpus_2 = Dictionary.from_corpus(corpus, id2word=dictionary)

        self.assertEqual(dictionary.token2id, dictionary_from_corpus_2.token2id)
        self.assertEqual(dictionary.dfs, dictionary_from_corpus_2.dfs)
        self.assertEqual(dictionary.num_docs, dictionary_from_corpus_2.num_docs)
        self.assertEqual(dictionary.num_pos, dictionary_from_corpus_2.num_pos)
        self.assertEqual(dictionary.num_nnz, dictionary_from_corpus_2.num_nnz)

        # Ensure Sparse2Corpus is compatible with from_corpus
        bow = gensim.matutils.Sparse2Corpus(scipy.sparse.rand(10, 100))
        dictionary = Dictionary.from_corpus(bow)
        self.assertEqual(dictionary.num_docs, 100)

    def test_dict_interface(self):
        """Test Python 2 dict-like interface in both Python 2 and 3."""
        d = Dictionary(self.texts)

        self.assertTrue(isinstance(d, Mapping))

        self.assertEqual(list(zip(d.keys(), d.values())), list(d.items()))

        # Even in Py3, we want the iter* members.
        self.assertEqual(list(d.items()), list(d.iteritems()))
        self.assertEqual(list(d.keys()), list(d.iterkeys()))
        self.assertEqual(list(d.values()), list(d.itervalues()))

    def test_patch_with_special_tokens(self):
        special_tokens = {'pad': 0, 'space': 1, 'quake': 3}
        corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
        d = Dictionary(corpus)
        self.assertEqual(len(d.token2id), 5)
        d.patch_with_special_tokens(special_tokens)
        self.assertEqual(d.token2id['pad'], 0)
        self.assertEqual(d.token2id['space'], 1)
        self.assertEqual(d.token2id['quake'], 3)
        self.assertEqual(len(d.token2id), 8)
        self.assertNotIn((0, 1), d.doc2bow(corpus[0]))
        self.assertIn((0, 1), d.doc2bow(['pad'] + corpus[0]))
        corpus_with_special_tokens = [["máma", "mele", "maso"], ["ema", "má", "máma", "space"]]
        d = Dictionary(corpus_with_special_tokens)
        self.assertEqual(len(d.token2id), 6)
        self.assertNotEqual(d.token2id['space'], 1)
        d.patch_with_special_tokens(special_tokens)
        self.assertEqual(len(d.token2id), 8)
        self.assertEqual(max(d.token2id.values()), 7)
        self.assertEqual(d.token2id['space'], 1)
        self.assertNotIn((1, 1), d.doc2bow(corpus_with_special_tokens[0]))
        self.assertIn((1, 1), d.doc2bow(corpus_with_special_tokens[1]))

    def test_most_common_with_n(self):
        texts = [['human', 'human', 'human', 'computer', 'computer', 'interface', 'interface']]
        d = Dictionary(texts)
        expected = [('human', 3), ('computer', 2)]
        assert d.most_common(n=2) == expected

    def test_most_common_without_n(self):
        texts = [['human', 'human', 'human', 'computer', 'computer', 'interface', 'interface']]
        d = Dictionary(texts)
        expected = [('human', 3), ('computer', 2), ('interface', 2)]
        assert d.most_common(n=None) == expected


# endclass TestDictionary


if __name__ == '__main__':
    logging.basicConfig(level=logging.WARNING)
    unittest.main()