#!/usr/bin/env python
#
# Natural Language Toolkit: TGrep search
#
# Copyright (C) 2001-2022 NLTK Project
# Author: Will Roberts
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Unit tests for nltk.tgrep.
"""


import unittest

from nltk import tgrep
from nltk.tree import ParentedTree


class TestSequenceFunctions(unittest.TestCase):

    """
    Class containing unit tests for nltk.tgrep.

    The tests fall into two groups: tokenizer tests, which compare the
    output of ``tgrep.tgrep_tokenize`` with literal token lists, and
    search tests, which run ``tgrep.tgrep_positions`` /
    ``tgrep.tgrep_nodes`` over small ``ParentedTree`` fixtures and compare
    the results with the expected tree positions.
    """

    def test_tokenize_simple(self):
        """
        Simple test of tokenization.
        """
        tokens = tgrep.tgrep_tokenize("A .. (B !< C . D) | ![<< (E , F) $ G]")
        self.assertEqual(
            tokens,
            [
                "A", "..", "(", "B", "!", "<", "C", ".", "D", ")", "|",
                "!", "[", "<<", "(", "E", ",", "F", ")", "$", "G", "]",
            ],
        )

    def test_tokenize_encoding(self):
        """
        Test that tokenization handles bytes and strs the same way.
        """
        self.assertEqual(
            tgrep.tgrep_tokenize(b"A .. (B !< C . D) | ![<< (E , F) $ G]"),
            tgrep.tgrep_tokenize("A .. (B !< C . D) | ![<< (E , F) $ G]"),
        )

    def test_tokenize_link_types(self):
        """
        Test tokenization of basic link types.

        Each operator is checked between two node names with no
        surrounding whitespace, first on its own and then negated with a
        leading ``!`` (which must come out as a separate token).
        """
        link_types = [
            "<", ">", "<3", ">3", "<,", ">,", "<-3", ">-3", "<-", ">-",
            "<'", ">'", "<:", ">:", "<<", ">>", "<<,", ">>,", "<<'", ">>'",
            "<<:", ">>:", ".", ",", "..", ",,", "$", "$.", "$,", "$..", "$,,",
        ]
        # subTest keeps the remaining operators running (and names the
        # failing one) if a single operator is tokenized incorrectly.
        for op in link_types:
            with self.subTest(op=op):
                self.assertEqual(tgrep.tgrep_tokenize(f"A{op}B"), ["A", op, "B"])
        for op in link_types:
            with self.subTest(op="!" + op):
                self.assertEqual(
                    tgrep.tgrep_tokenize(f"A!{op}B"), ["A", "!", op, "B"]
                )

    def test_tokenize_examples(self):
        """
        Test tokenization of the TGrep2 manual example patterns.
        """
        self.assertEqual(tgrep.tgrep_tokenize("NP < PP"), ["NP", "<", "PP"])
        self.assertEqual(tgrep.tgrep_tokenize("/^NP/"), ["/^NP/"])
        self.assertEqual(
            tgrep.tgrep_tokenize("NP << PP . VP"), ["NP", "<<", "PP", ".", "VP"]
        )
        self.assertEqual(
            tgrep.tgrep_tokenize("NP << PP | . VP"), ["NP", "<<", "PP", "|", ".", "VP"]
        )
        self.assertEqual(
            tgrep.tgrep_tokenize("NP !<< PP [> NP | >> VP]"),
            ["NP", "!", "<<", "PP", "[", ">", "NP", "|", ">>", "VP", "]"],
        )
        self.assertEqual(
            tgrep.tgrep_tokenize("NP << (PP . VP)"),
            ["NP", "<<", "(", "PP", ".", "VP", ")"],
        )
        self.assertEqual(
            tgrep.tgrep_tokenize("NP <' (PP <, (IN < on))"),
            ["NP", "<'", "(", "PP", "<,", "(", "IN", "<", "on", ")", ")"],
        )
        self.assertEqual(
            tgrep.tgrep_tokenize("S < (A < B) < C"),
            ["S", "<", "(", "A", "<", "B", ")", "<", "C"],
        )
        self.assertEqual(
            tgrep.tgrep_tokenize("S < ((A < B) < C)"),
            ["S", "<", "(", "(", "A", "<", "B", ")", "<", "C", ")"],
        )
        self.assertEqual(
            tgrep.tgrep_tokenize("S < (A < B < C)"),
            ["S", "<", "(", "A", "<", "B", "<", "C", ")"],
        )
        self.assertEqual(tgrep.tgrep_tokenize("A<B&.C"), ["A", "<", "B", "&", ".", "C"])

    def test_tokenize_quoting(self):
        """
        Test tokenization of quoting.

        Operator characters inside double quotes belong to the quoted
        node name and must not be split into operator tokens.
        """
        self.assertEqual(
            tgrep.tgrep_tokenize('"A<<:B"<<:"A $.. B"<"A>3B"<C'),
            ['"A<<:B"', "<<:", '"A $.. B"', "<", '"A>3B"', "<", "C"],
        )

    def test_tokenize_nodenames(self):
        """
        Test tokenization of node names.
        """
        self.assertEqual(tgrep.tgrep_tokenize("Robert"), ["Robert"])
        self.assertEqual(tgrep.tgrep_tokenize("/^[Bb]ob/"), ["/^[Bb]ob/"])
        self.assertEqual(tgrep.tgrep_tokenize("*"), ["*"])
        self.assertEqual(tgrep.tgrep_tokenize("__"), ["__"])
        # test tokenization of NLTK tree position syntax
        self.assertEqual(tgrep.tgrep_tokenize("N()"), ["N(", ")"])
        self.assertEqual(tgrep.tgrep_tokenize("N(0,)"), ["N(", "0", ",", ")"])
        self.assertEqual(tgrep.tgrep_tokenize("N(0,0)"), ["N(", "0", ",", "0", ")"])
        self.assertEqual(
            tgrep.tgrep_tokenize("N(0,0,)"), ["N(", "0", ",", "0", ",", ")"]
        )

    def test_tokenize_macros(self):
        """
        Test tokenization of macro definitions.
        """
        self.assertEqual(
            tgrep.tgrep_tokenize(
                "@ NP /^NP/;\n@ NN /^NN/;\n@NP [!< NP | < @NN] !$.. @NN"
            ),
            [
                "@", "NP", "/^NP/", ";",
                "@", "NN", "/^NN/", ";",
                "@NP", "[", "!", "<", "NP", "|", "<", "@NN", "]",
                "!", "$..", "@NN",
            ],
        )

    def test_node_simple(self):
        """
        Test a simple use of tgrep for finding nodes matching a given
        pattern.
        """
        tree = ParentedTree.fromstring(
            "(S (NP (DT the) (JJ big) (NN dog)) " "(VP bit) (NP (DT a) (NN cat)))"
        )
        self.assertEqual(list(tgrep.tgrep_positions("NN", [tree])), [[(0, 2), (2, 1)]])
        self.assertEqual(
            list(tgrep.tgrep_nodes("NN", [tree])), [[tree[0, 2], tree[2, 1]]]
        )
        self.assertEqual(
            list(tgrep.tgrep_positions("NN|JJ", [tree])), [[(0, 1), (0, 2), (2, 1)]]
        )

    def test_node_printing(self):
        """Test that the tgrep print operator ' is properly ignored."""
        tree = ParentedTree.fromstring("(S (n x) (N x))")
        self.assertEqual(
            list(tgrep.tgrep_positions("N", [tree])),
            list(tgrep.tgrep_positions("'N", [tree])),
        )
        self.assertEqual(
            list(tgrep.tgrep_positions("/[Nn]/", [tree])),
            list(tgrep.tgrep_positions("'/[Nn]/", [tree])),
        )

    def test_node_encoding(self):
        """
        Test that tgrep search strings handles bytes and strs the same
        way.
        """
        tree = ParentedTree.fromstring(
            "(S (NP (DT the) (JJ big) (NN dog)) " "(VP bit) (NP (DT a) (NN cat)))"
        )
        # Each assertion pairs a bytes pattern with the equivalent str
        # pattern; comparing bytes with bytes would pass trivially.
        self.assertEqual(
            list(tgrep.tgrep_positions(b"NN", [tree])),
            list(tgrep.tgrep_positions("NN", [tree])),
        )
        self.assertEqual(
            list(tgrep.tgrep_nodes(b"NN", [tree])),
            list(tgrep.tgrep_nodes("NN", [tree])),
        )
        self.assertEqual(
            list(tgrep.tgrep_positions(b"NN|JJ", [tree])),
            list(tgrep.tgrep_positions("NN|JJ", [tree])),
        )

    def test_node_nocase(self):
        """
        Test selecting nodes using case insensitive node names.
        """
        tree = ParentedTree.fromstring("(S (n x) (N x))")
        self.assertEqual(list(tgrep.tgrep_positions('"N"', [tree])), [[(1,)]])
        self.assertEqual(list(tgrep.tgrep_positions('i@"N"', [tree])), [[(0,), (1,)]])

    def test_node_quoted(self):
        """
        Test selecting nodes using quoted node names.
        """
        tree = ParentedTree.fromstring('(N ("N" x) (N" x) ("\\" x))')
        self.assertEqual(list(tgrep.tgrep_positions('"N"', [tree])), [[()]])
        self.assertEqual(list(tgrep.tgrep_positions('"\\"N\\""', [tree])), [[(0,)]])
        self.assertEqual(list(tgrep.tgrep_positions('"N\\""', [tree])), [[(1,)]])
        self.assertEqual(list(tgrep.tgrep_positions('"\\"\\\\\\""', [tree])), [[(2,)]])

    def test_node_regex(self):
        """
        Test regex matching on nodes.
        """
        tree = ParentedTree.fromstring("(S (NP-SBJ x) (NP x) (NNP x) (VP x))")
        # This is a regular expression that matches any node whose
        # name starts with NP, including NP-SBJ:
        self.assertEqual(list(tgrep.tgrep_positions("/^NP/", [tree])), [[(0,), (1,)]])

    def test_node_regex_2(self):
        """
        Test regex matching on nodes.
        """
        tree = ParentedTree.fromstring("(S (SBJ x) (SBJ1 x) (NP-SBJ x))")
        self.assertEqual(list(tgrep.tgrep_positions("/^SBJ/", [tree])), [[(0,), (1,)]])
        # This is a regular expression that matches any node whose
        # name includes SBJ, including NP-SBJ:
        self.assertEqual(
            list(tgrep.tgrep_positions("/SBJ/", [tree])), [[(0,), (1,), (2,)]]
        )

    def test_node_tree_position(self):
        """
        Test matching on nodes based on NLTK tree position.
        """
        tree = ParentedTree.fromstring("(S (NP-SBJ x) (NP x) (NNP x) (VP x))")
        # test all tree positions that are not leaves
        leaf_positions = {tree.leaf_treeposition(x) for x in range(len(tree.leaves()))}
        tree_positions = [x for x in tree.treepositions() if x not in leaf_positions]
        for position in tree_positions:
            # The pattern is "N" followed by the tuple's repr, e.g. "N(0,)".
            node_id = f"N{position}"
            tgrep_positions = list(tgrep.tgrep_positions(node_id, [tree]))
            self.assertEqual(len(tgrep_positions[0]), 1)
            self.assertEqual(tgrep_positions[0][0], position)

    def test_node_noleaves(self):
        """
        Test node name matching with the search_leaves flag set to False.
        """
        tree = ParentedTree.fromstring("(S (A (T x)) (B (N x)))")
        self.assertEqual(
            list(tgrep.tgrep_positions("x", [tree])), [[(0, 0, 0), (1, 0, 0)]]
        )
        self.assertEqual(list(tgrep.tgrep_positions("x", [tree], False)), [[]])

    def tests_rel_dominance(self):
        """
        Test matching nodes based on dominance relations.
        """
        tree = ParentedTree.fromstring("(S (A (T x)) (B (N x)))")
        self.assertEqual(list(tgrep.tgrep_positions("* < T", [tree])), [[(0,)]])
        self.assertEqual(list(tgrep.tgrep_positions("* < T > S", [tree])), [[(0,)]])
        self.assertEqual(
            list(tgrep.tgrep_positions("* !< T", [tree])),
            [[(), (0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0)]],
        )
        self.assertEqual(list(tgrep.tgrep_positions("* !< T > S", [tree])), [[(1,)]])
        self.assertEqual(list(tgrep.tgrep_positions("* > A", [tree])), [[(0, 0)]])
        self.assertEqual(list(tgrep.tgrep_positions("* > B", [tree])), [[(1, 0)]])
        self.assertEqual(
            list(tgrep.tgrep_positions("* !> B", [tree])),
            [[(), (0,), (0, 0), (0, 0, 0), (1,), (1, 0, 0)]],
        )
        self.assertEqual(
            list(tgrep.tgrep_positions("* !> B >> S", [tree])), [[(0,), (0, 0), (1,)]]
        )
        self.assertEqual(
            list(tgrep.tgrep_positions("* >> S", [tree])),
            [[(0,), (0, 0), (1,), (1, 0)]],
        )
        self.assertEqual(
            list(tgrep.tgrep_positions("* >>, S", [tree])), [[(0,), (0, 0)]]
        )
        self.assertEqual(
            list(tgrep.tgrep_positions("* >>' S", [tree])), [[(1,), (1, 0)]]
        )
        # Known issue:
        # self.assertEqual(list(tgrep.tgrep_positions('* !>> S', [tree])),
        #                 [[()]])
        self.assertEqual(list(tgrep.tgrep_positions("* << T", [tree])), [[(), (0,)]])
        self.assertEqual(list(tgrep.tgrep_positions("* <<' T", [tree])), [[(0,)]])
        self.assertEqual(list(tgrep.tgrep_positions("* <<1 N", [tree])), [[(1,)]])
        self.assertEqual(
            list(tgrep.tgrep_positions("* !<< T", [tree])),
            [[(0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0)]],
        )
        tree = ParentedTree.fromstring("(S (A (T x)) (B (T x) (N x )))")
        self.assertEqual(list(tgrep.tgrep_positions("* <: T", [tree])), [[(0,)]])
        self.assertEqual(list(tgrep.tgrep_positions("* < T", [tree])), [[(0,), (1,)]])
        self.assertEqual(
            list(tgrep.tgrep_positions("* !<: T", [tree])),
            [[(), (0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0), (1, 1), (1, 1, 0)]],
        )
        self.assertEqual(list(tgrep.tgrep_positions("* !<: T > S", [tree])), [[(1,)]])
        tree = ParentedTree.fromstring("(S (T (A x) (B x)) (T (C x)))")
        self.assertEqual(list(tgrep.tgrep_positions("* >: T", [tree])), [[(1, 0)]])
        self.assertEqual(
            list(tgrep.tgrep_positions("* !>: T", [tree])),
            [[(), (0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1,), (1, 0, 0)]],
        )
        tree = ParentedTree.fromstring(
            "(S (A (B (C (D (E (T x))))))" " (A (B (C (D (E (T x))) (N x)))))"
        )
        self.assertEqual(
            list(tgrep.tgrep_positions("* <<: T", [tree])),
            [
                [
                    (0,),
                    (0, 0),
                    (0, 0, 0),
                    (0, 0, 0, 0),
                    (0, 0, 0, 0, 0),
                    (1, 0, 0, 0),
                    (1, 0, 0, 0, 0),
                ]
            ],
        )
        self.assertEqual(
            list(tgrep.tgrep_positions("* >>: A", [tree])),
            [
                [
                    (0, 0),
                    (0, 0, 0),
                    (0, 0, 0, 0),
                    (0, 0, 0, 0, 0),
                    (0, 0, 0, 0, 0, 0),
                    (1, 0),
                    (1, 0, 0),
                ]
            ],
        )

    def test_bad_operator(self):
        """
        Test error handling of undefined tgrep operators.
        """
        tree = ParentedTree.fromstring("(S (A (T x)) (B (N x)))")
        # Build and consume the search inside the context manager so the
        # exception is caught whether it is raised when the search is
        # created or only once the results are iterated.
        with self.assertRaises(tgrep.TgrepException):
            list(tgrep.tgrep_positions("* >>> S", [tree]))

    def test_comments(self):
        """
        Test that comments are correctly filtered out of tgrep search
        strings.
        """
        tree = ParentedTree.fromstring("(S (NN x) (NP x) (NN x))")
        search1 = """
        @ NP /^NP/;
        @ NN /^NN/;
        @NN
        """
        self.assertEqual(list(tgrep.tgrep_positions(search1, [tree])), [[(0,), (2,)]])
        # Same search with "#" comment lines added; the result must not
        # change.
        search2 = """
        # macros
        @ NP /^NP/;
        @ NN /^NN/;

        # search string
        @NN
        """
        self.assertEqual(list(tgrep.tgrep_positions(search2, [tree])), [[(0,), (2,)]])

    def test_rel_sister_nodes(self):
        """
        Test matching sister nodes in a tree.
        """
        tree = ParentedTree.fromstring("(S (A x) (B x) (C x))")
        self.assertEqual(list(tgrep.tgrep_positions("* $. B", [tree])), [[(0,)]])
        self.assertEqual(list(tgrep.tgrep_positions("* $.. B", [tree])), [[(0,)]])
        self.assertEqual(list(tgrep.tgrep_positions("* $, B", [tree])), [[(2,)]])
        self.assertEqual(list(tgrep.tgrep_positions("* $,, B", [tree])), [[(2,)]])
        self.assertEqual(list(tgrep.tgrep_positions("* $ B", [tree])), [[(0,), (2,)]])

    def tests_rel_indexed_children(self):
        """
        Test matching nodes based on their index in their parent node.
        """
        tree = ParentedTree.fromstring("(S (A x) (B x) (C x))")
        self.assertEqual(list(tgrep.tgrep_positions("* >, S", [tree])), [[(0,)]])
        self.assertEqual(list(tgrep.tgrep_positions("* >1 S", [tree])), [[(0,)]])
        self.assertEqual(list(tgrep.tgrep_positions("* >2 S", [tree])), [[(1,)]])
        self.assertEqual(list(tgrep.tgrep_positions("* >3 S", [tree])), [[(2,)]])
        self.assertEqual(list(tgrep.tgrep_positions("* >' S", [tree])), [[(2,)]])
        self.assertEqual(list(tgrep.tgrep_positions("* >-1 S", [tree])), [[(2,)]])
        self.assertEqual(list(tgrep.tgrep_positions("* >-2 S", [tree])), [[(1,)]])
        self.assertEqual(list(tgrep.tgrep_positions("* >-3 S", [tree])), [[(0,)]])
        tree = ParentedTree.fromstring(
            "(S (D (A x) (B x) (C x)) (E (B x) (C x) (A x)) " "(F (C x) (A x) (B x)))"
        )
        self.assertEqual(list(tgrep.tgrep_positions("* <, A", [tree])), [[(0,)]])
        self.assertEqual(list(tgrep.tgrep_positions("* <1 A", [tree])), [[(0,)]])
        self.assertEqual(list(tgrep.tgrep_positions("* <2 A", [tree])), [[(2,)]])
        self.assertEqual(list(tgrep.tgrep_positions("* <3 A", [tree])), [[(1,)]])
        self.assertEqual(list(tgrep.tgrep_positions("* <' A", [tree])), [[(1,)]])
        self.assertEqual(list(tgrep.tgrep_positions("* <-1 A", [tree])), [[(1,)]])
        self.assertEqual(list(tgrep.tgrep_positions("* <-2 A", [tree])), [[(2,)]])
        self.assertEqual(list(tgrep.tgrep_positions("* <-3 A", [tree])), [[(0,)]])

    def test_rel_precedence(self):
        """
        Test matching nodes based on precedence relations.
        """
        tree = ParentedTree.fromstring(
            "(S (NP (NP (PP x)) (NP (AP x)))"
            " (VP (AP (X (PP x)) (Y (AP x))))"
            " (NP (RC (NP (AP x)))))"
        )
        self.assertEqual(
            list(tgrep.tgrep_positions("* . X", [tree])), [[(0,), (0, 1), (0, 1, 0)]]
        )
        self.assertEqual(
            list(tgrep.tgrep_positions("* . Y", [tree])), [[(1, 0, 0), (1, 0, 0, 0)]]
        )
        self.assertEqual(
            list(tgrep.tgrep_positions("* .. X", [tree])),
            [[(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0)]],
        )
        self.assertEqual(
            list(tgrep.tgrep_positions("* .. Y", [tree])),
            [[(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1, 0, 0), (1, 0, 0, 0)]],
        )
        self.assertEqual(
            list(tgrep.tgrep_positions("* , X", [tree])), [[(1, 0, 1), (1, 0, 1, 0)]]
        )
        self.assertEqual(
            list(tgrep.tgrep_positions("* , Y", [tree])),
            [[(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]],
        )
        self.assertEqual(
            list(tgrep.tgrep_positions("* ,, X", [tree])),
            [[(1, 0, 1), (1, 0, 1, 0), (2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]],
        )
        self.assertEqual(
            list(tgrep.tgrep_positions("* ,, Y", [tree])),
            [[(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]],
        )

    def test_examples(self):
        """
        Test the Basic Examples from the TGrep2 manual.
        """
        tree = ParentedTree.fromstring("(S (NP (AP x)) (NP (PP x)))")
        # This matches any NP node that immediately dominates a PP:
        self.assertEqual(list(tgrep.tgrep_positions("NP < PP", [tree])), [[(1,)]])

        tree = ParentedTree.fromstring("(S (NP x) (VP x) (NP (PP x)) (VP x))")
        # This matches an NP that dominates a PP and is immediately
        # followed by a VP:
        self.assertEqual(list(tgrep.tgrep_positions("NP << PP . VP", [tree])), [[(2,)]])

        tree = ParentedTree.fromstring(
            "(S (NP (AP x)) (NP (PP x)) " "(NP (DET x) (NN x)) (VP x))"
        )
        # This matches an NP that dominates a PP or is immediately
        # followed by a VP:
        self.assertEqual(
            list(tgrep.tgrep_positions("NP << PP | . VP", [tree])), [[(1,), (2,)]]
        )

        tree = ParentedTree.fromstring(
            "(S (NP (NP (PP x)) (NP (AP x)))"
            " (VP (AP (NP (PP x)) (NP (AP x))))"
            " (NP (RC (NP (AP x)))))"
        )
        # This matches an NP that does not dominate a PP. Also, the NP
        # must either have a parent that is an NP or be dominated by a
        # VP:
        self.assertEqual(
            list(tgrep.tgrep_positions("NP !<< PP [> NP | >> VP]", [tree])),
            [[(0, 1), (1, 0, 1)]],
        )

        tree = ParentedTree.fromstring(
            "(S (NP (AP (PP x) (VP x))) " "(NP (AP (PP x) (NP x))) (NP x))"
        )
        # This matches an NP that dominates a PP which itself is
        # immediately followed by a VP. Note the use of parentheses to
        # group ". VP" with the PP rather than with the NP:
        self.assertEqual(
            list(tgrep.tgrep_positions("NP << (PP . VP)", [tree])), [[(0,)]]
        )

        tree = ParentedTree.fromstring(
            "(S (NP (DET a) (NN cat) (PP (IN on) (NP x)))"
            " (NP (DET a) (NN cat) (PP (IN on) (NP x)) (PP x))"
            " (NP x))"
        )
        # This matches an NP whose last child is a PP that begins with
        # the preposition "on":
        self.assertEqual(
            list(tgrep.tgrep_positions("NP <' (PP <, (IN < on))", [tree])), [[(0,)]]
        )

        tree = ParentedTree.fromstring(
            "(S (S (C x) (A (B x))) (S (C x) (A x)) " "(S (D x) (A (B x))))"
        )
        # The following pattern matches an S which has a child A and
        # another child that is a C and that the A has a child B:
        self.assertEqual(
            list(tgrep.tgrep_positions("S < (A < B) < C", [tree])), [[(0,)]]
        )

        tree = ParentedTree.fromstring(
            "(S (S (A (B x) (C x))) (S (S (C x) (A (B x)))))"
        )
        # However, this pattern means that S has child A and that A
        # has children B and C:
        self.assertEqual(
            list(tgrep.tgrep_positions("S < ((A < B) < C)", [tree])), [[(0,)]]
        )

        # It is equivalent to this:
        self.assertEqual(
            list(tgrep.tgrep_positions("S < (A < B < C)", [tree])), [[(0,)]]
        )

    def test_use_macros(self):
        """
        Test defining and using tgrep2 macros.
        """
        tree = ParentedTree.fromstring(
            "(VP (VB sold) (NP (DET the) "
            "(NN heiress)) (NP (NN deed) (PREP to) "
            "(NP (DET the) (NN school) (NN house))))"
        )
        self.assertEqual(
            list(
                tgrep.tgrep_positions(
                    "@ NP /^NP/;\n@ NN /^NN/;\n@NP !< @NP !$.. @NN", [tree]
                )
            ),
            [[(1,), (2, 2)]],
        )
        # use undefined macro @CNP; the search is built and consumed
        # inside the context manager so the exception is caught wherever
        # it is raised
        with self.assertRaises(tgrep.TgrepException):
            list(
                tgrep.tgrep_positions(
                    "@ NP /^NP/;\n@ NN /^NN/;\n@CNP !< @NP !$.. @NN", [tree]
                )
            )

    def test_tokenize_node_labels(self):
        """Test tokenization of labeled nodes."""
        self.assertEqual(
            tgrep.tgrep_tokenize("S < @SBJ < (@VP < (@VB $.. @OBJ))"),
            [
                "S", "<", "@SBJ", "<",
                "(", "@VP", "<",
                "(", "@VB", "$..", "@OBJ", ")", ")",
            ],
        )
        self.assertEqual(
            tgrep.tgrep_tokenize("S < @SBJ=s < (@VP=v < (@VB $.. @OBJ))"),
            [
                "S", "<", "@SBJ", "=", "s", "<",
                "(", "@VP", "=", "v", "<",
                "(", "@VB", "$..", "@OBJ", ")", ")",
            ],
        )

    def test_tokenize_segmented_patterns(self):
        """Test tokenization of segmented patterns."""
        self.assertEqual(
            tgrep.tgrep_tokenize("S < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. =v"),
            [
                "S", "<", "@SBJ", "=", "s", "<",
                "(", "@VP", "=", "v", "<",
                "(", "@VB", "$..", "@OBJ", ")", ")",
                ":", "=s", "..", "=v",
            ],
        )

    def test_labeled_nodes(self):
        """
        Test labeled nodes.

        Test case from Emily M. Bender.
        """
        # NOTE: the blank line after the macro definitions is significant;
        # search_firsthalf below is cut at the first "\n\n".
        search = """
            # macros
            @ SBJ /SBJ/;
            @ VP /VP/;
            @ VB /VB/;
            @ VPoB /V[PB]/;
            @ OBJ /OBJ/;

            # 1 svo
            S < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. =v"""
        sent1 = ParentedTree.fromstring(
            "(S (NP-SBJ I) (VP (VB eat) (NP-OBJ (NNS apples))))"
        )
        sent2 = ParentedTree.fromstring(
            "(S (VP (VB eat) (NP-OBJ (NNS apples))) (NP-SBJ I))"
        )
        # Macro definitions only, followed by the pattern without node
        # labels and without the ": =s .. =v" segment.
        search_firsthalf = search.split("\n\n")[0] + "S < @SBJ < (@VP < (@VB $.. @OBJ))"
        search_rewrite = "S < (/.*SBJ/ $.. (/VP/ < (/VB/ $.. /.*OBJ/)))"

        self.assertTrue(list(tgrep.tgrep_positions(search_firsthalf, [sent1]))[0])
        self.assertTrue(list(tgrep.tgrep_positions(search, [sent1]))[0])
        self.assertTrue(list(tgrep.tgrep_positions(search_rewrite, [sent1]))[0])
        self.assertEqual(
            list(tgrep.tgrep_positions(search, [sent1])),
            list(tgrep.tgrep_positions(search_rewrite, [sent1])),
        )
        self.assertTrue(list(tgrep.tgrep_positions(search_firsthalf, [sent2]))[0])
        self.assertFalse(list(tgrep.tgrep_positions(search, [sent2]))[0])
        self.assertFalse(list(tgrep.tgrep_positions(search_rewrite, [sent2]))[0])
        self.assertEqual(
            list(tgrep.tgrep_positions(search, [sent2])),
            list(tgrep.tgrep_positions(search_rewrite, [sent2])),
        )

    def test_multiple_conjs(self):
        """
        Test that multiple (3 or more) conjunctions of node relations are
        handled properly.
        """
        sent = ParentedTree.fromstring("((A (B b) (C c)) (A (B b) (C c) (D d)))")
        # search = '(A < B < C < D)'
        # search_tworels = '(A < B < C)'
        self.assertEqual(
            list(tgrep.tgrep_positions("(A < B < C < D)", [sent])), [[(1,)]]
        )
        self.assertEqual(
            list(tgrep.tgrep_positions("(A < B < C)", [sent])), [[(0,), (1,)]]
        )

    def test_trailing_semicolon(self):
        """
        Test that semicolons at the end of a tgrep2 search string won't
        cause a parse failure.
        """
        tree = ParentedTree.fromstring(
            "(S (NP (DT the) (JJ big) (NN dog)) " "(VP bit) (NP (DT a) (NN cat)))"
        )
        self.assertEqual(list(tgrep.tgrep_positions("NN", [tree])), [[(0, 2), (2, 1)]])
        self.assertEqual(list(tgrep.tgrep_positions("NN;", [tree])), [[(0, 2), (2, 1)]])
        self.assertEqual(
            list(tgrep.tgrep_positions("NN;;", [tree])), [[(0, 2), (2, 1)]]
        )