# -*- coding: utf-8 -*- import re import weakref import six import unittest import pickle from parsel import Selector from parsel.selector import ( CannotRemoveElementWithoutRoot, CannotRemoveElementWithoutParent, ) class SelectorTestCase(unittest.TestCase): sscls = Selector def test_pickle_selector(self): sel = self.sscls(text=u'

some text

') self.assertRaises(TypeError, lambda s: pickle.dumps(s, protocol=2), sel) def test_pickle_selector_list(self): sel = self.sscls(text=u'') sel_list = sel.css('li') empty_sel_list = sel.css('p') self.assertIsInstance(sel_list, self.sscls.selectorlist_cls) self.assertIsInstance(empty_sel_list, self.sscls.selectorlist_cls) self.assertRaises(TypeError, lambda s: pickle.dumps(s, protocol=2), sel_list) self.assertRaises(TypeError, lambda s: pickle.dumps(s, protocol=2), empty_sel_list) def test_simple_selection(self): """Simple selector tests""" body = u"

" sel = self.sscls(text=body) xl = sel.xpath('//input') self.assertEqual(2, len(xl)) for x in xl: assert isinstance(x, self.sscls) self.assertEqual(sel.xpath('//input').extract(), [x.extract() for x in sel.xpath('//input')]) self.assertEqual([x.extract() for x in sel.xpath("//input[@name='a']/@name")], [u'a']) self.assertEqual([x.extract() for x in sel.xpath("number(concat(//input[@name='a']/@value, //input[@name='b']/@value))")], [u'12.0']) self.assertEqual(sel.xpath("concat('xpath', 'rules')").extract(), [u'xpathrules']) self.assertEqual([x.extract() for x in sel.xpath("concat(//input[@name='a']/@value, //input[@name='b']/@value)")], [u'12']) def test_simple_selection_with_variables(self): """Using XPath variables""" body = u"

" sel = self.sscls(text=body) self.assertEqual([x.extract() for x in sel.xpath("//input[@value=$number]/@name", number=1)], [u'a']) self.assertEqual([x.extract() for x in sel.xpath("//input[@name=$letter]/@value", letter='b')], [u'2']) self.assertEqual(sel.xpath("count(//input[@value=$number or @name=$letter])", number=2, letter='a').extract(), [u'2.0']) # you can also pass booleans self.assertEqual(sel.xpath("boolean(count(//input)=$cnt)=$test", cnt=2, test=True).extract(), [u'1']) self.assertEqual(sel.xpath("boolean(count(//input)=$cnt)=$test", cnt=4, test=True).extract(), [u'0']) self.assertEqual(sel.xpath("boolean(count(//input)=$cnt)=$test", cnt=4, test=False).extract(), [u'1']) # for named nodes, you need to use "name()=node_name" self.assertEqual(sel.xpath("boolean(count(//*[name()=$tag])=$cnt)=$test", tag="input", cnt=2, test=True).extract(), [u'1']) def test_simple_selection_with_variables_escape_friendly(self): """Using XPath variables with quotes that would need escaping with string formatting""" body = u"""

I'm mixing single and "double quotes" and I don't care :)

""" sel = self.sscls(text=body) t = 'I say "Yeah!"' # naive string formatting with give something like: # ValueError: XPath error: Invalid predicate in //input[@value="I say "Yeah!""]/@name self.assertRaises(ValueError, sel.xpath, '//input[@value="{}"]/@name'.format(t)) # with XPath variables, escaping is done for you self.assertEqual([x.extract() for x in sel.xpath("//input[@value=$text]/@name", text=t)], [u'a']) lt = """I'm mixing single and "double quotes" and I don't care :)""" # the following gives you something like # ValueError: XPath error: Invalid predicate in //p[normalize-space()='I'm mixing single and "double quotes" and I don't care :)']//@name self.assertRaises(ValueError, sel.xpath, "//p[normalize-space()='{}']//@name".format(lt)) self.assertEqual([x.extract() for x in sel.xpath("//p[normalize-space()=$lng]//@name", lng=lt)], [u'a']) def test_accessing_attributes(self): body = u""" """ sel = self.sscls(text=body) self.assertEqual({'lang': 'en', 'version': '1.0'}, sel.attrib) self.assertEqual({'id': 'some-list', 'class': 'list-cls'}, sel.css('ul')[0].attrib) # for a SelectorList, bring the attributes of first-element only self.assertEqual({'id': 'some-list', 'class': 'list-cls'}, sel.css('ul').attrib) self.assertEqual({'class': 'item-cls', 'id': 'list-item-1'}, sel.css('li').attrib) self.assertEqual({}, sel.css('body').attrib) self.assertEqual({}, sel.css('non-existing-element').attrib) self.assertEqual( [{'class': 'item-cls', 'id': 'list-item-1'}, {'class': 'item-cls active', 'id': 'list-item-2'}, {'class': 'item-cls', 'id': 'list-item-3'}], [e.attrib for e in sel.css('li')]) def test_representation_slice(self): body = u"

".format(50 * 'b') sel = self.sscls(text=body) representation = "".format(37 * 'b') if six.PY2: representation = "".format(37 * 'b') self.assertEqual( [repr(it) for it in sel.xpath('//input/@name')], [representation] ) def test_representation_unicode_query(self): body = u"

".format(50 * 'b') representation = '' if six.PY2: representation = "" sel = self.sscls(text=body) self.assertEqual( [repr(it) for it in sel.xpath(u'//input[@value="\xa9"]/@value')], [representation] ) def test_check_text_argument_type(self): self.assertRaisesRegexp(TypeError, 'text argument should be of type', self.sscls, b'') def test_extract_first(self): """Test if extract_first() returns first element""" body = u'
  • 1
  • 2
' sel = self.sscls(text=body) self.assertEqual(sel.xpath('//ul/li/text()').extract_first(), sel.xpath('//ul/li/text()').extract()[0]) self.assertEqual(sel.xpath('//ul/li[@id="1"]/text()').extract_first(), sel.xpath('//ul/li[@id="1"]/text()').extract()[0]) self.assertEqual(sel.xpath('//ul/li[2]/text()').extract_first(), sel.xpath('//ul/li/text()').extract()[1]) self.assertEqual(sel.xpath('/ul/li[@id="doesnt-exist"]/text()').extract_first(), None) def test_extract_first_default(self): """Test if extract_first() returns default value when no results found""" body = u'
  • 1
  • 2
' sel = self.sscls(text=body) self.assertEqual(sel.xpath('//div/text()').extract_first(default='missing'), 'missing') def test_selector_get_alias(self): """Test if get() returns extracted value on a Selector""" body = u'
  • 1
  • 2
  • 3
' sel = self.sscls(text=body) self.assertEqual(sel.xpath('//ul/li[position()>1]')[0].get(), u'
  • 2
  • ') self.assertEqual(sel.xpath('//ul/li[position()>1]/text()')[0].get(), u'2') def test_selector_getall_alias(self): """Test if get() returns extracted value on a Selector""" body = u'
    • 1
    • 2
    • 3
    ' sel = self.sscls(text=body) self.assertListEqual(sel.xpath('//ul/li[position()>1]')[0].getall(), [u'
  • 2
  • ']) self.assertListEqual(sel.xpath('//ul/li[position()>1]/text()')[0].getall(), [u'2']) def test_selectorlist_get_alias(self): """Test if get() returns first element for a selection call""" body = u'
    • 1
    • 2
    • 3
    ' sel = self.sscls(text=body) self.assertEqual(sel.xpath('//ul/li').get(), u'
  • 1
  • ') self.assertEqual(sel.xpath('//ul/li/text()').get(), u'1') def test_re_first(self): """Test if re_first() returns first matched element""" body = u'
    • 1
    • 2
    ' sel = self.sscls(text=body) self.assertEqual(sel.xpath('//ul/li/text()').re_first(r'\d'), sel.xpath('//ul/li/text()').re(r'\d')[0]) self.assertEqual(sel.xpath('//ul/li[@id="1"]/text()').re_first(r'\d'), sel.xpath('//ul/li[@id="1"]/text()').re(r'\d')[0]) self.assertEqual(sel.xpath('//ul/li[2]/text()').re_first(r'\d'), sel.xpath('//ul/li/text()').re(r'\d')[1]) self.assertEqual(sel.xpath('/ul/li/text()').re_first(r'\w+'), None) self.assertEqual(sel.xpath('/ul/li[@id="doesnt-exist"]/text()').re_first(r'\d'), None) self.assertEqual(sel.re_first(r'id="(\d+)'), '1') self.assertEqual(sel.re_first(r'foo'), None) self.assertEqual(sel.re_first(r'foo', default='bar'), 'bar') def test_extract_first_re_default(self): """Test if re_first() returns default value when no results found""" body = u'
    • 1
    • 2
    ' sel = self.sscls(text=body) self.assertEqual(sel.xpath('//div/text()').re_first(r'\w+', default='missing'), 'missing') self.assertEqual(sel.xpath('/ul/li/text()').re_first(r'\w+', default='missing'), 'missing') def test_select_unicode_query(self): body = u"

    " sel = self.sscls(text=body) self.assertEqual(sel.xpath(u'//input[@name="\xa9"]/@value').extract(), [u'1']) def test_list_elements_type(self): """Test Selector returning the same type in selection methods""" text = u'

    test

    ' assert isinstance(self.sscls(text=text).xpath("//p")[0], self.sscls) assert isinstance(self.sscls(text=text).css("p")[0], self.sscls) def test_boolean_result(self): body = u"

    " xs = self.sscls(text=body) self.assertEqual(xs.xpath("//input[@name='a']/@name='a'").extract(), [u'1']) self.assertEqual(xs.xpath("//input[@name='a']/@name='n'").extract(), [u'0']) def test_differences_parsing_xml_vs_html(self): """Test that XML and HTML Selector's behave differently""" # some text which is parsed differently by XML and HTML flavors text = u'

    Hello

    ' hs = self.sscls(text=text, type='html') self.assertEqual(hs.xpath("//div").extract(), [u'

    Hello

    ']) xs = self.sscls(text=text, type='xml') self.assertEqual(xs.xpath("//div").extract(), [u'

    Hello

    ']) def test_error_for_unknown_selector_type(self): self.assertRaises(ValueError, self.sscls, text=u'', type='_na_') def test_text_or_root_is_required(self): self.assertRaisesRegexp(ValueError, 'Selector needs either text or root argument', self.sscls) def test_bool(self): text = u'falsetrue' hs = self.sscls(text=text, type='html') falsish = hs.xpath('//a/@href')[0] self.assertEqual(falsish.extract(), u'') self.assertFalse(falsish) trueish = hs.xpath('//a/@href')[1] self.assertEqual(trueish.extract(), u'nonempty') self.assertTrue(trueish) def test_slicing(self): text = u'

    1

    2

    3

    ' hs = self.sscls(text=text, type='html') self.assertIsInstance(hs.css('p')[2], self.sscls) self.assertIsInstance(hs.css('p')[2:3], self.sscls.selectorlist_cls) self.assertIsInstance(hs.css('p')[:2], self.sscls.selectorlist_cls) self.assertEqual(hs.css('p')[2:3].extract(), [u'

    3

    ']) self.assertEqual(hs.css('p')[1:3].extract(), [u'

    2

    ', u'

    3

    ']) def test_nested_selectors(self): """Nested selector tests""" body = u"""
    • one
    • two
    • four
    • five
    • six
    """ x = self.sscls(text=body) divtwo = x.xpath('//div[@class="two"]') self.assertEqual(divtwo.xpath("//li").extract(), ["
  • one
  • ", "
  • two
  • ", "
  • four
  • ", "
  • five
  • ", "
  • six
  • "]) self.assertEqual(divtwo.xpath("./ul/li").extract(), ["
  • four
  • ", "
  • five
  • ", "
  • six
  • "]) self.assertEqual(divtwo.xpath(".//li").extract(), ["
  • four
  • ", "
  • five
  • ", "
  • six
  • "]) self.assertEqual(divtwo.xpath("./li").extract(), []) def test_selectorlist_getall_alias(self): """Nested selector tests using getall()""" body = u"""
    • one
    • two
    • four
    • five
    • six
    """ x = self.sscls(text=body) divtwo = x.xpath('//div[@class="two"]') self.assertEqual(divtwo.xpath("//li").getall(), ["
  • one
  • ", "
  • two
  • ", "
  • four
  • ", "
  • five
  • ", "
  • six
  • "]) self.assertEqual(divtwo.xpath("./ul/li").getall(), ["
  • four
  • ", "
  • five
  • ", "
  • six
  • "]) self.assertEqual(divtwo.xpath(".//li").getall(), ["
  • four
  • ", "
  • five
  • ", "
  • six
  • "]) self.assertEqual(divtwo.xpath("./li").getall(), []) def test_mixed_nested_selectors(self): body = u'''
    notme

    text

    foo
    ''' sel = self.sscls(text=body) self.assertEqual(sel.xpath('//div[@id="1"]').css('span::text').extract(), [u'me']) self.assertEqual(sel.css('#1').xpath('./span/text()').extract(), [u'me']) def test_dont_strip(self): sel = self.sscls(text=u'
    fff: zzz
    ') self.assertEqual(sel.xpath("//text()").extract(), [u'fff: ', u'zzz']) def test_namespaces_simple(self): body = u""" take this found """ x = self.sscls(text=body, type='xml') x.register_namespace("somens", "http://scrapy.org") self.assertEqual(x.xpath("//somens:a/text()").extract(), [u'take this']) def test_namespaces_adhoc(self): body = u""" take this found """ x = self.sscls(text=body, type='xml') self.assertEqual(x.xpath("//somens:a/text()", namespaces={"somens": "http://scrapy.org"}).extract(), [u'take this']) def test_namespaces_adhoc_variables(self): body = u""" take this found """ x = self.sscls(text=body, type='xml') self.assertEqual(x.xpath("//somens:a/following-sibling::a[@id=$identifier]/text()", namespaces={"somens": "http://scrapy.org"}, identifier="bar").extract(), [u'found']) def test_namespaces_multiple(self): body = u""" hello value iron90Dried Rose """ x = self.sscls(text=body, type='xml') x.register_namespace("xmlns", "http://webservices.amazon.com/AWSECommerceService/2005-10-05") x.register_namespace("p", "http://www.scrapy.org/product") x.register_namespace("b", "http://somens.com") self.assertEqual(len(x.xpath("//xmlns:TestTag")), 1) self.assertEqual(x.xpath("//b:Operation/text()").extract()[0], 'hello') self.assertEqual(x.xpath("//xmlns:TestTag/@b:att").extract()[0], 'value') self.assertEqual(x.xpath("//p:SecondTestTag/xmlns:price/text()").extract()[0], '90') self.assertEqual(x.xpath("//p:SecondTestTag").xpath("./xmlns:price/text()")[0].extract(), '90') self.assertEqual(x.xpath("//p:SecondTestTag/xmlns:material/text()").extract()[0], 'iron') def test_namespaces_multiple_adhoc(self): body = u""" hello value iron90Dried Rose """ x = self.sscls(text=body, type='xml') x.register_namespace("xmlns", "http://webservices.amazon.com/AWSECommerceService/2005-10-05") self.assertEqual(len(x.xpath("//xmlns:TestTag")), 1) # "b" namespace is not declared yet self.assertRaises(ValueError, x.xpath, "//xmlns:TestTag/@b:att") # "b" namespace being passed ad-hoc self.assertEqual( x.xpath("//b:Operation/text()", namespaces={"b": "http://somens.com"}).extract()[0], 'hello') # "b" namespace declaration is not cached self.assertRaises(ValueError, x.xpath, "//xmlns:TestTag/@b:att") # "xmlns" is still defined self.assertEqual( x.xpath("//xmlns:TestTag/@b:att", namespaces={"b": "http://somens.com"}).extract()[0], 'value') # chained selectors still have knowledge of register_namespace() operations self.assertEqual( x.xpath("//p:SecondTestTag", namespaces={"p": "http://www.scrapy.org/product"}) .xpath("./xmlns:price/text()")[0].extract(), '90') # but chained selector don't know about parent ad-hoc declarations self.assertRaises( ValueError, x.xpath("//p:SecondTestTag", namespaces={"p": "http://www.scrapy.org/product"}) .xpath, "p:name/text()") # ad-hoc declarations need repeats when chaining self.assertEqual( x.xpath("//p:SecondTestTag", namespaces={"p": "http://www.scrapy.org/product"}) .xpath("p:name/text()", namespaces={"p": "http://www.scrapy.org/product"}) .extract_first(), 'Dried Rose') # declaring several ad-hoc namespaces self.assertEqual( x.xpath( "string(//b:Operation/following-sibling::xmlns:TestTag" "/following-sibling::*//p:name)", namespaces={"b": "http://somens.com", "p": "http://www.scrapy.org/product"}) .extract_first(), 'Dried Rose') # "p" prefix is not cached from previous calls self.assertRaises(ValueError, x.xpath, "//p:SecondTestTag/xmlns:price/text()") x.register_namespace("p", "http://www.scrapy.org/product") self.assertEqual(x.xpath("//p:SecondTestTag/xmlns:material/text()").extract()[0], 'iron') def test_make_links_absolute(self): text = u'link to file' sel = Selector(text=text, base_url='http://example.com') sel.root.make_links_absolute() self.assertEqual(u'http://example.com/file.html', sel.xpath('//a/@href').extract_first()) def test_re(self): body = u"""
    Name: Mary
    • Name: John
    • Age: 10
    • Name: Paul
    • Age: 20
    Age: 20
    """ x = self.sscls(text=body) name_re = re.compile(r"Name: (\w+)") self.assertEqual(x.xpath("//ul/li").re(name_re), ["John", "Paul"]) self.assertEqual(x.xpath("//ul/li").re(r"Age: (\d+)"), ["10", "20"]) # Test named group, hit and miss x = self.sscls(text=u'foobar') self.assertEqual(x.re('(?Pfoo)'), ['foo']) self.assertEqual(x.re('(?Pbaz)'), []) # A purposely constructed test for an edge case x = self.sscls(text=u'baz') self.assertEqual(x.re('(?Pfoo)|(?Pbaz)'), []) def test_re_replace_entities(self): body = u"""""" x = self.sscls(text=body) name_re = re.compile('{"foo":(.*)}') # by default, only & and < are preserved ; # other entities are converted expected = u'"bar & "baz""' self.assertEqual(x.xpath("//script/text()").re(name_re), [expected]) self.assertEqual(x.xpath("//script").re(name_re), [expected]) self.assertEqual(x.xpath("//script/text()")[0].re(name_re), [expected]) self.assertEqual(x.xpath("//script")[0].re(name_re), [expected]) # check that re_first() works the same way for single value output self.assertEqual(x.xpath("//script").re_first(name_re), expected) self.assertEqual(x.xpath("//script")[0].re_first(name_re), expected) # switching off replace_entities will preserve " also expected = u'"bar & "baz""' self.assertEqual(x.xpath("//script/text()").re(name_re, replace_entities=False), [expected]) self.assertEqual(x.xpath("//script")[0].re(name_re, replace_entities=False), [expected]) self.assertEqual(x.xpath("//script/text()").re_first(name_re, replace_entities=False), expected) self.assertEqual(x.xpath("//script")[0].re_first(name_re, replace_entities=False), expected) def test_re_intl(self): body = u'
    Evento: cumplea\xf1os
    ' x = self.sscls(text=body) self.assertEqual(x.xpath("//div").re(r"Evento: (\w+)"), [u'cumplea\xf1os']) def test_selector_over_text(self): hs = self.sscls(text=u'lala') self.assertEqual(hs.extract(), u'lala') xs = self.sscls(text=u'lala', type='xml') self.assertEqual(xs.extract(), u'lala') self.assertEqual(xs.xpath('.').extract(), [u'lala']) def test_invalid_xpath(self): "Test invalid xpath raises ValueError with the invalid xpath" x = self.sscls(text=u"") xpath = "//test[@foo='bar]" self.assertRaisesRegexp(ValueError, re.escape(xpath), x.xpath, xpath) def test_invalid_xpath_unicode(self): "Test *Unicode* invalid xpath raises ValueError with the invalid xpath" x = self.sscls(text=u"") xpath = u"//test[@foo='\u0431ar]" encoded = xpath if six.PY3 else xpath.encode('unicode_escape') self.assertRaisesRegexp(ValueError, re.escape(encoded), x.xpath, xpath) def test_http_header_encoding_precedence(self): # u'\xa3' = pound symbol in unicode # u'\xc2\xa3' = pound symbol in utf-8 # u'\xa3' = pound symbol in latin-1 (iso-8859-1) text = u''' \xa3''' x = self.sscls(text=text) self.assertEqual(x.xpath("//span[@id='blank']/text()").extract(), [u'\xa3']) def test_empty_bodies_shouldnt_raise_errors(self): self.sscls(text=u'').xpath('//text()').extract() def test_bodies_with_comments_only(self): sel = self.sscls(text=u'', base_url='http://example.com') self.assertEqual(u'http://example.com', sel.root.base) def test_null_bytes_shouldnt_raise_errors(self): text = u'pre\x00post' self.sscls(text).xpath('//text()').extract() def test_replacement_char_from_badly_encoded_body(self): # \xe9 alone isn't valid utf8 sequence text = u'

    an Jos\ufffd de

    ' self.assertEqual([u'an Jos\ufffd de'], self.sscls(text).xpath('//text()').extract()) def test_select_on_unevaluable_nodes(self): r = self.sscls(text=u'some text') # Text node x1 = r.xpath('//text()') self.assertEqual(x1.extract(), [u'some text']) self.assertEqual(x1.xpath('.//b').extract(), []) # Tag attribute x1 = r.xpath('//span/@class') self.assertEqual(x1.extract(), [u'big']) self.assertEqual(x1.xpath('.//text()').extract(), []) def test_select_on_text_nodes(self): r = self.sscls(text=u'
    Options:opt1
    Otheropt2
    ') x1 = r.xpath("//div/descendant::text()[preceding-sibling::b[contains(text(), 'Options')]]") self.assertEqual(x1.extract(), [u'opt1']) x1 = r.xpath("//div/descendant::text()/preceding-sibling::b[contains(text(), 'Options')]") self.assertEqual(x1.extract(), [u'Options:']) @unittest.skip("Text nodes lost parent node reference in lxml") def test_nested_select_on_text_nodes(self): # FIXME: does not work with lxml backend [upstream] r = self.sscls(text=u'
    Options:opt1
    Otheropt2
    ') x1 = r.xpath("//div/descendant::text()") x2 = x1.xpath("./preceding-sibling::b[contains(text(), 'Options')]") self.assertEqual(x2.extract(), [u'Options:']) def test_weakref_slots(self): """Check that classes are using slots and are weak-referenceable""" x = self.sscls(text=u'') weakref.ref(x) assert not hasattr(x, '__dict__'), "%s does not use __slots__" % \ x.__class__.__name__ def test_remove_namespaces(self): xml = u""" """ sel = self.sscls(text=xml, type='xml') self.assertEqual(len(sel.xpath("//link")), 0) self.assertEqual(len(sel.xpath("./namespace::*")), 3) sel.remove_namespaces() self.assertEqual(len(sel.xpath("//link")), 3) self.assertEqual(len(sel.xpath("./namespace::*")), 1) def test_remove_namespaces_embedded(self): xml = u""" """ sel = self.sscls(text=xml, type='xml') self.assertEqual(len(sel.xpath("//link")), 0) self.assertEqual(len(sel.xpath("//stop")), 0) self.assertEqual(len(sel.xpath("./namespace::*")), 2) self.assertEqual(len(sel.xpath("//f:link", namespaces={'f': 'http://www.w3.org/2005/Atom'})), 2) self.assertEqual(len(sel.xpath("//s:stop", namespaces={'s': 'http://www.w3.org/2000/svg'})), 2) sel.remove_namespaces() self.assertEqual(len(sel.xpath("//link")), 2) self.assertEqual(len(sel.xpath("//stop")), 2) self.assertEqual(len(sel.xpath("./namespace::*")), 1) def test_remove_attributes_namespaces(self): xml = u""" """ sel = self.sscls(text=xml, type='xml') self.assertEqual(len(sel.xpath("//link/@type")), 0) sel.remove_namespaces() self.assertEqual(len(sel.xpath("//link/@type")), 3) def test_smart_strings(self): """Lxml smart strings return values""" class SmartStringsSelector(Selector): _lxml_smart_strings = True body = u"""
    • one
    • two
    • four
    • five
    • six
    """ # .getparent() is available for text nodes and attributes # only when smart_strings are on x = self.sscls(text=body) li_text = x.xpath('//li/text()') self.assertFalse(any(map(lambda e: hasattr(e.root, 'getparent'), li_text))) div_class = x.xpath('//div/@class') self.assertFalse(any(map(lambda e: hasattr(e.root, 'getparent'), div_class))) x = SmartStringsSelector(text=body) li_text = x.xpath('//li/text()') self.assertTrue(all(map(lambda e: hasattr(e.root, 'getparent'), li_text))) div_class = x.xpath('//div/@class') self.assertTrue(all(map(lambda e: hasattr(e.root, 'getparent'), div_class))) def test_xml_entity_expansion(self): malicious_xml = u''\ ' ]>&xxe;' sel = self.sscls(text=malicious_xml, type='xml') self.assertEqual(sel.extract(), '&xxe;') def test_configure_base_url(self): sel = self.sscls(text=u'nothing', base_url='http://example.com') self.assertEqual(u'http://example.com', sel.root.base) def test_extending_selector(self): class MySelectorList(Selector.selectorlist_cls): pass class MySelector(Selector): selectorlist_cls = MySelectorList sel = MySelector(text=u'
    foo
    ') self.assertIsInstance(sel.xpath('//div'), MySelectorList) self.assertIsInstance(sel.xpath('//div')[0], MySelector) self.assertIsInstance(sel.css('div'), MySelectorList) self.assertIsInstance(sel.css('div')[0], MySelector) def test_replacement_null_char_from_body(self): text = u'\x00

    Grainy

    ' self.assertEqual(u'

    Grainy

    ', self.sscls(text).extract()) def test_remove_selector_list(self): sel = self.sscls(text=u'
    • 1
    • 2
    • 3
    ') sel_list = sel.css('li') sel_list.remove() self.assertIsInstance(sel.css('li'), self.sscls.selectorlist_cls) self.assertEqual(sel.css('li'), []) def test_remove_selector(self): sel = self.sscls(text=u'
    • 1
    • 2
    • 3
    ') sel_list = sel.css('li') sel_list[0].remove() self.assertIsInstance(sel.css('li'), self.sscls.selectorlist_cls) self.assertEqual(sel.css('li::text').getall(), ['2', '3']) def test_remove_pseudo_element_selector_list(self): sel = self.sscls(text=u'
    • 1
    • 2
    • 3
    ') sel_list = sel.css('li::text') self.assertEqual(sel_list.getall(), ['1', '2', '3']) with self.assertRaises(CannotRemoveElementWithoutRoot): sel_list.remove() self.assertIsInstance(sel.css('li'), self.sscls.selectorlist_cls) self.assertEqual(sel.css('li::text').getall(), ['1', '2', '3']) def test_remove_pseudo_element_selector(self): sel = self.sscls(text=u'
    • 1
    • 2
    • 3
    ') sel_list = sel.css('li::text') self.assertEqual(sel_list.getall(), ['1', '2', '3']) with self.assertRaises(CannotRemoveElementWithoutRoot): sel_list[0].remove() self.assertIsInstance(sel.css('li'), self.sscls.selectorlist_cls) self.assertEqual(sel.css('li::text').getall(), ['1', '2', '3']) def test_remove_root_element_selector(self): sel = self.sscls(text=u'
    • 1
    • 2
    • 3
    ') sel_list = sel.css('li::text') self.assertEqual(sel_list.getall(), ['1', '2', '3']) with self.assertRaises(CannotRemoveElementWithoutParent): sel.remove() with self.assertRaises(CannotRemoveElementWithoutParent): sel.css('html').remove() self.assertIsInstance(sel.css('li'), self.sscls.selectorlist_cls) self.assertEqual(sel.css('li::text').getall(), ['1', '2', '3']) sel.css('body').remove() self.assertEqual(sel.get(), '') class ExsltTestCase(unittest.TestCase): sscls = Selector def test_regexp(self): """EXSLT regular expression tests""" body = u"""

    """ sel = self.sscls(text=body) # re:test() self.assertEqual( sel.xpath( '//input[re:test(@name, "[A-Z]+", "i")]').extract(), [x.extract() for x in sel.xpath('//input[re:test(@name, "[A-Z]+", "i")]')]) self.assertEqual( [x.extract() for x in sel.xpath( r'//a[re:test(@href, "\.html$")]/text()')], [u'first link', u'second link']) self.assertEqual( [x.extract() for x in sel.xpath( '//a[re:test(@href, "first")]/text()')], [u'first link']) self.assertEqual( [x.extract() for x in sel.xpath( '//a[re:test(@href, "second")]/text()')], [u'second link']) # re:match() is rather special: it returns a node-set of nodes # [u'http://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.xml', # u'http', # u'www.bayes.co.uk', # u'', # u'/xml/index.xml?/xml/utils/rechecker.xml'] self.assertEqual( sel.xpath(r're:match(//a[re:test(@href, "\.xml$")]/@href,' r'"(\w+):\/\/([^/:]+)(:\d*)?([^# ]*)")/text()').extract(), [u'http://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.xml', u'http', u'www.bayes.co.uk', u'', u'/xml/index.xml?/xml/utils/rechecker.xml']) # re:replace() self.assertEqual( sel.xpath(r're:replace(//a[re:test(@href, "\.xml$")]/@href,' r'"(\w+)://(.+)(\.xml)", "","https://\2.html")').extract(), [u'https://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.html']) def test_set(self): """EXSLT set manipulation tests""" # microdata example from http://schema.org/Event body = u""" """ sel = self.sscls(text=body) self.assertEqual( sel.xpath('''//div[@itemtype="http://schema.org/Event"] //@itemprop''').extract(), [u'url', u'name', u'startDate', u'location', u'url', u'address', u'addressLocality', u'addressRegion', u'offers', u'lowPrice', u'offerCount'] ) self.assertEqual(sel.xpath(''' set:difference(//div[@itemtype="http://schema.org/Event"] //@itemprop, //div[@itemtype="http://schema.org/Event"] //*[@itemscope]/*/@itemprop)''').extract(), [u'url', u'name', u'startDate', u'location', u'offers'])