# encoding=utf-8 from unittest import TestCase from protego import Protego class TestProtego(TestCase): def test_allowed(self): content = ("User-agent: * \n" "Disallow: /disallowed \n" "Allow: /allowed \n" "Crawl-delay: 10") rp = Protego.parse(content=content) self.assertTrue(rp.can_fetch("https://www.site.local/allowed", "*")) self.assertFalse(rp.can_fetch("https://www.site.local/disallowed", "*")) content = ("User-agent: * \n" "Disallow: /d \n" "Crawl-delay: 10") rp = Protego.parse(content=content) self.assertTrue(rp.can_fetch("https://www.site.local/abc/d", "*")) self.assertFalse(rp.can_fetch("https://www.site.local/disallowed", "*")) def test_malformed_disallow(self): content = ("User-agent: * \n" "Disallow: /one \n" "Dissallow: /two \n" "Dissalow: /three \n" "Disalow: /four \n" "Diasllow: /five \n" "Disallaw: /six \n") rp = Protego.parse(content=content) self.assertFalse(rp.can_fetch("https://www.site.local/one", "*")) self.assertFalse(rp.can_fetch("https://www.site.local/two", "*")) self.assertFalse(rp.can_fetch("https://www.site.local/three", "*")) self.assertFalse(rp.can_fetch("https://www.site.local/four", "*")) self.assertFalse(rp.can_fetch("https://www.site.local/five", "*")) self.assertFalse(rp.can_fetch("https://www.site.local/six", "*")) def test_length_based_precedence(self): content = ("User-agent: * \n" "Disallow: / \n" "Allow: /page") rp = Protego.parse(content=content) self.assertTrue(rp.can_fetch("https://www.site.local/page", "*")) self.assertFalse(rp.can_fetch("https://www.site.local/elsewhere", "*")) content = ("user-agent: FooBot\n" "disallow: /x/page.html\n" "allow: /x/\n") rp = Protego.parse(content=content) self.assertFalse(rp.can_fetch("http://foo.bar/x/page.html", "FooBot")) content = ("user-agent: FooBot\n" "allow: /x/page.html\n" "disallow: /x/\n") rp = Protego.parse(content=content) self.assertTrue(rp.can_fetch("http://foo.bar/x/page.html", "FooBot")) self.assertFalse(rp.can_fetch("http://foo.bar/x/", "FooBot")) # In case of equivalent disallow and allow patterns for the same # user-agent, allow is used. content = ("user-agent: FooBot\n" "disallow: \n" "allow: \n") rp = Protego.parse(content=content) self.assertTrue(rp.can_fetch("http://foo.bar/x/page.html", "FooBot")) content = ("user-agent: FooBot\n" "disallow: /\n" "allow: /\n") rp = Protego.parse(content=content) self.assertTrue(rp.can_fetch("http://foo.bar/x/page.html", "FooBot")) content = ("user-agent: FooBot\n" "disallow: /x\n" "allow: /x/\n") rp = Protego.parse(content=content) self.assertTrue(rp.can_fetch("http://foo.bar/x/", "FooBot")) self.assertFalse(rp.can_fetch("http://foo.bar/x", "FooBot")) content = ("user-agent: FooBot\n" "disallow: /x/page.html\n" "allow: /x/page.html\n") rp = Protego.parse(content=content) self.assertTrue(rp.can_fetch("http://foo.bar/x/page.html", "FooBot")) content = ("user-agent: FooBot\n" "allow: /page\n" "disallow: /*.html\n") rp = Protego.parse(content=content) self.assertTrue(rp.can_fetch("http://foo.bar/page", "FooBot")) self.assertFalse(rp.can_fetch("http://foo.bar/page.html", "FooBot")) content = ("user-agent: FooBot\n" "allow: /x/page.\n" "disallow: /*.html\n") rp = Protego.parse(content=content) # Longest match wins. self.assertTrue(rp.can_fetch("http://foo.bar/x/page.html", "FooBot")) self.assertFalse(rp.can_fetch("http://foo.bar/x/y.html", "FooBot")) content = ("User-agent: *\n" "Disallow: /x/\n" "User-agent: FooBot\n" "Disallow: /y/\n") rp = Protego.parse(content=content) # Most specific group for FooBot allows implicitly /x/page. self.assertTrue(rp.can_fetch("http://foo.bar/x/page", "FooBot")) self.assertFalse(rp.can_fetch("http://foo.bar/y/page", "FooBot")) content = ("user-agent: FooBot\n" "allow: /p\n" "disallow: /\n") rp = Protego.parse(content=content) self.assertTrue(rp.can_fetch("http://example.com/page", "FooBot")) content = ("user-agent: FooBot\n" "allow: /folder\n" "disallow: /folder\n") rp = Protego.parse(content=content) self.assertTrue(rp.can_fetch("http://example.com/folder/page", "FooBot")) content = ("user-agent: FooBot\n" "disallow: /folder\n" "allow: /folder\n") rp = Protego.parse(content=content) self.assertTrue(rp.can_fetch("http://example.com/folder/page", "FooBot")) content = ("user-agent: FooBot\n" "allow: /page\n" "disallow: /*.htm\n") rp = Protego.parse(content=content) self.assertFalse(rp.can_fetch("http://example.com/page.htm", "FooBot")) content = ("user-agent: FooBot\n" "allow: /$\n" "disallow: /\n") rp = Protego.parse(content=content) self.assertTrue(rp.can_fetch("http://example.com/", "FooBot")) self.assertFalse(rp.can_fetch("http://example.com/page.html", "FooBot")) def test_escaped_url(self): content = ("User-agent: * \n" "Disallow: / \n" "Allow: /a%3cd.html") rp = Protego.parse(content=content) self.assertTrue(rp.can_fetch("https://www.site.local/a