import pytest
from protego import Protego


@pytest.mark.parametrize(
    'path,user_agent',
    [
        ('/group1', 'hedwig-news'),
        ('/group1', 'hedwig-news2'),
        ('/group2', 'hedwi'),
        ('/group2', 'a'),
        ('/group3', 'hedwig-new'),
        ('/group3', 'hedwig-images'),
    ]
)
def test_user_agent_precedence(path, user_agent):
    """Check which user-agent group is applied to ``user_agent``.

    Each group in the fixture disallows everything except one distinctive
    path, so the single path that can be fetched identifies the group the
    parser selected. ``path`` is the path expected to be allowed.
    """
    # NOTE(review): directive lines are indented inside the literal; this
    # presumes the parser ignores leading whitespace -- confirm.
    robotstxt_content = u"""
    user-agent: hedwig-news
    disallow: /
    allow: /group1

    user-agent: *
    disallow: /
    allow: /group2

    user-agent: hedwig
    disallow: /
    allow: /group3
    """
    rp = Protego.parse(content=robotstxt_content)

    # Find the first group path this user agent may fetch; the ``else``
    # belongs to the ``for`` and only runs when no path was allowed.
    for allowed_path in ('/group1', '/group2', '/group3'):
        if rp.can_fetch(allowed_path, user_agent):
            break
    else:
        allowed_path = None

    assert allowed_path == path


@pytest.mark.parametrize(
    'pattern,path,match',
    [
        # Pattern: /
        ('/', '/harry', True),
        ('/', '/device/time-turner', True),
        ('/', '/hogwards.html', True),

        # Pattern: /*
        ('/*', '/harry', True),
        ('/*', '/device/time-turner', True),
        ('/*', '/hogwards.html', True),

        # Pattern: /phoenix
        ('/phoenix', '/phoenix', True),
        ('/phoenix', '/phoenix.html', True),
        ('/phoenix', '/phoenix/sparky.html', True),
        ('/phoenix', '/phoenixheads', True),
        ('/phoenix', '/phoenixheads/yummy.html', True),
        ('/phoenix', '/phoenix.php?id=anything', True),
        ('/phoenix', '/Phoenix.asp', False),
        ('/phoenix', '/redphoenix', False),
        ('/phoenix', '/?id=phoenix', False),

        # Pattern: /phoenix*
        ('/phoenix*', '/phoenix', True),
        ('/phoenix*', '/phoenix.html', True),
        ('/phoenix*', '/phoenix/sparky.html', True),
        ('/phoenix*', '/phoenixheads', True),
        ('/phoenix*', '/phoenixheads/yummy.html', True),
        ('/phoenix*', '/phoenix.php?id=anything', True),
        ('/phoenix*', '/Phoenix.asp', False),
        ('/phoenix*', '/redphoenix', False),
        ('/phoenix*', '/?id=phoenix', False),

        # Pattern: /phoenix/
        ('/phoenix/', '/phoenix/', True),
        ('/phoenix/', '/phoenix/?id=anything', True),
        ('/phoenix/', '/phoenix/sparky.htm', True),
        ('/phoenix/', '/phoenix', False),
        ('/phoenix/', '/phoenix.html', False),
        ('/phoenix/', '/Phoenix/Sparky.asp', False),

        # Pattern: /*.php
        ('/*.php', '/filename.php', True),
        ('/*.php', '/folder/filename.php', True),
        ('/*.php', '/folder/filename.php?parameters', True),
        ('/*.php', '/folder/any.php.file.html', True),
        ('/*.php', '/filename.php/', True),
        ('/*.php', '/windows.PHP', False),
        ('/*.php', '/', False),
        ('/*.php', '/index?f=filename.php/', True),
        ('/*.php', '/index?php', False),

        # Pattern: /*.php$
        ('/*.php$', '/filename.php', True),
        ('/*.php$', '/folder/filename.php', True),
        ('/*.php$', '/filename.php?parameters', False),
        ('/*.php$', '/filename.php/', False),
        ('/*.php$', '/filename.php5', False),
        ('/*.php$', '/windows.PHP', False),
        ('/*.php$', '/filename?php', False),

        # Pattern: /fish*.php
        ('/fish*.php', '/fish.php', True),
        ('/fish*.php', '/fishheads/catfish.php?parameters', True),
        ('/fish*.php', '/Fish.PHP', False),
    ]
)
def test_path_matching(pattern, path, match):
    """Check whether a ``disallow`` pattern matches a given path.

    ``pattern`` is installed as the only ``disallow`` rule for ``*``;
    ``match`` is True when ``path`` is expected to be blocked by it.
    """
    # NOTE(review): directive lines are indented inside the literal; this
    # presumes the parser ignores leading whitespace -- confirm.
    content = """
    User-Agent: *
    disallow: {}
    """.format(pattern)
    rp = Protego.parse(content)

    # A path matched by the disallow pattern must not be fetchable.
    assert (not rp.can_fetch(path, '*')) == match


@pytest.mark.parametrize(
    'rules,url,allowed',
    [
        ("allow: /p \n disallow: /", "http://example.com/page", True),
        ("allow: /folder \n disallow: /folder", "http://example.com/folder/page", True),
        ("allow: /$ \n disallow: /", "http://example.com/", True),
        ("allow: /$ \n disallow: /", "http://example.com/page.htm", False),
    ]
)
def test_record_precedence(rules, url, allowed):
    """Check the verdict when both ``allow`` and ``disallow`` rules are present.

    ``rules`` is a pair of directives placed under ``User-Agent: *``;
    ``allowed`` is the expected ``can_fetch`` result for ``url``.
    """
    # NOTE(review): directive lines are indented inside the literal; this
    # presumes the parser ignores leading whitespace -- confirm.
    content = """
    User-Agent: *
    {}
    """.format(rules)
    rp = Protego.parse(content)

    assert rp.can_fetch(url, '*') == allowed