import json

from itemadapter import is_item, ItemAdapter

from scrapy.contracts import Contract
from scrapy.exceptions import ContractFail
from scrapy.http import Request


# contracts
class UrlContract(Contract):
    """Contract to set the url of the request (mandatory).

    Usage::

        @url http://scrapy.org
    """

    name = 'url'

    def adjust_request_args(self, args):
        """Store the first contract argument under ``args['url']``.

        :param args: mapping of request arguments being assembled
        :return: the same mapping, with ``'url'`` set
        """
        # NOTE(review): self.args is presumably populated by Contract.__init__
        # from the text following ``@url`` - confirm against the base class.
        args['url'] = self.args[0]
        return args


class CallbackKeywordArgumentsContract(Contract):
    """Contract to set the keyword arguments for the request.

    The value should be a JSON-encoded dictionary, e.g.::

        @cb_kwargs {"arg1": "some value"}
    """

    name = 'cb_kwargs'

    def adjust_request_args(self, args):
        """Decode the contract arguments as JSON into ``args['cb_kwargs']``.

        :param args: mapping of request arguments being assembled
        :return: the same mapping, with ``'cb_kwargs'`` set
        :raises json.JSONDecodeError: if the joined arguments are not valid JSON
        """
        # The arguments are re-joined with single spaces before decoding, so
        # the JSON document is parsed as one string.
        raw_json = ' '.join(self.args)
        args['cb_kwargs'] = json.loads(raw_json)
        return args


class ReturnsContract(Contract):
    """Contract to check the output of a callback.

    General form::

        @returns request(s)/item(s) [min=1 [max]]

    e.g.::

        @returns request
        @returns request 2
        @returns request 2 10
        @returns request 0 10
    """

    name = 'returns'

    # Maps each accepted object name (singular and plural spellings) to the
    # predicate used to recognise a matching callback output.
    object_type_verifiers = {
        'request': lambda x: isinstance(x, Request),
        'requests': lambda x: isinstance(x, Request),
        'item': is_item,
        'items': is_item,
    }

    def __init__(self, *args, **kwargs):
        """Parse the object type and the optional min/max occurrence bounds.

        :raises ValueError: if the number of contract arguments is not
            1, 2 or 3, or if a bound cannot be converted with ``int()``
        :raises KeyError: if the object type is not one of the keys of
            ``object_type_verifiers``
        """
        super().__init__(*args, **kwargs)

        arg_count = len(self.args)
        if arg_count not in [1, 2, 3]:
            raise ValueError(
                f"Incorrect argument quantity: expected 1, 2 or 3, got {len(self.args)}"
            )

        self.obj_name = self.args[0] or None
        try:
            self.obj_type_verifier = self.object_type_verifiers[self.obj_name]
        except KeyError:
            # Same exception type as the plain dictionary lookup, but with a
            # message that tells the spider author which names are accepted.
            supported = ", ".join(sorted(self.object_type_verifiers))
            raise KeyError(
                f"Unknown object type {self.obj_name!r}, expected one of: {supported}"
            ) from None

        # The lower bound defaults to 1 and the upper bound to infinity when
        # the corresponding argument is omitted.
        self.min_bound = int(self.args[1]) if arg_count > 1 else 1
        self.max_bound = int(self.args[2]) if arg_count > 2 else float('inf')

    def post_process(self, output):
        """Check that the number of matching objects is within the bounds.

        :param output: iterable of objects produced by the callback
        :raises ContractFail: if the count of objects accepted by the
            configured verifier is outside ``[min_bound, max_bound]``
        """
        occurrences = sum(1 for obj in output if self.obj_type_verifier(obj))

        if self.min_bound <= occurrences <= self.max_bound:
            return

        # Report a single number when both bounds coincide, a range otherwise.
        if self.min_bound == self.max_bound:
            expected = self.min_bound
        else:
            expected = f'{self.min_bound}..{self.max_bound}'

        raise ContractFail(f"Returned {occurrences} {self.obj_name}, expected {expected}")


class ScrapesContract(Contract):
    """Contract to check presence of fields in scraped items.

    Usage::

        @scrapes page_name page_body
    """

    name = 'scrapes'

    def post_process(self, output):
        """Fail on the first item that lacks any of the expected fields.

        Objects for which ``is_item`` is false are skipped.

        :param output: iterable of objects produced by the callback
        :raises ContractFail: if an item is missing one or more of the field
            names given as contract arguments
        """
        for obj in output:
            if not is_item(obj):
                continue

            # Wrap the item once instead of once per expected field.
            adapter = ItemAdapter(obj)
            missing = [field for field in self.args if field not in adapter]
            if missing:
                missing_fields = ", ".join(missing)
                raise ContractFail(f"Missing fields: {missing_fields}")