先看scrapy-redis源码
class RedisMixin(object):
    """Mixin class to implement reading urls from a redis queue."""

    # Redis key to read start URLs from; may contain a '%(name)s' placeholder
    # that is filled with the spider name in setup_redis().
    redis_key = None
    # Maximum number of messages fetched from redis per idle cycle.
    redis_batch_size = None
    # Encoding used to decode messages coming from redis.
    redis_encoding = None

    # Redis client placeholder, set lazily by setup_redis().
    server = None

    def start_requests(self):
        """Returns a batch of start requests from redis."""
        return self.next_requests()

    def setup_redis(self, crawler=None):
        """Setup redis connection and idle signal.

        This should be called after the spider has set its crawler object.

        Parameters
        ----------
        crawler : Crawler, optional
            Crawler to read settings from; falls back to ``self.crawler``
            for backwards compatibility.

        Raises
        ------
        ValueError
            If no crawler is available, the resolved ``redis_key`` is
            empty, or ``redis_batch_size`` is not an integer.
        """
        if self.server is not None:
            # Already configured; setup is idempotent.
            return

        if crawler is None:
            # We allow optional crawler argument to keep backwards
            # compatibility.
            # XXX: Raise a deprecation warning.
            crawler = getattr(self, 'crawler', None)

        if crawler is None:
            raise ValueError("crawler is required")

        settings = crawler.settings

        if self.redis_key is None:
            self.redis_key = settings.get(
                'REDIS_START_URLS_KEY', defaults.START_URLS_KEY,
            )

        # Interpolate the spider name into the key template.
        self.redis_key = self.redis_key % {'name': self.name}

        if not self.redis_key.strip():
            raise ValueError("redis_key must not be empty")

        if self.redis_batch_size is None:
            # TODO: Deprecate this setting (REDIS_START_URLS_BATCH_SIZE).
            self.redis_batch_size = settings.getint(
                'REDIS_START_URLS_BATCH_SIZE',
                settings.getint('CONCURRENT_REQUESTS'),
            )

        try:
            self.redis_batch_size = int(self.redis_batch_size)
        except (TypeError, ValueError):
            raise ValueError("redis_batch_size must be an integer")

        if self.redis_encoding is None:
            self.redis_encoding = settings.get('REDIS_ENCODING', defaults.REDIS_ENCODING)

        # BUG FIX: the original format string was missing the closing ')'
        # after '%(redis_encoding)s'.
        self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
                         "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s)",
                         self.__dict__)

        self.server = connection.from_settings(crawler.settings)
        # The idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from redis queue.
        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)

    def next_requests(self):
        """Returns a request to be scheduled or none."""
        use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET)
        fetch_one = self.server.spop if use_set else self.server.lpop
        # XXX: Do we need to use a timeout here?
        found = 0
        # TODO: Use redis pipeline execution.
        while found < self.redis_batch_size:
            data = fetch_one(self.redis_key)
            if not data:
                # Queue empty.
                break
            req = self.make_request_from_data(data)
            if req:
                yield req
                found += 1
            else:
                self.logger.debug("Request not made from data: %r", data)

        if found:
            self.logger.debug("Read %s requests from '%s'", found, self.redis_key)

    def make_request_from_data(self, data):
        """Returns a Request instance from data coming from Redis.

        By default, ``data`` is an encoded URL. You can override this method to
        provide your own message decoding.

        Parameters
        ----------
        data : bytes
            Message from redis.

        """
        url = bytes_to_str(data, self.redis_encoding)
        return self.make_requests_from_url(url)

    def schedule_next_requests(self):
        """Schedules a request if available."""
        # TODO: While there is capacity, schedule a batch of redis requests.
        for req in self.next_requests():
            self.crawler.engine.crawl(req, spider=self)

    def spider_idle(self):
        """Schedules a request if available, otherwise waits."""
        # XXX: Handle a sentinel to close the spider.
        self.schedule_next_requests()
        # Raising DontCloseSpider keeps the spider alive so it keeps
        # polling redis for new start URLs.
        raise DontCloseSpider


class RedisSpider(RedisMixin, Spider):
    """Spider that reads urls from redis queue when idle.

    Attributes
    ----------
    redis_key : str (default: REDIS_START_URLS_KEY)
        Redis key where to fetch start URLs from.
    redis_batch_size : int (default: CONCURRENT_REQUESTS)
        Number of messages to fetch from redis on each attempt.
    redis_encoding : str (default: REDIS_ENCODING)
        Encoding to use when decoding messages from redis queue.

    Settings
    --------
    REDIS_START_URLS_KEY : str (default: "<spider.name>:start_urls")
        Default Redis key where to fetch start URLs from.
    REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
        Default number of messages to fetch from redis on each attempt.
    REDIS_START_URLS_AS_SET : bool (default: False)
        Use SET operations to retrieve messages from the redis queue. If False,
        the messages are retrieved using the LPOP command.
    REDIS_ENCODING : str (default: "utf-8")
        Default encoding to use when decoding messages from redis queue.

    """

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        # FIX: first argument of a classmethod is ``cls``, not ``self``.
        obj = super(RedisSpider, cls).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj


class RedisCrawlSpider(RedisMixin, CrawlSpider):
    """Spider that reads urls from redis queue when idle.

    Attributes
    ----------
    redis_key : str (default: REDIS_START_URLS_KEY)
        Redis key where to fetch start URLs from.
    redis_batch_size : int (default: CONCURRENT_REQUESTS)
        Number of messages to fetch from redis on each attempt.
    redis_encoding : str (default: REDIS_ENCODING)
        Encoding to use when decoding messages from redis queue.

    Settings
    --------
    REDIS_START_URLS_KEY : str (default: "<spider.name>:start_urls")
        Default Redis key where to fetch start URLs from.
    REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
        Default number of messages to fetch from redis on each attempt.
    REDIS_START_URLS_AS_SET : bool (default: False)
        Use SET operations to retrieve messages from the redis queue.
        NOTE(review): original docstring said True here, contradicting
        RedisSpider; both read the same defaults.START_URLS_AS_SET.
    REDIS_ENCODING : str (default: "utf-8")
        Default encoding to use when decoding messages from redis queue.

    """

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        # FIX: first argument of a classmethod is ``cls``, not ``self``.
        obj = super(RedisCrawlSpider, cls).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj
仔细看完的话会发现
make_request_from_data(self, data)
这个方法从redis取出的消息构造并返回一个请求实例;默认情况下,data是一个编码后的url
接下来重写一下这个方法直接传入到
self.make_requests_from_url
一个json串就好了
在这个方法里面可以把这个串解析出请求url,或者生成url
代码如下
def make_request_from_data(self, data):
    """Decode a message popped from redis and hand it to the URL builder.

    Parameters
    ----------
    data : bytes
        Message from redis; expected to decode to a JSON object
        containing at least a "url" key.
    """
    company = bytes_to_str(data, self.redis_encoding)
    return self.make_requests_from_url(company)

def make_requests_from_url(self, company):
    """Parse the decoded redis message and build the actual Request.

    ``company`` is a string holding a JSON object; its "url" key is the
    request target and the whole parsed object is forwarded in ``meta``
    under the "data" key.
    """
    import ast
    import json
    # SECURITY FIX: the original called eval() on data pulled from redis,
    # which executes arbitrary code from an untrusted source. Parse as
    # JSON instead (the queue is documented to hold JSON strings), with a
    # safe literal_eval fallback for messages written as Python dict
    # literals (single quotes, etc.).
    try:
        data = json.loads(company)
    except ValueError:
        data = ast.literal_eval(company)
    url = data["url"]
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
    }
    return Request(url, self.parse, meta={"data": data}, dont_filter=True, headers=headers)
值得注意的是
不能在make_request_from_data方法中直接构造并返回Request实例(其他第三方的Request子类同样不行),否则该方法会静默失效——既不执行,也不抛出异常
但是同时重写make_request_from_data和make_requests_from_url方法则可以执行