使用 Scrapy 爬取小程序所有教程

settings.py:

# Scrapy settings for the wxapp project.
from fake_useragent import UserAgent

BOT_NAME = "wxapp"

SPIDER_MODULES = ["wxapp.spiders"]
NEWSPIDER_MODULE = "wxapp.spiders"

# Do not consult robots.txt before requesting pages.
ROBOTSTXT_OBEY = False

# Delay between consecutive requests to the same site.
DOWNLOAD_DELAY = 1

DEFAULT_REQUEST_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    # NOTE: evaluated once when this module is imported, so a single random
    # User-Agent is picked per crawl run, not per request.
    "User-Agent": str(UserAgent().random),
}

# Enable the pipeline that writes scraped items to disk.
ITEM_PIPELINES = {
    "wxapp.pipelines.WxappPipeline": 300,
}

 

wxapp_spider.py:

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from wxapp.items import WxappItem


class WxappSpiderSpider(CrawlSpider):
    """Crawl the tutorial listing pages and scrape every linked article.

    The first rule follows listing pagination links without parsing them;
    the second rule sends article pages to :meth:`parse_detail` and does not
    follow links found on them.
    """

    name = "wxapp_spider"
    allowed_domains = ["wxapp-union.com"]
    start_urls = [
        "https://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1",
    ]

    rules = (
        # Pagination links of the listing (category 2): follow only.
        Rule(
            LinkExtractor(allow=r".+mod=list&catid=2&page=\d"),
            follow=True,
        ),
        # Article detail pages: parse, but do not follow further links.
        # The dot before "html" is escaped so it only matches a literal ".".
        Rule(
            LinkExtractor(allow=r".+article-.+\.html"),
            callback="parse_detail",
            follow=False,
        ),
    )

    def parse_detail(self, response):
        """Extract one article from a detail page.

        Args:
            response: The downloaded article page.

        Yields:
            WxappItem: An item with ``title``, ``author``, ``time`` and
            ``content`` fields. ``title``, ``author`` and ``time`` are
            ``None`` when the corresponding node is absent from the page.
        """
        title = response.xpath("//h1[@class='ph']/text()").get()

        # Author name and publication time live in the same <p> element.
        author_p = response.xpath("//p[@class='authors']")
        author = author_p.xpath(".//a/text()").get()
        # Named pub_time locally to avoid shadowing the stdlib ``time`` module.
        pub_time = author_p.xpath(".//span[@class='time']/text()").get()

        # Concatenate every text node under the article body into one string.
        fragments = response.xpath("//td[@id='article_content']//text()").getall()
        article = "".join(fragments).strip()

        print(title, author, pub_time)
        print(article)

        yield WxappItem(
            title=title,
            author=author,
            time=pub_time,
            content=article,
        )

items.py:

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class WxappItem(scrapy.Item):
    """Container for one scraped tutorial article."""

    # Article headline.
    title = scrapy.Field()
    # Name of the article's author.
    author = scrapy.Field()
    # Publication time, as the text shown on the page.
    time = scrapy.Field()
    # Full article body as a single string.
    content = scrapy.Field()

pipelines.py:

爬取小程序所有教程scrapy
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.exporters import JsonLinesItemExporter


class WxappPipeline:
    """Write every scraped item to ``wxjc.json`` as one JSON object per line."""

    def __init__(self):
        """Open the output file and start the JSON-lines exporter."""
        # The exporter writes encoded bytes, so the file is opened in binary mode.
        self.fp = open("wxjc.json", "wb")
        # ensure_ascii=False keeps non-ASCII text readable instead of \uXXXX escapes.
        self.export = JsonLinesItemExporter(
            self.fp,
            ensure_ascii=False,
            encoding="utf-8",
        )
        self.export.start_exporting()

    def process_item(self, item, spider):
        """Export ``item`` to the output file and pass it on unchanged.

        Args:
            item: The scraped item to export.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines can keep processing it.
        """
        self.export.export_item(item)
        return item

    def close_spider(self, spider):
        """Finish the export and close the output file.

        Args:
            spider: The spider being closed (unused).
        """
        self.export.finish_exporting()
        self.fp.close()

 

相关文章