Scrapy: Crawling Sites That Require Login

The spider below first requests the login page with a cookiejar enabled so it receives a session cookie, then POSTs the credentials to authorize that cookie, and finally crawls the listing and detail pages that require the authenticated session:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @author Funsion Wu

from bs4 import BeautifulSoup
from scrapy.http import Request, FormRequest
from scrapy.spiders import CrawlSpider
from spider_test.items import SpiderTestItem
from spider_test import settings


class ScrapyTestSpider(CrawlSpider):
    name = "spider_test"
    allowed_domains = [settings.SPIDER_DOMAIN]

    def start_requests(self):
        """Step 1: request the login page with the cookiejar enabled so the
        session cookie is captured, then hand off to the login callback."""
        yield Request('http://%s/admin/account/login.html' % settings.SPIDER_DOMAIN,
                      meta={'cookiejar': 1},
                      callback=self.parse)

    def parse(self, response):
        """Step 2: POST the login form, carrying the cookie from step 1,
        so the server authorizes the session."""
        data = dict(username="xiaoming",  # account field of the login form
                    password="888888")    # password field of the login form
        print('Logging in...')
        yield FormRequest(url='http://%s/admin/account/dologin.html' % settings.SPIDER_DOMAIN,  # the real POST endpoint
                          meta={'cookiejar': 1},
                          formdata=data,
                          callback=self.jump_office_list)

    def jump_office_list(self, response):
        """Step 3: with the authorized cookie, request a page that requires login."""
        print('Requesting a page that requires login...')
        yield Request('http://%s/admin/office/getofficelist.html' % settings.SPIDER_DOMAIN,
                      meta={'cookiejar': 1},
                      callback=self.parser_office_list)

    def parser_office_list(self, response):
        """Follow the pagination links, then each office detail link."""
        soup = BeautifulSoup(response.body, 'html.parser')
        page_list = soup.find(attrs={'class': 'pagination'}).find_all('a')
        if page_list:
            for page in page_list:
                page_url = 'http://%s%s' % (settings.SPIDER_DOMAIN, page.get('href'))
                yield Request(page_url, meta={'cookiejar': 1}, callback=self.parser_office_list)
        office_list = soup.find_all('a', attrs={'class': 'ui-office-list'})
        if office_list:
            for office in office_list:
                office_url = 'http://%s%s' % (settings.SPIDER_DOMAIN, office.attrs['href'])
                yield Request(office_url, meta={'cookiejar': 1}, callback=self.parse_article)

    def parse_article(self, response):
        """Extract the detail page into an item."""
        test_item = SpiderTestItem()
        soup = BeautifulSoup(response.body, 'html.parser')
        container = soup.find('table', attrs={'class': 'index-statistics-table'})
        test_item['source_url'] = response.url
        test_item['title'] = soup.title.get_text()
        test_item['article_content'] = container.prettify()
        return test_item
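One caveat: if the login form carries hidden inputs (a CSRF token, for example), hand-building the POST as above will miss them. Scrapy's FormRequest.from_response helper builds the request from the login page itself and submits hidden fields automatically. A minimal sketch of an alternative parse method for the spider above, assuming the login page contains a single form:

    def parse(self, response):
        # Alternative login step: let Scrapy read the <form> out of the login
        # page so hidden inputs (e.g. a CSRF token) are included in the POST.
        yield FormRequest.from_response(
            response,
            formdata={'username': 'xiaoming', 'password': '888888'},
            meta={'cookiejar': 1},
            callback=self.jump_office_list,
        )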

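The spider imports SpiderTestItem from spider_test.items. The field names below are taken from parse_article; the rest is a minimal sketch of what that items.py would contain:

# spider_test/items.py -- minimal sketch; field names inferred from parse_article above
import scrapy

class SpiderTestItem(scrapy.Item):
    source_url = scrapy.Field()       # URL of the crawled detail page
    title = scrapy.Field()            # text of the page <title>
    article_content = scrapy.Field()  # prettified HTML of the content table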

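Likewise, settings.SPIDER_DOMAIN must be defined in the project settings, and meta={'cookiejar': 1} only takes effect while Scrapy's cookie middleware is enabled (it is by default). A sketch of the relevant settings.py entries, with a placeholder domain:

# spider_test/settings.py -- relevant entries only; the domain value is a placeholder
SPIDER_DOMAIN = 'www.example.com'  # hypothetical: replace with the target site's domain

COOKIES_ENABLED = True   # Scrapy's default; required for meta={'cookiejar': 1} to work
# COOKIES_DEBUG = True   # optional: log Cookie/Set-Cookie headers while debugging the login

With both files in place, the spider runs as usual with scrapy crawl spider_test.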