"""
Crawl, parse, and store.
"""
import re
import time
# import ast
from datetime import datetime
from urllib import parse

import requests
from scrapy import Selector

from models import *
# Base URL of the crawled site; relative links found in pages are resolved
# against it with urljoin().
domain = "http://www.91jf.com/"
def write_txt(html_data):
    """Write *html_data* to ``a.txt`` in the working directory (debug helper).

    The file is overwritten on every call.

    Args:
        html_data: Text to store, e.g. the body of a fetched page.
    """
    # A context manager closes the file even if the write fails; UTF-8 is
    # set explicitly so non-ASCII page text does not depend on the platform
    # default encoding.
    with open("a.txt", "w", encoding="utf-8") as f:
        f.write(html_data)
def get_nodes_json():
    """Fetch the home page and return its category-menu links as HTML snippets.

    Returns:
        list[str]: The serialized ``<a href=...>`` elements found under
        ``div.class_child_li``, with ``&amp;`` restored to ``&``. Empty when
        the page contains no such links.
    """
    left_menu_text = requests.get(domain).text
    sel = Selector(text=left_menu_text)
    anchors = sel.xpath("//div[@class='class_child_li']//a[@href]").extract()
    # Serialized markup escapes "&" as "&amp;"; undo that so the query
    # string inside each href can be used as a URL later on.
    return [anchor.replace("&amp;", "&") for anchor in anchors]
# Module-level accumulator: every call to process_nodes_list() appends its
# [url, name] pairs here, so the list grows across calls.
url_list_names = []


def process_nodes_list(nodes_list):
    """Extract an absolute URL and a label from each menu anchor snippet.

    Args:
        nodes_list: Serialized ``<a>`` elements, as produced by
            ``get_nodes_json()``.

    Returns:
        list[list[str]]: The module-level ``url_list_names`` list, extended
        with one ``[url, name]`` pair per anchor that could be parsed.
    """
    for item in nodes_list:
        # Quoted attribute text ending in a digit, i.e. the href value.
        url_match = re.search(r'".*\d"', item)
        # The category label is wrapped in <span>...</span>.
        name_match = re.search(r'<span>.*</span>', item)
        if url_match is None or name_match is None:
            # A malformed menu entry should not abort the whole crawl.
            print("skipping menu node without url or name: {}".format(item))
            continue
        url = parse.urljoin(domain, url_match.group(0).replace('"', ''))
        name = name_match.group(0).replace("<span>", "").replace("</span>", "")
        url_list_names.append([url, name])
    return url_list_names
def get_level1_list(nodes_list):
    """Build the first-page listing URL, sorted by sales, for each menu anchor.

    Args:
        nodes_list: Serialized ``<a>`` elements, as produced by
            ``get_nodes_json()``.

    Returns:
        list[str]: One absolute URL per parsable anchor, with
        ``&okey=salenum&order=desc&page=1`` appended to the anchor's href.
    """
    level1_url = []
    for item in nodes_list:
        # Quoted attribute text ending in a digit, i.e. the href value.
        match = re.search(r'".*\d"', item)
        if match is None:
            # A malformed menu entry should not abort the whole crawl.
            print("skipping menu node without url: {}".format(item))
            continue
        href = match.group(0).replace('"', '')
        level1_url.append(
            parse.urljoin(domain, href + "&okey=salenum&order=desc&page=1")
        )
    return level1_url
def get_last_urls():
    """Crawl every category listing and return the product URLs visited.

    Each listing URL is handed to ``parse_data_last()``, which fetches and
    stores the products it finds and returns their URLs.

    Returns:
        list[str]: Product URLs collected from all category listings.
    """
    nodes_list = get_nodes_json()
    # Called for its side effect of recording [url, name] pairs in the
    # module-level ``url_list_names``; the return value is not needed here.
    process_nodes_list(nodes_list)

    url_list = []
    for url in get_level1_list(nodes_list):
        # Previously the result was discarded, so this function always
        # returned an empty list.
        url_list.extend(parse_data_last(url))
    return url_list
def parse_product(url):
    """Scrape a product listing, page by page, and store each priced product.

    For every listed item with a numeric price and at least one sale, a
    ``Product`` row is saved. Pagination continues while the last item
    parsed on a page still has sales (callers are expected to request the
    listing sorted by sales, descending) and a not-yet-visited next link
    exists.

    Args:
        url: Absolute URL of the first listing page.
    """
    # Pages are walked iteratively so long listings cannot exhaust the
    # recursion limit; ``seen`` stops a pagination link that points back to
    # an already visited page from looping forever.
    seen = set()
    while url and url not in seen:
        seen.add(url)
        res_text = requests.get(url).text
        print(url)
        sel = Selector(text=res_text)
        res_li = sel.xpath(
            "//div[@class='pro_list_div g-clearfix c']/ul"
            "//li[@class='goods_offset']"
        )
        flag_num = 0
        for item in res_li:
            # Product name
            name = ''.join(
                item.xpath("./div[@class='row row-2 title']/a/text()").extract()
            )
            # Displayed price
            price = ''.join(
                item.xpath(
                    './div[@id="goods_detail_b"]/div[@class="row row-1"]'
                    '/div[@class="g_price fm2"]/strong/text()'
                ).extract()
            )
            try:
                price = float(price)
            except ValueError:
                # Price is hidden (members only / ask the merchant).
                print("价格会员可见|价格请咨询商家")
                continue

            # Sales count, rendered as "销量:<n>"
            sales_text = ''.join(
                item.xpath("./div[@id='goods_detail_b']/div[2]/p[1]/text()").extract()
            )
            try:
                sales_num = int(sales_text.split('销量:')[1])
            except (IndexError, ValueError):
                print("skipping item without a readable sales count: {}".format(name))
                continue
            flag_num = sales_num
            if sales_num < 1:
                continue

            # Merchant, main products and merchant location
            merchant = ''.join(
                item.xpath("./div[@id='goods_detail_b']/div[2]/p[2]/text()").extract()
            )
            main_Products = ''.join(
                item.xpath("./div[@id='goods_detail_b']/div[2]/p[3]/text()").extract()
            )
            merchant_Place = ''.join(
                item.xpath("./div[@id='goods_detail_b']/div[2]/p[4]/text()").extract()
            )

            product = Product()
            product.name = name
            product.price = price
            product.sales_num = sales_num
            product.merchant = merchant
            product.main_Products = main_Products
            product.merchant_Place = merchant_Place

            # NOTE(review): update-vs-force_insert keyed on the name suggests
            # ``name`` is the model's primary key -- confirm against models.
            existed_name = Product.select().where(Product.name == product.name)
            if existed_name:
                product.save()
            else:
                product.save(force_insert=True)

        next_page = sel.xpath("//*[@class='pagination2']/a[@href]").extract()
        url = None
        if len(next_page) > 2 and flag_num > 0:
            match = re.search(r'".*\d"', next_page[-1])
            if match is not None:
                # Serialized markup escapes "&" as "&amp;"; restore it.
                href = match.group().replace("&amp;", "&").replace('"', '')
                url = parse.urljoin(domain, href)
def parse_data_last(url):
    """Walk a product listing, fetch every product on it, and return the URLs.

    For each listed item the product link is extracted, printed, passed to
    ``parse_product_data()`` and collected. Pagination continues while the
    last item parsed on a page still has sales (the listing is expected to
    be sorted by sales, descending) and a not-yet-visited next link exists.

    Args:
        url: Absolute URL of the first listing page.

    Returns:
        list[str]: Product URLs from all pages visited, in visiting order.
    """
    url_list = []
    # Pages are walked iteratively so that URLs from later pages are kept
    # (the recursive version dropped them) and long listings cannot exhaust
    # the recursion limit. ``seen`` guards against pagination loops.
    seen = set()
    while url and url not in seen:
        seen.add(url)
        flag_num = 0
        res_text = requests.get(url).text
        sel = Selector(text=res_text)
        res_li = sel.xpath(
            "//div[@class='pro_list_div g-clearfix c']/ul"
            "//li[@class='goods_offset']"
        )
        for item in res_li:
            # Sales count, rendered as "销量:<n>"
            sales_text = ''.join(
                item.xpath("./div[@id='goods_detail_b']/div[2]/p[1]/text()").extract()
            )
            try:
                flag_num = int(sales_text.split('销量:')[1])
            except (IndexError, ValueError):
                print("skipping item without a readable sales count")
                continue

            anchors = item.xpath("./div[@class='pro_pic_box']/a").extract()
            match = re.search(r'".*\d"', anchors[0]) if anchors else None
            if match is None:
                print("skipping item without a product link")
                continue
            # Serialized markup escapes "&" as "&amp;"; restore it.
            href = match.group().replace("&amp;", "&").replace('"', '')
            # Link of a single product taken from the sales-sorted listing.
            data_url = parse.urljoin(domain, href)
            print("开始获取商品:{}".format(data_url))
            parse_product_data(data_url)
            url_list.append(data_url)

        # Move on to the next listing page, if any.
        next_page = sel.xpath("//*[@class='pagination2']/a[@href]").extract()
        url = None
        if len(next_page) > 2 and flag_num > 0:
            match = re.search(r'".*\d"', next_page[-1])
            if match is not None:
                href = match.group().replace("&amp;", "&").replace('"', '')
                url = parse.urljoin(domain, href)
    return url_list
def parse_product_data(url):
    """Fetch one product page and store its attributes and sales statistics.

    Nothing is stored when the page lacks the ``is_price`` input, when its
    flag is not ``0`` (price must be requested from the merchant), or when
    no specification price can be read.

    Quantity-based discount tiers are ignored: the stored base price is the
    plain average over the specification rows whose price could be parsed.

    Args:
        url: Absolute product URL containing ``id=<product id>``.
    """
    # The product id is needed for the AJAX sales request below.
    parts = url.split('id=')
    if len(parts) < 2:
        print("product url has no id parameter: {}".format(url))
        return
    product_id = parts[1]

    res_text = requests.get(url).text
    sel = Selector(text=res_text)

    # The first digit in the serialized is_price input tells whether the
    # price is public.
    price_flags = sel.xpath("//input[contains(@id,'is_price')]").extract()
    print(price_flags)
    if len(price_flags) < 1:
        print("页面数据为空")
        # Previously execution fell through and crashed on price_flags[0].
        return
    is_value = re.search(r'\d', price_flags[0])
    if is_value is None or is_value.group() != '0':
        # Anything but 0 means the price must be requested from the
        # merchant, so there is nothing to store.
        return

    rows = sel.xpath(
        "//div[contains(@class,'show_all')]"
        "/table[contains(@class,'goods_spec_list')]//tr"
    )
    prices = []
    for row in rows:
        inputs = row.xpath("./input[3]").extract()
        if not inputs:
            continue
        value_attr = re.search(r'value=".*"', inputs[0])
        number = re.search(r'\d.*\d', value_attr.group()) if value_attr else None
        if number is None:
            continue
        try:
            prices.append(float(number.group()))
        except ValueError:
            continue
    if not prices:
        # Avoids the division by zero the plain average would hit.
        print("no specification price found for product: {}".format(url))
        return
    # Base price of the product.
    price_base = sum(prices) / len(prices)

    # Product description
    attributes_list = sel.xpath(
        "//span[contains(@class,'attributes-list')]//li/text()"
    ).extract()
    str_attributes = ' '.join(attributes_list)
    # NOTE(review): the original literal here was lost to HTML rendering; a
    # non-breaking space (decoded "&nbsp;") is assumed -- confirm.
    str_attributes = str_attributes.replace("\xa0", " ")

    # Request the purchase statistics for this product.
    url_sales = parse.urljoin(domain, 'default.php?act=evallist')
    data = {
        'id': product_id,
        'page': '0',
        'info_type': 'sale'
    }
    # Decode the JSON body once instead of once per field.
    stats = requests.post(url_sales, data=data).json()
    buyer_num = stats.get("member")  # number of buyers
    sale_num = stats.get('num')  # units sold
    buyer_rate = stats.get('re_buyer_rate')  # repurchase rate
    product_id = int(product_id)

    product_attributes = Product_attributes()
    product_attributes.product_id = product_id
    product_attributes.price_base = price_base
    product_attributes.attributes = str_attributes
    product_attributes.buyer_num = buyer_num
    product_attributes.sale_num = sale_num
    product_attributes.buyer_rate = buyer_rate

    # NOTE(review): update-vs-force_insert keyed on product_id suggests it
    # is the model's primary key -- confirm against models.
    existed_id = Product_attributes.select().where(
        Product_attributes.product_id == product_id
    )
    if existed_id:
        product_attributes.save()
    else:
        product_attributes.save(force_insert=True)
if __name__ == "__main__":
    # Run the full crawl and report how long it took.
    start_time = datetime.now()
    print(start_time)
    # parse_data_last() already fetches each product's details while it
    # walks the listings, so no second pass over the URLs is needed here.
    last_urls = get_last_urls()
    end_time = datetime.now()
    print(end_time)
    print("一共使用时间:", end_time - start_time)