urllib库解析

Urllib库详解

什么是Urllib:

Python内置的HTTP请求库

  • urllib.request 请求模块
  • urllib.error 异常处理模块
  • urllib.parse url 解析模块

urlopen

urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)

  • url 传入url
  • data 用于POST提交数据
  • timeout 设置超时时间（单位：秒），超过该时间仍未得到响应则抛出异常
import urllib.request
response = urllib.request.urlopen('http://www.baidu.com')
print(response.read().decode('utf-8'))
import urllib.parse
import urllib.request
data = bytes(urllib.parse.urlencode({'word': 'hello'}), encoding='utf8')
response = urllib.request.urlopen('http://httpbin.org/post', data=data)
print(response.read())
import socket
import urllib.request
import urllib.error
try:
    response = urllib.request.urlopen('http://httpbin.org/get', timeout=0.1)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')

响应

响应类型

import urllib.request
response = urllib.request.urlopen('https://www.python.org')
print(type(response))

状态码、响应头

import urllib.request
response = urllib.request.urlopen('https://www.python.org')
print(response.status)
print(response.getheaders())
print(response.getheader('Server'))
import urllib.request
response = urllib.request.urlopen('https://www.python.org')
print(response.read().decode('utf-8'))

Request

import urllib.request
request = urllib.request.Request('https://python.org')
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))
from urllib import request, parse
url = 'http://httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    'Host': 'httpbin.org'
}
dict = {
    'name': 'Germey'
}
data = bytes(parse.urlencode(dict), encoding='utf8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
输出:
{
  "args": {},
  "data": "",
  "files": {},
  "form": {
    "name": "Germey"
  },
  "headers": {
    "Accept-Encoding": "identity",
    "Connect-Time": "1",
    "Connection": "close",
    "Content-Length": "11",
    "Content-Type": "application/x-www-form-urlencoded",
    "Host": "httpbin.org",
    "Total-Route-Time": "0",
    "User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)",
    "Via": "1.1 vegur",
    "X-Request-Id": "f96e736e-0b8a-4ab4-9dcc-a970fcd2fbbf"
  },
  "json": null,
  "origin": "219.238.82.169",
  "url": "http://httpbin.org/post"
}
from urllib import request, parse
url = 'http://httpbin.org/post'
dict = {
    'name': 'Germey'
}
data = bytes(parse.urlencode(dict), encoding='utf8')
req = request.Request(url=url, data=data, method='POST')
req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
输出:
{
  "args": {},
  "data": "",
  "files": {},
  "form": {
    "name": "Germey"
  },
  "headers": {
    "Accept-Encoding": "identity",
    "Connect-Time": "0",
    "Connection": "close",
    "Content-Length": "11",
    "Content-Type": "application/x-www-form-urlencoded",
    "Host": "httpbin.org",
    "Total-Route-Time": "0",
    "User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)",
    "Via": "1.1 vegur",
    "X-Request-Id": "a624bcaa-3581-4b93-84b0-037940338e71"
  },
  "json": null,
  "origin": "219.238.82.169",
  "url": "http://httpbin.org/post"
}

Handler

代理

import urllib.request
proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://127.0.0.1:9743',
    'https': 'https://127.0.0.1:9743'
})
opener = urllib.request.build_opener(proxy_handler)
response = opener.open('http://httpbin.org/get')
print(response.read())

Cookie

import http.cookiejar, urllib.request
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
for item in cookie:
    print(item.name+"="+item.value)
输出:
BAIDUID=E77BF84491E332F6F8F1D451AD0063D3:FG=1
BIDUPSID=E77BF84491E332F6F8F1D451AD0063D3
H_PS_PSSID=1466_21127_22075
PSTM=1490198051
BDSVRTM=0
BD_HOME=0

把cookie保存到文件中

import http.cookiejar, urllib.request
filename = "cookie.txt"
cookie = http.cookiejar.MozillaCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)

从cookie保存文件中读取cookie信息

import http.cookiejar, urllib.request
filename = 'cookie.txt'
cookie = http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)
import http.cookiejar, urllib.request
cookie = http.cookiejar.LWPCookieJar()
cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))

异常处理

from urllib import request, error
try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.URLError as e:
    print(e.reason)
from urllib import request, error
try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.HTTPError as e:
    print(e.reason, e.code, e.headers, sep='\n')
except error.URLError as e:
    print(e.reason)
else:
    print('Request Successfully')
输出:
Not Found
404
Server: nginx/1.10.1
Date: Wed, 22 Mar 2017 15:59:55 GMT
Content-Type: text/html; charset=UTF-8
Transfer-Encoding: chunked
Connection: close
Vary: Cookie
Expires: Wed, 11 Jan 1984 05:00:00 GMT
Cache-Control: no-cache, must-revalidate, max-age=0
Link: <http://cuiqingcai.com/wp-json/>; rel="https://api.w.org/"
import socket
import urllib.request
import urllib.error
try:
    response = urllib.request.urlopen('https://www.baidu.com', timeout=0.01)
except urllib.error.URLError as e:
    print(type(e.reason))
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
输出:
<class 'socket.timeout'>
TIME OUT

URL解析

urlparse

urllib.parse.urlparse(urlstring, scheme='', allow_fragments=True)

from urllib.parse import urlparse
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(type(result), result)
输出:<class 'urllib.parse.ParseResult'> ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')
from urllib.parse import urlparse
result = urlparse('www.baidu.com/index.html;user?id=5#comment', scheme='https')
print(result)
输出:ParseResult(scheme='https', netloc='', path='www.baidu.com/index.html', params='user', query='id=5', fragment='comment')
from urllib.parse import urlparse
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment', allow_fragments=False)
print(result)
输出:ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5#comment', fragment='')
from urllib.parse import urlparse
result = urlparse('http://www.baidu.com/index.html#comment', allow_fragments=False)
print(result)
输出:ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html#comment', params='', query='', fragment='')

urlunparse

from urllib.parse import urlunparse
data = ['http', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']
print(urlunparse(data))
输出:http://www.baidu.com/index.html;user?a=6#comment

urljoin

from urllib.parse import urljoin
print(urljoin('http://www.baidu.com', 'FAQ.html'))
print(urljoin('http://www.baidu.com', 'https://cuiqingcai.com/FAQ.html'))
print(urljoin('http://www.baidu.com/about.html', 'https://cuiqingcai.com/FAQ.html'))
print(urljoin('http://www.baidu.com/about.html', 'https://cuiqingcai.com/FAQ.html?question=2'))
print(urljoin('http://www.baidu.com?wd=abc', 'https://cuiqingcai.com/index.php'))
print(urljoin('http://www.baidu.com', '?category=2#comment'))
print(urljoin('www.baidu.com', '?category=2#comment'))
print(urljoin('www.baidu.com#comment', '?category=2'))
输出:
http://www.baidu.com/FAQ.html
https://cuiqingcai.com/FAQ.html
https://cuiqingcai.com/FAQ.html
https://cuiqingcai.com/FAQ.html?question=2
https://cuiqingcai.com/index.php
http://www.baidu.com?category=2#comment
www.baidu.com?category=2#comment
www.baidu.com?category=2

urlencode

from urllib.parse import urlencode
params = {
    'name': 'germey',
    'age': 22
}
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(params)
print(url)
输出:http://www.baidu.com?name=germey&age=22

相关文章