1. The four modules of urllib:
request: simulates sending a request
error: exception handling; catches errors raised while making a request
parse: provides methods for processing URLs
robotparser: parses a site's robots.txt file to determine which pages may be crawled
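As a quick illustration of how the first three modules fit together (the query parameter here is arbitrary; robotparser is covered in section 5):
import urllib.error
import urllib.parse
import urllib.request
query = urllib.parse.urlencode({'q': 'python'})  # parse: build a query string
try:
    # request: send the request; error: catch anything that goes wrong
    response = urllib.request.urlopen('https://httpbin.org/get?' + query)
    print(response.status)
except urllib.error.URLError as e:
    print(e.reason)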
2. Sending requests
- urlopen
import urllib.request
response = urllib.request.urlopen('https://www.python.org')
# print(response.read().decode('utf-8'))
print(type(response))  # returns an HTTPResponse object
print(response.status)
print(response.getheaders())
print(response.getheader('server'))
The data parameter: when supplied, it must first be converted to the bytes type
import urllib.parse
import urllib.request
# urlencode converts a dict into a URL-encoded query string;
# the first argument of bytes() must be a str
data = bytes(urllib.parse.urlencode({'world': 'hello'}), encoding='utf-8')
response = urllib.request.urlopen('https://httpbin.org/post', data=data)
print(response.read())
The timeout parameter sets a timeout in seconds; if the request takes longer, an exception is raised
import socket
import urllib.error
import urllib.request
try:
    response = urllib.request.urlopen('https://httpbin.org/get', timeout=0.1)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
else:
    print(response.read())
- The Request class
import urllib.request
# Constructor:
# class urllib.request.Request(url, data=None, headers={}, origin_req_host=None, unverifiable=False, method=None)
# url: required; data: optional, must be bytes
# headers: a dict; set User-Agent here to masquerade as a browser
# origin_req_host: host name or IP of the requesting party
# method: the HTTP method to use, e.g. GET, POST, PUT
request = urllib.request.Request('https://python.org')
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))
from urllib import request, parse
url = 'https://www.httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    'Host': 'httpbin.org'
}
params = {'name': 'Tom'}
data = bytes(parse.urlencode(params), encoding='utf8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
Advanced usage
The BaseHandler class in the urllib.request module handles things such as cookies and proxies; it is the parent class of all other handlers.
Subclasses:
HTTPDefaultErrorHandler: handles HTTP response errors, which are raised as HTTPError exceptions
HTTPRedirectHandler: handles redirects
HTTPCookieProcessor: handles cookies
ProxyHandler: sets a proxy
HTTPPasswordMgr: manages passwords; maintains a table of usernames and passwords
HTTPBasicAuthHandler: manages authentication; use it when opening a link requires authentication
Handlers are used to build an Opener, a lower-level class than urlopen, which enables more advanced functionality.
# Skip the pop-up authentication dialog
from urllib.request import HTTPPasswordMgrWithDefaultRealm, HTTPBasicAuthHandler, build_opener
from urllib.error import URLError
username = 'username'
password = 'password'
url = 'http://116.62.203.42/empManagement/login.php'
p = HTTPPasswordMgrWithDefaultRealm()
p.add_password(None, url, username, password)  # realm None matches any realm
auth_handler = HTTPBasicAuthHandler(p)
opener = build_opener(auth_handler)
try:
    result = opener.open(url)
    html = result.read().decode('utf-8')
    print(html)
except URLError as e:
    print(e.reason)
# Proxies
from urllib.request import ProxyHandler, build_opener
from urllib.error import URLError
# assumes a local proxy server running on port 9743
proxy_handler = ProxyHandler({
    'http': 'http://127.0.0.1:9743',
    'https': 'https://127.0.0.1:9743'
})
opener = build_opener(proxy_handler)
try:
    result = opener.open('https://www.baidu.com')
    html = result.read().decode('utf-8')
    print(html)
except URLError as e:
    print(e.reason)
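An opener built this way has to be invoked explicitly via opener.open(). If you want a plain urlopen() call to also go through the proxy, install_opener() registers the opener as the global default; a brief sketch reusing the opener from above:
import urllib.request
urllib.request.install_opener(opener)  # 'opener' is the proxy opener built above
response = urllib.request.urlopen('https://www.baidu.com')  # now routed through the proxy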
# Cookies: grab Baidu's cookies
import http.cookiejar, urllib.request
cookie = http.cookiejar.CookieJar()  # declare a CookieJar object
handler = urllib.request.HTTPCookieProcessor(cookie)  # build a handler with HTTPCookieProcessor
opener = urllib.request.build_opener(handler)
response = opener.open('https://www.baidu.com')
for item in cookie:
    print(item.name + ' = ' + item.value)
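A CookieJar only keeps cookies in memory. To persist them, http.cookiejar also provides MozillaCookieJar (and LWPCookieJar), which can save cookies to a file; a minimal sketch, with the filename chosen arbitrarily:
import http.cookiejar, urllib.request
filename = 'cookies.txt'  # arbitrary output path
cookie = http.cookiejar.MozillaCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('https://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)  # also write session/expired cookies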
3. Handling exceptions
URLError
The URLError class lives in the urllib.error module; any exception raised by the request module can be handled by catching this class
from urllib import request, error
try:
    response = request.urlopen('http://116.62.203.42/index.php')
except error.URLError as e:
    print(e.reason)
HTTPError, a subclass of URLError, deals specifically with HTTP request errors
Three attributes:
code: returns the HTTP status code, e.g. 404 or 500
reason: returns the cause of the error
headers: returns the response headers
from urllib import request, error
try:
    response = request.urlopen('http://116.62.203.42/index.php')
except error.HTTPError as e:
    print(e.reason, e.code, e.headers, sep='\n')
except error.URLError as e:
    print(e.reason)
else:
    print('Request Successfully')
4. Parsing links
- urlparse() parses a URL and identifies its components
from urllib.parse import urlparse
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(type(result), result)
print(result.scheme)
print(result[0])
- URL construction: urlunparse()
from urllib.parse import urlunparse
# urlunparse expects an iterable of exactly 6 components
data = ['http', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']
print(urlunparse(data))
urlsplit() and urlunsplit() are analogous to urlparse() and urlunparse(), except they work with 5 components instead of 6: params is not split out and stays part of path, as the sketch below shows
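For example, on the same URL used above:
from urllib.parse import urlsplit, urlunsplit
result = urlsplit('http://www.baidu.com/index.html;user?id=5#comment')
print(result)  # SplitResult(scheme='http', netloc='www.baidu.com', path='/index.html;user', query='id=5', fragment='comment')
print(urlunsplit(['http', 'www.baidu.com', 'index.html', 'a=6', 'comment']))  # exactly 5 components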
- urljoin() generates a link
The base link is the first argument and the new link is the second; urljoin() parses the base link's scheme, netloc, and path,
and if the new link is missing any of these three parts, it fills them in from the base
from urllib.parse import urljoin
print(urljoin('http://www.baidu.com', 'FAQ.html'))
print(urljoin('http://www.baidu.com', 'https://www.taobao.com/FAQ.html'))
print(urljoin('http://www.baidu.com/about.html', 'https://www.taobao.com/FAQ.html'))
print(urljoin('http://www.baidu.com?wd=avv', 'https://www.taobao.com/index.html'))
print(urljoin('http://www.baidu.com', '?category=2#comment'))
- urlencode(), used to construct GET request parameters
from urllib.parse import urlencode
params = {  # a dict of parameters
    'name': 'Tom',
    'age': '12'
}
base_url = 'https://www.baidu.com?'
url = base_url + urlencode(params)
print(url)
- parse_qs() deserializes: converts a GET query string back into a dict
from urllib.parse import parse_qs
query = 'name=Tom&age=12'
print(parse_qs(query))
- parse_qsl() converts the parameters into a list of tuples
from urllib.parse import parse_qsl
query = 'name=Tom&age=12'
list1 = parse_qsl(query)
print(list1)
print(list1[0])
- quote() converts content into URL-encoded form, e.g. turning Chinese characters into percent-encoding
from urllib.parse import quote
keyword = '沙雕'
url = 'https://www.baidu.com?wd=' + quote(keyword)
print(url)
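The inverse is unquote(), which decodes percent-encoded sequences back into the original characters:
from urllib.parse import unquote
url = 'https://www.baidu.com?wd=%E6%B2%99%E9%9B%95'
print(unquote(url))  # https://www.baidu.com?wd=沙雕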
5. The robots protocol
The robots protocol (also called the crawler protocol) tells crawlers and search engines which pages may be crawled and which may not. It usually takes the form of a text file named robots.txt placed in the root directory of a website, for example:
https://www.jianshu.com/robots.txt
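For reference, a robots.txt file is a list of User-agent sections with Allow/Disallow rules; a simplified illustrative example (not jianshu's actual file):
User-agent: *
Disallow: /private/
Allow: /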
- robotparser
The robotparser module parses robots.txt. It provides a single class, RobotFileParser, which uses a site's robots.txt file to decide whether a given crawler is allowed to fetch a given page
from urllib.robotparser import RobotFileParser
rp = RobotFileParser()
rp.set_url('https://www.jianshu.com/robots.txt')
rp.read()  # fetch and parse robots.txt; without this call every check returns False
print(rp.can_fetch('*', 'https://www.jianshu.com/p/0826cf4692f9'))
print(rp.can_fetch('*', 'https://www.jianshu.com/p/5f054ad47f50'))
from urllib.robotparser import RobotFileParser
from urllib.request import urlopen
rp = RobotFileParser()
# feed robots.txt to the parser line by line instead of calling read()
rp.parse(urlopen('https://www.jd.com/robots.txt').read().decode('utf-8').split('\n'))
print(rp.can_fetch('*', 'https://search.jd.com/Search?keyword=%E6%99%BA%E8%83%BD%E5%86%B0%E7%AE%B1&enc=utf-8&spm=2.1.1'))
print(rp.can_fetch('*', 'https://mall.jd.com/index-1000000950.html'))
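Besides can_fetch(), RobotFileParser also exposes crawl_delay() and request_rate() (Python 3.6+), which report any Crawl-delay and Request-rate rules for a user agent; a brief sketch:
from urllib.robotparser import RobotFileParser
rp = RobotFileParser('https://www.jd.com/robots.txt')
rp.read()
print(rp.crawl_delay('*'))   # delay in seconds, or None if the file sets none
print(rp.request_rate('*'))  # RequestRate(requests=..., seconds=...) named tuple, or None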