1. The four modules of urllib:
request: simulates sending a request
error: exception handling; catches errors raised while making a request
parse: provides methods for processing URLs
robotparser: parses a site's robots.txt file to determine which pages may be crawled
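As a quick illustration of how the first three modules fit together (the query parameter here is arbitrary; robotparser is covered in section 5):
import urllib.error
import urllib.parse
import urllib.request
query = urllib.parse.urlencode({'q': 'python'})  # parse: build a query string
try:
    # request: send the request; error: catch anything that goes wrong
    response = urllib.request.urlopen('https://httpbin.org/get?' + query)
    print(response.status)
except urllib.error.URLError as e:
    print(e.reason)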
2. Sending requests
- urlopen
import urllib.request
response = urllib.request.urlopen('https://www.python.org')
# print(response.read().decode('utf-8'))
print(type(response))  # returns an HTTPResponse object
print(response.status)
print(response.getheaders())
print(response.getheader('server'))
The data parameter: when supplied, it must first be converted to the bytes type
import urllib.parse
import urllib.request
# urlencode converts a dict into a URL-encoded query string;
# the first argument of bytes() must be a str
data = bytes(urllib.parse.urlencode({'world': 'hello'}), encoding='utf-8')
response = urllib.request.urlopen('https://httpbin.org/post', data=data)
print(response.read())
The timeout parameter sets a timeout in seconds; if the request takes longer, an exception is raised
import socket
import urllib.error
import urllib.request
try:
    response = urllib.request.urlopen('https://httpbin.org/get', timeout=0.1)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
else:
    print(response.read())
- The Request class
import urllib.request
# Constructor:
# class urllib.request.Request(url, data=None, headers={}, origin_req_host=None, unverifiable=False, method=None)
# url: required; data: optional, must be bytes
# headers: a dict; set User-Agent here to masquerade as a browser
# origin_req_host: host name or IP of the requesting party
# method: the HTTP method to use, e.g. GET, POST, PUT
request = urllib.request.Request('https://python.org')
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))
from urllib import request, parse
url = 'https://www.httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    'Host': 'httpbin.org'
}
params = {'name': 'Tom'}
data = bytes(parse.urlencode(params), encoding='utf8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
Advanced usage
The BaseHandler class in the urllib.request module handles things such as cookies and proxies; it is the parent class of all other handlers.
Subclasses:
HTTPDefaultErrorHandler: handles HTTP response errors, which are raised as HTTPError exceptions
HTTPRedirectHandler: handles redirects
HTTPCookieProcessor: handles cookies
ProxyHandler: sets a proxy
HTTPPasswordMgr: manages passwords; maintains a table of usernames and passwords
HTTPBasicAuthHandler: manages authentication; use it when opening a link requires authentication
Handlers are used to build an Opener, a lower-level class than urlopen, which enables more advanced functionality.
# Skip the pop-up authentication dialog
from urllib.request import HTTPPasswordMgrWithDefaultRealm, HTTPBasicAuthHandler, build_opener
from urllib.error import URLError
username = 'username'
password = 'password'
url = 'http://116.62.203.42/empManagement/login.php'
p = HTTPPasswordMgrWithDefaultRealm()
p.add_password(None, url, username, password)  # realm None matches any realm
auth_handler = HTTPBasicAuthHandler(p)
opener = build_opener(auth_handler)
try:
    result = opener.open(url)
    html = result.read().decode('utf-8')
    print(html)
except URLError as e:
    print(e.reason)
# Proxies
from urllib.request import ProxyHandler, build_opener
from urllib.error import URLError
# assumes a local proxy server running on port 9743
proxy_handler = ProxyHandler({
    'http': 'http://127.0.0.1:9743',
    'https': 'https://127.0.0.1:9743'
})
opener = build_opener(proxy_handler)
try:
    result = opener.open('https://www.baidu.com')
    html = result.read().decode('utf-8')
    print(html)
except URLError as e:
    print(e.reason)
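An opener built this way has to be invoked explicitly via opener.open(). If you want a plain urlopen() call to also go through the proxy, install_opener() registers the opener as the global default; a brief sketch reusing the opener from above:
import urllib.request
urllib.request.install_opener(opener)  # 'opener' is the proxy opener built above
response = urllib.request.urlopen('https://www.baidu.com')  # now routed through the proxy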
# Cookies: grab Baidu's cookies
import http.cookiejar, urllib.request
cookie = http.cookiejar.CookieJar()  # declare a CookieJar object
handler = urllib.request.HTTPCookieProcessor(cookie)  # build a handler with HTTPCookieProcessor
opener = urllib.request.build_opener(handler)
response = opener.open('https://www.baidu.com')
for item in cookie:
    print(item.name + ' = ' + item.value)
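A CookieJar only keeps cookies in memory. To persist them, http.cookiejar also provides MozillaCookieJar (and LWPCookieJar), which can save cookies to a file; a minimal sketch, with the filename chosen arbitrarily:
import http.cookiejar, urllib.request
filename = 'cookies.txt'  # arbitrary output path
cookie = http.cookiejar.MozillaCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('https://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)  # also write session/expired cookies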
3. Handling exceptions
URLError
The URLError class lives in the urllib.error module; any exception raised by the request module can be handled by catching this class
from urllib import request, error
try:
    response = request.urlopen('http://116.62.203.42/index.php')
except error.URLError as e:
    print(e.reason)
HTTPError, a subclass of URLError, deals specifically with HTTP request errors
Three attributes:
code: returns the HTTP status code, e.g. 404 or 500
reason: returns the cause of the error
headers: returns the response headers
from urllib import request, error
try:
    response = request.urlopen('http://116.62.203.42/index.php')
except error.HTTPError as e:
    print(e.reason, e.code, e.headers, sep='\n')
except error.URLError as e:
    print(e.reason)
else:
    print('Request Successfully')
4. Parsing links
- urlparse() parses a URL and identifies its components
from urllib.parse import urlparse
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(type(result), result)
print(result.scheme)
print(result[0])
- URL construction: urlunparse()
from urllib.parse import urlunparse
# urlunparse expects an iterable of exactly 6 components
data = ['http', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']
print(urlunparse(data))
urlsplit() and urlunsplit() are analogous to urlparse() and urlunparse(), except they work with 5 components instead of 6: params is not split out and stays part of path, as the sketch below shows
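For example, on the same URL used above:
from urllib.parse import urlsplit, urlunsplit
result = urlsplit('http://www.baidu.com/index.html;user?id=5#comment')
print(result)  # SplitResult(scheme='http', netloc='www.baidu.com', path='/index.html;user', query='id=5', fragment='comment')
print(urlunsplit(['http', 'www.baidu.com', 'index.html', 'a=6', 'comment']))  # exactly 5 components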
- urljoin() generates a link
The base link is the first argument and the new link is the second; urljoin() parses the base link's scheme, netloc, and path,
and if the new link is missing any of these three parts, it fills them in from the base
from urllib.parse import urljoin
print(urljoin('http://www.baidu.com', 'FAQ.html'))
print(urljoin('http://www.baidu.com', 'https://www.taobao.com/FAQ.html'))
print(urljoin('http://www.baidu.com/about.html', 'https://www.taobao.com/FAQ.html'))
print(urljoin('http://www.baidu.com?wd=avv', 'https://www.taobao.com/index.html'))
print(urljoin('http://www.baidu.com', '?category=2#comment'))
- urlencode(), used to construct GET request parameters
from urllib.parse import urlencode
params = {  # a dict of parameters
    'name': 'Tom',
    'age': '12'
}
base_url = 'https://www.baidu.com?'
url = base_url + urlencode(params)
print(url)
- parse_qs() deserializes: converts a GET query string back into a dict
from urllib.parse import parse_qs
query = 'name=Tom&age=12'
print(parse_qs(query))
- parse_qsl() converts the parameters into a list of tuples
from urllib.parse import parse_qsl
query = 'name=Tom&age=12'
list1 = parse_qsl(query)
print(list1)
print(list1[0])
- quote() converts content into URL-encoded form, e.g. turning Chinese characters into percent-encoding
from urllib.parse import quote
keyword = '沙雕'
url = 'https://www.baidu.com?wd=' + quote(keyword)
print(url)
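The inverse is unquote(), which decodes percent-encoded sequences back into the original characters:
from urllib.parse import unquote
url = 'https://www.baidu.com?wd=%E6%B2%99%E9%9B%95'
print(unquote(url))  # https://www.baidu.com?wd=沙雕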
5. The robots protocol
The robots protocol (also called the crawler protocol) tells crawlers and search engines which pages may be crawled and which may not. It usually takes the form of a text file named robots.txt placed in the root directory of a website, for example:
https://www.jianshu.com/robots.txt
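For reference, a robots.txt file is a list of User-agent sections with Allow/Disallow rules; a simplified illustrative example (not jianshu's actual file):
User-agent: *
Disallow: /private/
Allow: /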
- robotparser
The robotparser module parses robots.txt. It provides a single class, RobotFileParser, which uses a site's robots.txt file to decide whether a given crawler is allowed to fetch a given page
from urllib.robotparser import RobotFileParser
rp = RobotFileParser()
rp.set_url('https://www.jianshu.com/robots.txt')
rp.read()  # fetch and parse robots.txt; without this call every check returns False
print(rp.can_fetch('*', 'https://www.jianshu.com/p/0826cf4692f9'))
print(rp.can_fetch('*', 'https://www.jianshu.com/p/5f054ad47f50'))
from urllib.robotparser import RobotFileParser
from urllib.request import urlopen
rp = RobotFileParser()
# feed robots.txt to the parser line by line instead of calling read()
rp.parse(urlopen('https://www.jd.com/robots.txt').read().decode('utf-8').split('\n'))
print(rp.can_fetch('*', 'https://search.jd.com/Search?keyword=%E6%99%BA%E8%83%BD%E5%86%B0%E7%AE%B1&enc=utf-8&spm=2.1.1'))
print(rp.can_fetch('*', 'https://mall.jd.com/index-1000000950.html'))
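Besides can_fetch(), RobotFileParser also exposes crawl_delay() and request_rate() (Python 3.6+), which report any Crawl-delay and Request-rate rules for a user agent; a brief sketch:
from urllib.robotparser import RobotFileParser
rp = RobotFileParser('https://www.jd.com/robots.txt')
rp.read()
print(rp.crawl_delay('*'))   # delay in seconds, or None if the file sets none
print(rp.request_rate('*'))  # RequestRate(requests=..., seconds=...) named tuple, or None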