# Practice 1: image scraping
# Site: https://www.mzitu.com/mm
import requests
from pyquery import PyQuery as pq
def get_page(page):
    """Fetch one listing page of the gallery site.

    Args:
        page: 1-based listing page number.

    Returns:
        The page's HTML text on HTTP 200, otherwise None (including on
        network failure, matching the error style of the other fetchers
        in this file).
    """
    url = 'https://www.mzitu.com/mm/page/' + str(page)
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except requests.ConnectionError:
        # A dropped connection is treated like a bad status: caller gets None
        # instead of the whole worker process crashing.
        return None
def parse_inpage(html):
    """Extract the direct image URL from a gallery detail page.

    Args:
        html: HTML text of one inner gallery page.

    Returns:
        The value of the main image's ``src`` attribute, or None when the
        selector matches nothing.
    """
    document = pq(html)
    image_node = document('.main .content .main-image img')
    return image_node.attr('src')
def get_inpage(item):
    """Walk the first 10 inner pages of a gallery and yield image records.

    Args:
        item: dict with keys 'image' (gallery URL) and 'title', as produced
            by page_parese.

    Yields:
        dicts with 'image' (direct img src), 'img_name' (short name sliced
        from the URL), and 'title'.
    """
    url = item.get('image')
    title = item.get('title')
    if not url:
        # page_parese can yield items whose <a> had no href; without this
        # guard, url + '/' below raises TypeError.
        return
    headers = {
        'Referer': 'https://www.mzitu.com',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
    }
    for i in range(1, 11):
        response = requests.get(url + '/' + str(i), headers=headers)
        if response.status_code == 200:
            in_url = parse_inpage(response.text)
            if not in_url:
                # The page loaded but had no main image node; skip it rather
                # than producing a record named after the string 'None'.
                continue
            # e.g. '.../xxx01.jpg' -> 'xxx01' (strip the '.jpg' suffix).
            img_name = str(in_url)[-9:-4]
            yield {
                'image': in_url,
                'img_name': img_name,
                'title': title
            }
def page_parese(html):
    """Yield one {'image', 'title'} dict per gallery on a listing page.

    Args:
        html: listing-page HTML from get_page, or None on fetch failure.

    Yields:
        dicts with 'image' (gallery href) and 'title' (img alt text).
    """
    if html is None:
        return
    doc = pq(html)
    if doc is None:
        return
    for entry in doc('.main .main-content .postlist ul li').items():
        yield {
            'image': entry.find('a').attr('href'),
            'title': entry.find('img').attr('alt'),
        }
import os
def save_images(item):
    """Download one image and store it under img/<title>/<img_name>.jpg.

    Args:
        item: dict with 'image' (direct image URL), 'img_name', 'title',
            as yielded by get_inpage.

    Side effects:
        Creates the target directory if needed; writes the image file
        unless it already exists; prints status messages.
    """
    # makedirs also creates the top-level 'img' folder on first run;
    # the original os.mkdir raised FileNotFoundError when 'img' was missing.
    directory = os.path.join('img', item.get('title'))
    os.makedirs(directory, exist_ok=True)
    try:
        headers = {
            'Referer': 'https://www.mzitu.com',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
        }
        response = requests.get(item.get('image'), headers=headers)
        if response.status_code == 200:
            file_path = 'img/{0}/{1}.{2}'.format(item.get('title'), item.get('img_name'), 'jpg')
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(response.content)
            else:
                print('Already Downloaded', file_path)
    except requests.ConnectionError as e:
        print(e.args)
        # Message typo fixed: 'Filed' -> 'Failed'.
        print('Failed to save images')
def main(page):
    """Scrape one listing page: list its galleries, download every image.

    Args:
        page: 1-based listing page number.
    """
    listing_html = get_page(page)
    for gallery in page_parese(listing_html):
        print(gallery)
        for record in get_inpage(gallery):
            save_images(record)
            print(record)
        print('=' * 50)
# Inclusive range of listing pages to scrape.
starting = 1
ending = 20

from multiprocessing.pool import Pool

if __name__ == '__main__':
    # Fan the page numbers out across a process pool, one main() call each.
    pool = Pool()
    pages = list(range(starting, ending + 1))
    pool.map(main, pages)
    pool.close()
    pool.join()
# Practice 2: fetching Weibo posts via the m.weibo.cn Ajax API
from urllib.parse import urlencode
import requests
# Base endpoint of the m.weibo.cn container API; get_page appends the
# url-encoded query string to it.
base_url='https://m.weibo.cn/api/container/getIndex?'
# Shared request headers. 'X-Requested-With' marks the request as Ajax so
# the server answers with JSON instead of the HTML page. The 'XXX'
# placeholders must be replaced with a real user id before running.
headers={
'Host':'m.weibo.cn',
'Referer': 'https://m.weibo.cn/u/XXX',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest'
}
def get_page(page):
    """Request one page of a user's Weibo timeline from the container API.

    Args:
        page: 1-based timeline page number.

    Returns:
        The decoded JSON response on HTTP 200; implicitly None on any other
        status or on connection failure (which is printed).
    """
    query = {
        'type': 'uid',
        'values': 'XXX',
        'containerid': '107603XXX',
        'page': page
    }
    full_url = base_url + urlencode(query)
    try:
        resp = requests.get(full_url, headers=headers)
        if resp.status_code == 200:
            return resp.json()
    except requests.ConnectionError as err:
        print('Error', err.args)
from pyquery import PyQuery as pq
def parse_page(json):
    """Yield simplified weibo dicts from one container-API response.

    Args:
        json: decoded response from get_page, or None on fetch failure.

    Yields:
        dicts with 'id', 'text' (HTML stripped via pyquery), 'attitudes',
        'comments', and 'reposts'.
    """
    if not json:
        return
    # Guard each level: 'data' or 'cards' can be absent on error responses,
    # and the original chained .get() calls raised AttributeError then.
    cards = (json.get('data') or {}).get('cards') or []
    for card in cards:
        blog = card.get('mblog')
        if not blog:
            # Some cards (ads, card groups) carry no mblog payload; skip
            # them instead of crashing on None.get(...).
            continue
        weibo = {}
        weibo['id'] = blog.get('id')
        # 'text' arrives as HTML; pq(...).text() strips the markup.
        weibo['text'] = pq(blog.get('text')).text()
        weibo['attitudes'] = blog.get('attitudes_count')
        weibo['comments'] = blog.get('comments_count')
        weibo['reposts'] = blog.get('reposts_count')
        yield weibo
from pymongo import MongoClient
# Connect to the default local MongoDB instance; both the database and the
# collection are named 'weibo'.
client = MongoClient()
db = client['weibo']
collection = db['weibo']
def save_to_mongo(result):
    """Persist one weibo record and print a confirmation on success.

    Args:
        result: dict as yielded by parse_page.
    """
    # Collection.insert() was deprecated in PyMongo 3 and removed in
    # PyMongo 4; insert_one is the supported replacement. Its
    # InsertOneResult return value is truthy, so the success print fires
    # exactly as before.
    if collection.insert_one(result):
        print('saved to mongodb')
if __name__ == '__main__':
    # Fetch the first 10 timeline pages and store every parsed post.
    for page in range(1, 11):
        payload = get_page(page)
        print(payload)
        for record in parse_page(payload):
            print(record)
            save_to_mongo(record)