爬取猫眼电影排行榜TOP100

#爬取猫眼电影TOP100的电影名称,时间,评分,图片等信息
#requests库
#1.抓取首页
#2.正则提取

import re,requests,json,time
def get_one_page(url):
    """Download one page and return its text, or None on failure.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as text when the server answers with HTTP 200.
        None for any other status code, or when the request itself fails
        (connection error, timeout, invalid URL, ...).
    """
    headers = {
        # Browser-like User-Agent sent along with the request.
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
    }

    try:
        # Without a timeout requests waits indefinitely on a stalled server.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Keep the "None means the page could not be fetched" contract for
        # network-level failures as well as for non-200 answers.
        return None

    if response.status_code == 200:
        return response.text
    return None

def parse_one_page(html):
    """Extract the movie entries from the markup of one board page.

    Args:
        html: Page markup as a string. None or an empty string (for
            example after a failed download) produces no entries instead
            of raising.

    Yields:
        One dict per matched ``<dd>`` element with the string values
        ``index``, ``image``, ``title``, ``actor``, ``time`` and ``score``.
    """
    if not html:
        return

    # re.S lets '.' cross line breaks, since one <dd> entry spans many lines.
    pattern = re.compile(
        '<dd>.*?board-index.*?>(.*?)</i>.*?data-src="(.*?)".*?name.*?a.*?>(.*?)</a>.*?star.*?>(.*?)</p>.*?releasetime">(.*?)</p>.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>'
        , re.S)

    for index, image, title, star, release, integer, fraction in pattern.findall(html):
        yield {
            'index': index,
            'image': image,
            'title': title,
            # Drop the first 3 characters of the stripped text (presumably
            # a label prefix in front of the actor names). Slicing a
            # shorter string already gives '', so no length check is needed.
            'actor': star.strip()[3:],
            # Same idea with a 5-character prefix before the release date.
            'time': release.strip()[5:],
            # The score is split across two tags: integer part + fraction.
            'score': integer.strip() + fraction.strip(),
        }

def write_to_file(content):
    """Append ``content`` to result.txt as one JSON document per line.

    Args:
        content: JSON-serializable object to store.
    """
    with open('result.txt', 'a', encoding='utf-8') as out:
        # ensure_ascii=False keeps non-ASCII text (e.g. Chinese) readable in
        # the file instead of turning it into Unicode escape sequences.
        serialized = json.dumps(content, ensure_ascii=False)
        out.write(serialized + '\n')

def main(offset):
    """Scrape one board page and append its entries to the result file.

    Args:
        offset: Value for the ``offset`` query parameter of the board URL,
            selecting which page is fetched.
    """
    url = "https://www.maoyan.com/board/4?offset=" + str(offset)
    html = get_one_page(url)
    if html is None:
        # get_one_page reports a failed download with None; feeding that to
        # the regex parser would raise a TypeError, so skip this page.
        print('offset %s: page could not be fetched, skipped' % offset)
        return
    for item in parse_one_page(html):
        write_to_file(item)
if __name__ == '__main__':
    # Visit the ten board pages (offsets 0, 10, ..., 90), pausing one second
    # after each request.
    for page_offset in range(0, 100, 10):
        main(page_offset)
        time.sleep(1)

输出:

{"index": "1", "image": "https://p1.meituan.net/movie/20803f59291c47e1e116c11963ce019e68711.jpg@160w_220h_1e_1c", "title": "霸王别姬", "actor": "张国荣,张丰毅,巩俐", "time": "1993-07-26", "score": "9.5"}
{"index": "2", "image": "https://p0.meituan.net/movie/283292171619cdfd5b240c8fd093f1eb255670.jpg@160w_220h_1e_1c", "title": "肖申克的救赎", "actor": "蒂姆·罗宾斯,摩根·弗里曼,鲍勃·冈顿", "time": "1994-09-10(加拿大)", "score": "9.5"}
{"index": "3", "image": "https://p0.meituan.net/movie/289f98ceaa8a0ae737d3dc01cd05ab052213631.jpg@160w_220h_1e_1c", "title": "罗马假日", "actor": "格利高里·派克,奥黛丽·赫本,埃迪·艾伯特", "time": "1953-09-02(美国)", "score": "9.1"}
{"index": "4", "image": "https://p1.meituan.net/movie/6bea9af4524dfbd0b668eaa7e187c3df767253.jpg@160w_220h_1e_1c", "title": "这个杀手不太冷", "actor": "让·雷诺,加里·奥德曼,娜塔莉·波特曼", "time": "1994-09-14(法国)", "score": "9.5"}
{"index": "5", "image": "https://p1.meituan.net/movie/b607fba7513e7f15eab170aac1e1400d878112.jpg@160w_220h_1e_1c", "title": "泰坦尼克号", "actor": "莱昂纳多·迪卡普里奥,凯特·温丝莱特,比利·赞恩", "time": "1998-04-03", "score": "9.5"}
{"index": "6", "image": "https://p0.meituan.net/movie/da64660f82b98cdc1b8a3804e69609e041108.jpg@160w_220h_1e_1c", "title": "唐伯虎点秋香", "actor": "周星驰,巩俐,郑佩佩", "time": "1993-07-01(中国香港)", "score": "9.1"}
{"index": "7", "image": "https://p0.meituan.net/movie/223c3e186db3ab4ea3bb14508c709400427933.jpg@160w_220h_1e_1c", "title": "乱世佳人", "actor": "费雯·丽,克拉克·盖博,奥利维娅·德哈维兰", "time": "1939-12-15(美国)", "score": "9.1"}
{"index": "8", "image": "https://p0.meituan.net/movie/46c29a8b8d8424bdda7715e6fd779c66235684.jpg@160w_220h_1e_1c", "title": "魂断蓝桥", "actor": "费雯·丽,罗伯特·泰勒,露塞尔·沃特森", "time": "1940-05-17(美国)", "score": "9.2"}
{"index": "9", "image": "https://p1.meituan.net/movie/ba1ed511668402605ed369350ab779d6319397.jpg@160w_220h_1e_1c", "title": "天空之城", "actor": "寺田农,鹫尾真知子,龟山助清", "time": "1992-05-01", "score": "9.0"}
{"index": "10", "image": "https://p0.meituan.net/movie/b0d986a8bf89278afbb19f6abaef70f31206570.jpg@160w_220h_1e_1c", "title": "辛德勒的名单", "actor": "连姆·尼森,拉尔夫·费因斯,本·金斯利", "time": "1993-12-15(美国)", "score": "9.2"}
.......

   转载规则


《爬取猫眼电影排行榜TOP100》 White Spider 采用 知识共享署名 4.0 国际许可协议 进行许可。
  目录