首先新建一个.py脚本文件
[root@VM_0_13_centos ~]# vim top250.py

#!/bin/python3
# -*- coding: utf-8 -*-
# author:ujslxw time:2020/10/24
import re,json
def getPage(url):
    """Download ``url`` and return the response body decoded as UTF-8 text.

    The request carries a desktop-Chrome ``User-Agent`` header and is
    given a 10 second timeout.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as a ``str``.

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8.
        Any exception raised by ``urlopen`` is propagated unchanged.
    """
    from urllib.request import urlopen, Request
    # Send a browser User-Agent instead of urllib's default one.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'}
    request = Request(url, headers=headers)
    # The context manager closes the response once the body has been read,
    # so repeated calls do not leak open connections.
    with urlopen(request, timeout=10) as response:
        return response.read().decode('utf-8')
def parsePage(s):
    """Extract the film entries from one page of HTML.

    The page is cut at every ``<div class="item">`` marker and each piece is
    matched on its own, so a match can never run from one entry into the
    next. The one-line quote (``<span class="inq">``) is optional: entries
    without it are still returned, with an empty string in that position.

    Args:
        s: HTML text of the page.

    Returns:
        A list of 5-tuples of strings
        ``(id, title, rating_num, comment_num, inq)``, in page order.
        ``comment_num`` is the text found between ``<span>`` and ``评价``.
        Pieces that do not match the expected structure are skipped.
    """
    # Raw strings keep ``\d`` from being read as an (invalid) string escape.
    entry_re = re.compile(
        r'<div class="pic">.*?<em .*?>(?P<id>\d+).*?<span class="title">(?P<title>.*?)</span>'
        r'.*?<span class="rating_num" .*?>(?P<rating_num>.*?)</span>.*?<span>(?P<comment_num>.*?)评价</span>',
        re.S)
    quote_re = re.compile(r'<span class="inq">(?P<inq>.*?)</span>', re.S)

    ret = []
    # Element 0 of the split is whatever precedes the first item marker.
    for chunk in s.split('<div class="item">')[1:]:
        entry = entry_re.search(chunk)
        if entry is None:
            continue
        # Look for the quote only after the matched fields of this entry.
        quote = quote_re.search(chunk, entry.end())
        inq = quote.group('inq') if quote else ''
        ret.append(entry.group('id', 'title', 'rating_num', 'comment_num') + (inq,))
    return ret
def main(num):
    """Fetch one Top 250 page and append its entries to ``Douban_top250.txt``.

    The page is downloaded with ``getPage`` and parsed with ``parsePage``.
    The full result list and then each entry are printed, and each entry is
    appended to the output file as one JSON-encoded line (non-ASCII
    characters are written as-is).

    Args:
        num: Value substituted into the ``start`` query parameter of the
            page URL.
    """
    url = 'https://movie.douban.com/top250?start=%s&filter=' % num
    response_html = getPage(url)
    ret = parsePage(response_html)
    print(ret)
    # The context manager flushes and closes the file even if writing fails,
    # instead of leaving the handle open after every call.
    with open("Douban_top250.txt", "a", encoding="utf8") as f:
        for obj in ret:
            print(obj)
            data = json.dumps(obj, ensure_ascii=False)
            f.write(data + '\n')
if __name__ == '__main__':
    # Ten pages, stepping the start offset by 25: 0, 25, ..., 225.
    for offset in range(0, 250, 25):
        main(offset)

为top250.py添加执行权限
[root@VM_0_13_centos ~]# chmod a+x top250.py

执行脚本
[root@VM_0_13_centos ~]# ./top250.py &>/dev/null
# &>/dev/null 将命令的标准输出和标准错误都重定向到/dev/null(不显示命令的执行过程和报错信息)

查看文件Douban_top250.txt
[root@VM_0_13_centos ~]# ls
Douban_top250.txt  top250.py
[root@VM_0_13_centos ~]# vim Douban_top250.txt

Douban_top250.txt

最后修改:2020 年 10 月 24 日 10 : 23 AM
如果觉得我的文章对你有用,请随意赞赏