美文网首页Python
我自己爬——Beautiful Soup/XPath/正则三种方式爬取豆瓣电影top250

我自己爬——Beautiful Soup/XPath/正则三种方式爬取豆瓣电影top250

作者: migugu | 来源:发表于2020-05-23 09:46 被阅读0次

    我自己爬——Beautiful Soup/XPath/正则三种方式爬取豆瓣电影top250

    准备将自己之前用Beautiful Soup乱写的豆瓣爬虫作为Python大作业交上去,结果发现要求用正则orz...

    于是便有了这篇——用三种不同的方式来爬豆瓣电影top250

    爬取url: https://movie.douban.com/top250

    观察页面结构不难发现这250条记录分布在10页上,每页25条,于是,可以找到url的规律:

    # The top 250 is split across 10 pages of 25 entries each; the `start`
    # query parameter is the zero-based offset of the first entry on a page.
    for offset in range(0, 250, 25):
        url = 'https://movie.douban.com/top250?start={}&filter='.format(offset)
    

    接下来还是老套路,先爬一爬看看html(豆瓣有反爬,带上请求头)

    def get_url(url):
        """Fetch *url* and return the decoded HTML body as text.

        Douban rejects requests without a browser User-Agent, so a
        desktop-Chrome UA header is always sent.
        """
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}
        r = requests.get(url, headers=headers)
        r.raise_for_status()  # fail fast on HTTP 4xx/5xx
        # Re-guess the charset from the payload so non-ASCII text decodes correctly.
        r.encoding = r.apparent_encoding
        return r.text
    

    分别用三种不同的方式爬:

    BeautifulSoup:

    def html_parser(url):
        """Parse one top250 page into a list of (rank, title, rating, link) tuples."""
        html = get_url(url)
        soup = BeautifulSoup(html, 'html.parser')
        items = soup.find_all(name='li')
        content = []
        # The first 18 <li> elements are page chrome (nav/menus); movie entries
        # start at index 18 — assumption tied to the page markup, TODO confirm.
        for item in items[18:]:
            index = item.find(name='em').get_text()  # rank number
            title = item.find(name='span', attrs='title').get_text()  # first span.title
            rating = item.find(name='span', attrs='rating_num').get_text()
            link = item.find(name='a')['href']  # detail-page URL
            tmp_tuple = (index, title, rating, link)
            content.append(tmp_tuple)
        return content
    

    XPath:

    def html_parser(url):
        """Extract (rank, title, rating, link) columns from one top250 page."""
        html = get_url(url)
        tree = etree.HTML(html)
        rank = tree.xpath('//em[@class]/text()') # rank
        name = tree.xpath('//div[@class="hd"]//a[@href]//span[1]/text()') # title
        rating_num = tree.xpath('//span[@class="rating_num"]/text()') # rating
        link = tree.xpath('//div[@class="hd"]//a[@href]/@href') # link
        # The four XPath results are position-aligned columns; zip pairs them
        # row by row into (rank, title, rating, link) tuples.
        return zip(rank, name, rating_num, link)
    

    正则:(我怎么写的这么复杂)

    def html_parser(url):
        """Regex-scrape (rank, title, rating, link) rows from one top250 page."""
        html = get_url(url)
        Patt_index = re.compile('<em class="">(.*?)</em>')  # rank
        Patt_name = re.compile('<span class="title">(.*?)</span>')  # movie title
        Patt_rating = re.compile(
            '<span class="rating_num" property="v:average">(.*?)</span>')  # rating
        Patt_link = re.compile('<a href="(.*?)/">')  # link (trailing '/' stripped)
        rank = re.findall(Patt_index, html)
        name = re.findall(Patt_name, html)
        rating_num = re.findall(Patt_rating, html)
        link = re.findall(Patt_link, html)
        pro_name = []
        # Each movie also has an alternate-title span whose text contains
        # "&nbsp;"; dropping entries containing '&' keeps only the primary
        # title per movie so the columns stay aligned.
        for i in name:
            if '&' not in i:
                pro_name.append(i)
        return zip(rank, pro_name, rating_num, link)
    

    写入文件:

    def write_movies_file(items, tplt, filename='movies.txt'):
        """Append one movie record to *filename* using format template *tplt*.

        Bug fix: the original body referenced a global ``filename`` that is
        never defined anywhere in this snippet (NameError at runtime); it is
        now a keyword parameter with a default, which keeps the call signature
        backward-compatible.

        :param items: indexable record — (rank, title, rating, link)
        :param tplt: ``str.format`` template applied to the four fields
        :param filename: output path, opened in append mode (UTF-8)
        """
        with open(filename, 'a', encoding='utf-8') as f:
            # chr(12288) (full-width space) is passed as a spare argument —
            # presumably intended as a CJK-friendly fill char; the templates
            # used in this article never reference {4}, so it is inert.
            f.write(tplt.format(items[0], items[1],
                                items[2], items[3], chr(12288)))
            f.write('\n')

    全部代码:

    BeautifulSoup:

    import requests
    from bs4 import BeautifulSoup
    
    
    def get_url(url):
        """Download *url* and return its HTML as text.

        A desktop-Chrome User-Agent is sent because Douban blocks
        header-less clients.
        """
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}
        r = requests.get(url, headers=headers)
        r.raise_for_status()  # raise on HTTP error status codes
        # Use the content-sniffed charset so Chinese text decodes properly.
        r.encoding = r.apparent_encoding
        return r.text
    
    
    def html_parser(url):
        """Return a list of (rank, title, rating, link) tuples for one page."""
        soup = BeautifulSoup(get_url(url), 'html.parser')
        records = []
        # Skip the first 18 <li> elements: they belong to the surrounding page
        # chrome, not to the movie list itself.
        for entry in soup.find_all(name='li')[18:]:
            rank = entry.find(name='em').get_text()
            title = entry.find(name='span', attrs='title').get_text()
            score = entry.find(name='span', attrs='rating_num').get_text()
            detail_url = entry.find(name='a')['href']
            records.append((rank, title, score, detail_url))
        return records
    
    
    def write_movies_file(items, tplt):
        """Append one formatted movie record to movies_bs4.txt (UTF-8)."""
        # chr(12288) is a full-width space handed to str.format as an extra
        # argument; the templates used here do not reference it.
        row = tplt.format(items[0], items[1], items[2], items[3], chr(12288))
        with open('movies_bs4.txt', 'a', encoding='utf-8') as f:
            f.write(row + '\n')
    
    
    def main():
        """Crawl all 10 result pages, persisting and echoing every record."""
        tplt = "{0:^10}\t{1:^30}\t{2:^30}\t{3:30}"
        base = 'https://movie.douban.com/top250?start={}&filter='
        # 10 pages of 25 entries each; `start` is the page offset.
        for offset in range(0, 250, 25):
            page_url = base.format(offset)
            for record in html_parser(page_url):
                write_movies_file(record, tplt)
                print(tplt.format(record[0], record[1], record[2], record[3], chr(12288)))
    

    XPath:

    import requests
    from lxml import etree
    
    
    def get_url(url):
        """Fetch *url* with a browser User-Agent and return the HTML text.

        The UA header works around Douban's anti-scraping check.
        """
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}
        r = requests.get(url, headers=headers)
        r.raise_for_status()  # abort on HTTP 4xx/5xx
        # Decode with the charset guessed from the body content.
        r.encoding = r.apparent_encoding
        return r.text
    
    
    def html_parser(url):
        """Collect (rank, title, rating, link) rows from one top250 page."""
        tree = etree.HTML(get_url(url))
        # Each XPath query yields one position-aligned column of 25 values.
        ranks = tree.xpath('//em[@class]/text()')
        titles = tree.xpath('//div[@class="hd"]//a[@href]//span[1]/text()')
        ratings = tree.xpath('//span[@class="rating_num"]/text()')
        links = tree.xpath('//div[@class="hd"]//a[@href]/@href')
        # zip re-assembles the columns into per-movie tuples.
        return zip(ranks, titles, ratings, links)
    
    
    def write_movies_file(items, tplt):
        """Append one formatted movie record to movies_xpath.txt (UTF-8)."""
        # The trailing chr(12288) (full-width space) is an extra format
        # argument that the templates used here never reference.
        row = tplt.format(items[0], items[1], items[2], items[3], chr(12288))
        with open('movies_xpath.txt', 'a', encoding='utf-8') as f:
            f.write(row + '\n')
    
    
    def main():
        """Crawl all 10 pages (25 movies each), writing and echoing each row."""
        tplt = "{0:^10}\t{1:^30}\t{2:^30}\t{3:30}"
        for offset in range(0, 250, 25):
            url = 'https://movie.douban.com/top250?start=' + \
                str(offset) + '&filter='
            # html_parser returns a zip iterator; materialize it so it can be
            # consumed safely below.
            items = list(html_parser(url))
            for item in items:
                write_movies_file(item, tplt)
                # chr(12288) (full-width space) is passed but not referenced
                # by tplt — presumably intended as a fill char for alignment.
                print(tplt.format(item[0], item[1], item[2], item[3], chr(12288)))
    

    正则:

    import requests
    import re
    
    
    def get_url(url):
        """Download *url* and return the decoded HTML text.

        Sends a desktop-Chrome User-Agent because Douban blocks bare clients.
        """
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}
        r = requests.get(url, headers=headers)
        r.raise_for_status()  # surface HTTP errors immediately
        # Let requests sniff the real charset from the payload.
        r.encoding = r.apparent_encoding
        return r.text
    
    
    def html_parser(url):
        """Regex-scrape (rank, title, rating, link) rows from one top250 page."""
        html = get_url(url)
        rank_pat = re.compile('<em class="">(.*?)</em>')
        title_pat = re.compile('<span class="title">(.*?)</span>')
        rating_pat = re.compile(
            '<span class="rating_num" property="v:average">(.*?)</span>')
        link_pat = re.compile('<a href="(.*?)/">')
        ranks = rank_pat.findall(html)
        titles = title_pat.findall(html)
        ratings = rating_pat.findall(html)
        links = link_pat.findall(html)
        # The alternate-title spans contain "&nbsp;"; keeping only titles
        # without '&' leaves exactly one primary title per movie, so the
        # four columns stay aligned.
        primary_titles = [t for t in titles if '&' not in t]
        return zip(ranks, primary_titles, ratings, links)
    
    
    def write_movies_file(items, tplt):
        with open('movies_re.txt', 'a', encoding='utf-8') as f:
            # f.write(items[0] + '\t' + items[1] + '\t' + items[2] + '\t' + items[3])
            f.write(tplt.format(items[0], items[1],
                                items[2], items[3], chr(12288)))
            f.write('\n')
    
    
    def main():
        """Crawl all 10 pages (25 movies each), writing and echoing each row."""
        tplt = "{0:^10}\t{1:^30}\t{2:^30}\t{3:30}"
        for offset in range(0, 250, 25):
            url = 'https://movie.douban.com/top250?start=' + \
                str(offset) + '&filter='
            # html_parser returns a zip iterator; materialize before looping.
            items = list(html_parser(url))
            for item in items:
                write_movies_file(item, tplt)
                # chr(12288) (full-width space) is passed but not referenced
                # by tplt — presumably intended as a fill char for alignment.
                print(tplt.format(item[0], item[1], item[2], item[3], chr(12288)))
    

    相关文章

      网友评论

        本文标题:我自己爬——Beautiful Soup/XPath/正则三种方式爬取豆瓣电影top250

        本文链接:https://www.haomeiwen.com/subject/xjkkahtx.html