美文网首页Python
我自己爬——Beautiful Soup/XPath/正则三种方式爬取豆瓣电影top250

我自己爬——Beautiful Soup/XPath/正则三种方式爬取豆瓣电影top250

作者: migugu | 来源:发表于2020-05-23 09:46 被阅读0次

    我自己爬——Beautiful Soup/XPath/正则三种方式爬取豆瓣电影top250

    准备将自己之前用Beautiful Soup乱写的豆瓣爬虫作为Python大作业交上去,结果发现要求用正则orz...

    于是便有了这篇——用三种不同的方式来爬豆瓣电影top250

    爬取url: https://movie.douban.com/top250

    观察页面结构不难发现这250条记录分布在10页上,每页25条,于是,可以找到url的规律:

    # The top 250 is split across 10 pages of 25 entries each; the `start`
    # query parameter is the zero-based offset of the first entry on a page.
    for offset in range(0, 250, 25):
        url = 'https://movie.douban.com/top250?start={}&filter='.format(offset)
    

    接下来还是老套路,先爬一爬看看html(豆瓣有反爬,带上请求头)

    def get_url(url):
        """Fetch *url* and return the decoded HTML body as text.

        Douban rejects requests without a browser User-Agent, so a
        desktop-Chrome UA header is always sent.
        """
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}
        r = requests.get(url, headers=headers)
        r.raise_for_status()  # fail fast on HTTP 4xx/5xx
        # Re-guess the charset from the payload so non-ASCII text decodes correctly.
        r.encoding = r.apparent_encoding
        return r.text
    

    分别用三种不同的方式爬:

    BeautifulSoup:

    def html_parser(url):
        """Parse one top250 page into a list of (rank, title, rating, link) tuples."""
        html = get_url(url)
        soup = BeautifulSoup(html, 'html.parser')
        items = soup.find_all(name='li')
        content = []
        # The first 18 <li> elements are page chrome (nav/menus); movie entries
        # start at index 18 — assumption tied to the page markup, TODO confirm.
        for item in items[18:]:
            index = item.find(name='em').get_text()  # rank number
            title = item.find(name='span', attrs='title').get_text()  # first span.title
            rating = item.find(name='span', attrs='rating_num').get_text()
            link = item.find(name='a')['href']  # detail-page URL
            tmp_tuple = (index, title, rating, link)
            content.append(tmp_tuple)
        return content
    

    XPath:

    def html_parser(url):
        """Extract (rank, title, rating, link) columns from one top250 page."""
        html = get_url(url)
        tree = etree.HTML(html)
        rank = tree.xpath('//em[@class]/text()') # rank
        name = tree.xpath('//div[@class="hd"]//a[@href]//span[1]/text()') # title
        rating_num = tree.xpath('//span[@class="rating_num"]/text()') # rating
        link = tree.xpath('//div[@class="hd"]//a[@href]/@href') # link
        # The four XPath results are position-aligned columns; zip pairs them
        # row by row into (rank, title, rating, link) tuples.
        return zip(rank, name, rating_num, link)
    

    正则:(我怎么写的这么复杂)

    def html_parser(url):
        """Regex-scrape (rank, title, rating, link) rows from one top250 page."""
        html = get_url(url)
        Patt_index = re.compile('<em class="">(.*?)</em>')  # rank
        Patt_name = re.compile('<span class="title">(.*?)</span>')  # movie title
        Patt_rating = re.compile(
            '<span class="rating_num" property="v:average">(.*?)</span>')  # rating
        Patt_link = re.compile('<a href="(.*?)/">')  # link (trailing '/' stripped)
        rank = re.findall(Patt_index, html)
        name = re.findall(Patt_name, html)
        rating_num = re.findall(Patt_rating, html)
        link = re.findall(Patt_link, html)
        pro_name = []
        # Each movie also has an alternate-title span whose text contains
        # "&nbsp;"; dropping entries containing '&' keeps only the primary
        # title per movie so the columns stay aligned.
        for i in name:
            if '&' not in i:
                pro_name.append(i)
        return zip(rank, pro_name, rating_num, link)
    

    写入文件:

    def write_movies_file(items, tplt, filename='movies.txt'):
        """Append one movie record to *filename* using format template *tplt*.

        Bug fix: the original body referenced a global ``filename`` that is
        never defined anywhere in this snippet (NameError at runtime); it is
        now a keyword parameter with a default, which keeps the call signature
        backward-compatible.

        :param items: indexable record — (rank, title, rating, link)
        :param tplt: ``str.format`` template applied to the four fields
        :param filename: output path, opened in append mode (UTF-8)
        """
        with open(filename, 'a', encoding='utf-8') as f:
            # chr(12288) (full-width space) is passed as a spare argument —
            # presumably intended as a CJK-friendly fill char; the templates
            # used in this article never reference {4}, so it is inert.
            f.write(tplt.format(items[0], items[1],
                                items[2], items[3], chr(12288)))
            f.write('\n')

    全部代码:

    BeautifulSoup:

    import requests
    from bs4 import BeautifulSoup
    
    
    def get_url(url):
        """Download *url* and return its HTML as text.

        A desktop-Chrome User-Agent is sent because Douban blocks
        header-less clients.
        """
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}
        r = requests.get(url, headers=headers)
        r.raise_for_status()  # raise on HTTP error status codes
        # Use the content-sniffed charset so Chinese text decodes properly.
        r.encoding = r.apparent_encoding
        return r.text
    
    
    def html_parser(url):
        """Return a list of (rank, title, rating, link) tuples for one page."""
        soup = BeautifulSoup(get_url(url), 'html.parser')
        records = []
        # Skip the first 18 <li> elements: they belong to the surrounding page
        # chrome, not to the movie list itself.
        for entry in soup.find_all(name='li')[18:]:
            rank = entry.find(name='em').get_text()
            title = entry.find(name='span', attrs='title').get_text()
            score = entry.find(name='span', attrs='rating_num').get_text()
            detail_url = entry.find(name='a')['href']
            records.append((rank, title, score, detail_url))
        return records
    
    
    def write_movies_file(items, tplt):
        """Append one formatted movie record to movies_bs4.txt (UTF-8)."""
        # chr(12288) is a full-width space handed to str.format as an extra
        # argument; the templates used here do not reference it.
        row = tplt.format(items[0], items[1], items[2], items[3], chr(12288))
        with open('movies_bs4.txt', 'a', encoding='utf-8') as f:
            f.write(row + '\n')
    
    
    def main():
        """Crawl all 10 result pages, persisting and echoing every record."""
        tplt = "{0:^10}\t{1:^30}\t{2:^30}\t{3:30}"
        base = 'https://movie.douban.com/top250?start={}&filter='
        # 10 pages of 25 entries each; `start` is the page offset.
        for offset in range(0, 250, 25):
            page_url = base.format(offset)
            for record in html_parser(page_url):
                write_movies_file(record, tplt)
                print(tplt.format(record[0], record[1], record[2], record[3], chr(12288)))
    

    XPath:

    import requests
    from lxml import etree
    
    
    def get_url(url):
        """Fetch *url* with a browser User-Agent and return the HTML text.

        The UA header works around Douban's anti-scraping check.
        """
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}
        r = requests.get(url, headers=headers)
        r.raise_for_status()  # abort on HTTP 4xx/5xx
        # Decode with the charset guessed from the body content.
        r.encoding = r.apparent_encoding
        return r.text
    
    
    def html_parser(url):
        """Collect (rank, title, rating, link) rows from one top250 page."""
        tree = etree.HTML(get_url(url))
        # Each XPath query yields one position-aligned column of 25 values.
        ranks = tree.xpath('//em[@class]/text()')
        titles = tree.xpath('//div[@class="hd"]//a[@href]//span[1]/text()')
        ratings = tree.xpath('//span[@class="rating_num"]/text()')
        links = tree.xpath('//div[@class="hd"]//a[@href]/@href')
        # zip re-assembles the columns into per-movie tuples.
        return zip(ranks, titles, ratings, links)
    
    
    def write_movies_file(items, tplt):
        """Append one formatted movie record to movies_xpath.txt (UTF-8)."""
        # The trailing chr(12288) (full-width space) is an extra format
        # argument that the templates used here never reference.
        row = tplt.format(items[0], items[1], items[2], items[3], chr(12288))
        with open('movies_xpath.txt', 'a', encoding='utf-8') as f:
            f.write(row + '\n')
    
    
    def main():
        """Crawl all 10 pages (25 movies each), writing and echoing each row."""
        tplt = "{0:^10}\t{1:^30}\t{2:^30}\t{3:30}"
        for offset in range(0, 250, 25):
            url = 'https://movie.douban.com/top250?start=' + \
                str(offset) + '&filter='
            # html_parser returns a zip iterator; materialize it so it can be
            # consumed safely below.
            items = list(html_parser(url))
            for item in items:
                write_movies_file(item, tplt)
                # chr(12288) (full-width space) is passed but not referenced
                # by tplt — presumably intended as a fill char for alignment.
                print(tplt.format(item[0], item[1], item[2], item[3], chr(12288)))
    

    正则:

    import requests
    import re
    
    
    def get_url(url):
        """Download *url* and return the decoded HTML text.

        Sends a desktop-Chrome User-Agent because Douban blocks bare clients.
        """
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}
        r = requests.get(url, headers=headers)
        r.raise_for_status()  # surface HTTP errors immediately
        # Let requests sniff the real charset from the payload.
        r.encoding = r.apparent_encoding
        return r.text
    
    
    def html_parser(url):
        """Regex-scrape (rank, title, rating, link) rows from one top250 page."""
        html = get_url(url)
        rank_pat = re.compile('<em class="">(.*?)</em>')
        title_pat = re.compile('<span class="title">(.*?)</span>')
        rating_pat = re.compile(
            '<span class="rating_num" property="v:average">(.*?)</span>')
        link_pat = re.compile('<a href="(.*?)/">')
        ranks = rank_pat.findall(html)
        titles = title_pat.findall(html)
        ratings = rating_pat.findall(html)
        links = link_pat.findall(html)
        # The alternate-title spans contain "&nbsp;"; keeping only titles
        # without '&' leaves exactly one primary title per movie, so the
        # four columns stay aligned.
        primary_titles = [t for t in titles if '&' not in t]
        return zip(ranks, primary_titles, ratings, links)
    
    
    def write_movies_file(items, tplt):
        with open('movies_re.txt', 'a', encoding='utf-8') as f:
            # f.write(items[0] + '\t' + items[1] + '\t' + items[2] + '\t' + items[3])
            f.write(tplt.format(items[0], items[1],
                                items[2], items[3], chr(12288)))
            f.write('\n')
    
    
    def main():
        """Crawl all 10 pages (25 movies each), writing and echoing each row."""
        tplt = "{0:^10}\t{1:^30}\t{2:^30}\t{3:30}"
        for offset in range(0, 250, 25):
            url = 'https://movie.douban.com/top250?start=' + \
                str(offset) + '&filter='
            # html_parser returns a zip iterator; materialize before looping.
            items = list(html_parser(url))
            for item in items:
                write_movies_file(item, tplt)
                # chr(12288) (full-width space) is passed but not referenced
                # by tplt — presumably intended as a fill char for alignment.
                print(tplt.format(item[0], item[1], item[2], item[3], chr(12288)))
    

    相关文章

      网友评论

        本文标题:我自己爬——Beautiful Soup/XPath/正则三种方式爬取豆瓣电影top250

        本文链接:https://www.haomeiwen.com/subject/xjkkahtx.html