
Python Crawler: Maoyan Movies TOP100

Author: Assassin007 | Published 2019-04-13 10:28

    A crawler exercise: scraping Maoyan's TOP100 movie board in several different ways.

    The TOP100 board page has a simple structure: all the movie entries are stored under a dl tag, and each dd tag holds one movie's information, as sketched below.


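    Here is a simplified sketch of one entry's markup, reconstructed from the selectors the code below relies on (the class names match what the code targets; attribute values and surrounding markup are abbreviated):

    <dl class="board-wrapper">
        <dd>
            <i class="board-index board-index-1">1</i>
            <p class="name"><a href="...">电影名</a></p>
            <p class="star">主演:演员A,演员B,演员C</p>
            <p class="releasetime">上映时间:1993-01-01</p>
            <p class="score"><i class="integer">9.</i><i class="fraction">5</i></p>
        </dd>
        ...
    </dl>
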
    • requests + regular-expression extraction

    import requests
    import re
    
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/67.0.3396.99 Safari/537.36'}
    
    
    def get_one_page(url):
        try:
            r = requests.get(url, headers=headers)
            r.raise_for_status()
            return r.text
        except requests.RequestException as e:
            print(e)
            return ''
    
    
    def parse_one_page(html):
        # One lazy group per field: ranking, title, cast, release date, and the
        # two halves of the score (integer part and fraction digit)
        pattern = '<dd>.*?index.*?>(.*?)</i>.*?name.*?a.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)' \
                  '</p>.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>'
        # re.S lets .*? match across newlines; each <dd> block spans several lines
        items = re.findall(pattern, html, re.S)
        for item in items:
            index = item[0]
            name = item[1]
            star = item[2].strip()[3:]   # drop the '主演:' prefix (3 characters)
            time = item[3].strip()[5:]   # drop the '上映时间:' prefix (5 characters)
            score = item[4] + item[5]    # e.g. '9.' + '5' -> '9.5'
            print(index, name, star, time, score)
    
    
    def main():
        # The board paginates via ?offset=0,10,...,90 (ten movies per page)
        for i in range(10):
            url = "http://maoyan.com/board/4?offset=" + str(i * 10)
            html = get_one_page(url)
            parse_one_page(html)
    
    
    if __name__ == '__main__':
        main()
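
    To see what the six capture groups return, here is a minimal self-contained demo against a hand-written <dd> fragment (the sample values are illustrative, not scraped):

    import re

    sample = ('<dd><i class="board-index board-index-1">1</i>'
              '<p class="name"><a href="/films/1203">霸王别姬</a></p>'
              '<p class="star">主演:张国荣,张丰毅,巩俐</p>'
              '<p class="releasetime">上映时间:1993-01-01</p>'
              '<p class="score"><i class="integer">9.</i>'
              '<i class="fraction">5</i></p></dd>')

    pattern = '<dd>.*?index.*?>(.*?)</i>.*?name.*?a.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)' \
              '</p>.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>'
    print(re.findall(pattern, sample, re.S))
    # [('1', '霸王别姬', '主演:张国荣,张丰毅,巩俐', '上映时间:1993-01-01', '9.', '5')]

    On the live page the cast and date groups also carry surrounding whitespace, which is why parse_one_page calls .strip() before slicing off the prefixes.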
    
    
    • requests + BeautifulSoup extraction, saving the results to an Excel spreadsheet

    import requests
    from bs4 import BeautifulSoup
    from openpyxl import Workbook
    from openpyxl.styles import Alignment
    
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/67.0.3396.99 Safari/537.36'}
    
    
    def get_one_page(url):
        try:
            r = requests.get(url, headers=headers)
            r.raise_for_status()
            return r.text
        except requests.RequestException as e:
            print(e)
            return ''
    
    
    def parse_one_page(html, sheet):
        soup = BeautifulSoup(html, 'lxml')
    
        # Each movie's info sits in a <dd> tag under <dl class='board-wrapper'>
        items = soup.find(class_='board-wrapper').find_all(name='dd')
    
        for item in items:
            # For each <dd> tag, append the extracted fields as a new row
            index = item.find(name='i').string
            name = item.find(class_='name').string
            star = item.find(class_='star').string.replace('主演:', '').strip()
            time = item.find(class_='releasetime').string.replace('上映时间:', '')
            score = item.find(class_='integer').string + item.find(class_='fraction').string
            sheet.append([index, name, star, time, score])
    
    
    def main():
        # Create a new workbook
        wb = Workbook()
        # Use the default (active) worksheet
        ws = wb.active
    
        # Set the column widths
        ws.column_dimensions['A'].width = 8
        ws.column_dimensions['B'].width = 20
        ws.column_dimensions['C'].width = 50
        ws.column_dimensions['D'].width = 26
        ws.column_dimensions['E'].width = 8
    
        # Write the header row; rows and columns are 1-indexed in openpyxl
        ws.cell(1, 1).value = '排名'
        ws.cell(1, 2).value = '电影名'
        ws.cell(1, 3).value = '主演'
        ws.cell(1, 4).value = '上映时间'
        ws.cell(1, 5).value = '评分'
    
        for i in range(10):
            url = "http://maoyan.com/board/4?offset=" + str(i * 10)
            html = get_one_page(url)
            parse_one_page(html, ws)
    
        # Center-align every cell
        ag = Alignment(horizontal='center', vertical='center')
        for row in ws.iter_rows():
            for cell in row:
                cell.alignment = ag
    
        wb.save('./info.xlsx')
    
    
    if __name__ == '__main__':
        main()
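
    To sanity-check the spreadsheet, it can be read back with openpyxl (a small sketch, assuming the script above has already produced ./info.xlsx):

    from openpyxl import load_workbook

    wb = load_workbook('./info.xlsx')
    ws = wb.active
    # Print the header row plus the first three movies
    for row in ws.iter_rows(min_row=1, max_row=4, values_only=True):
        print(row)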
    
    
    • Automated scraping with selenium, saving to a CSV file

    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException, WebDriverException
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.common.by import By
    import csv
    
    
    def get_one_page(driver, url, writer):
        try:
            driver.get(url)
            # Each movie's info sits in a <dd> tag under <dl class="board-wrapper">
            data = driver.find_element(By.CLASS_NAME, "board-wrapper").find_elements(By.TAG_NAME, "dd")

            for item in data:
                # For each <dd> tag, append the extracted fields to the CSV file
                index = item.find_element(By.TAG_NAME, "i").text
                name = item.find_element(By.CLASS_NAME, "name").text
                star = item.find_element(By.CLASS_NAME, "star").text.replace("主演:", "")
                time = item.find_element(By.CLASS_NAME, "releasetime").text.replace("上映时间:", "")
                score = (item.find_element(By.CLASS_NAME, "integer").text
                         + item.find_element(By.CLASS_NAME, "fraction").text)
                writer.writerow([index, name, star, time, score])

        # `except A or B` catches only A, so use a tuple to catch both types
        except (TimeoutException, WebDriverException) as e:
            print(e)
    
    
    def main():
        # Run Chrome in headless mode
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        driver = webdriver.Chrome(options=chrome_options)
        # newline='' stops the csv module from inserting blank lines on Windows
        with open('info.csv', 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            for i in range(10):
                url = "http://maoyan.com/board/4?offset=" + str(i * 10)
                get_one_page(driver, url, writer)
            driver.quit()
    
    
    if __name__ == '__main__':
        main()
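
    One caveat: driver.get() only waits for the initial page load, so if the board were rendered asynchronously, find_element could fire before the list exists. An explicit wait is the usual guard; a sketch (the helper name wait_for_board is my own):

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    def wait_for_board(driver, timeout=10):
        # Block for up to `timeout` seconds until <dl class="board-wrapper">
        # appears in the DOM, then return that element
        return WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CLASS_NAME, "board-wrapper"))
        )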
    
    
    
