美文网首页我爱编程
Python学习:爬个电影资源网站

Python学习:爬个电影资源网站

作者: youmu178 | 来源:发表于2018-05-04 09:56 被阅读55次
    image
    我们抓的网站地址是 http://xwxmovie.cn/

    用了 selenium 和 BeautifulSoup

    首先还是最基本的初始化代码

    # Root URL of the movie listing site being scraped.
    baseURL = "http://xwxmovie.cn/"
    # Request headers for plain-HTTP fetches.
    # NOTE(review): headers is defined but never passed to any request below.
    headers = {
        'Host': 'xwxmovie.cn',
        # Fix: the original literal ended with a stray '"' inside the
        # User-Agent value, producing a malformed UA string.
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/64.0.3282.167 Safari/537.36'
    }
    
    def browser_get():
        """Open the site root in Chrome via Selenium and scrape the first page.

        NOTE(review): page_count is computed but never used -- the loop over
        all pages described later in the article is not implemented here; only
        the first page is passed to get_page_data.
        """
        browser = webdriver.Chrome()
        try:
            browser.get(baseURL)
            # Page source after the browser has rendered the page.
            html_text = browser.page_source
            page_count = get_page_count(html_text)
            get_page_data(html_text)
        finally:
            # Fix: the driver was never released, leaking a Chrome process
            # on every run (and on any parsing exception).
            browser.quit()
    
    

    一开始想用BeautifulSoup抓取片段的,由于刚学,很多API还不会用,最后用正则先匹配自己想要的区域,然后用BeautifulSoup匹配电影名等信息;

    # NOTE(review): this fragment assumes a surrounding `html` string holding
    # the page source. Each regex match captures a 3-tuple of HTML fragments:
    # (pinbin-image, pinbin-category, pinbin-copy) sections of one post <div>.
    items = re.findall(re.compile('<div id="post-.*?class="post-.*?style="position:.*?>'
                                      '.*?<div class="pinbin-image">(.*?)</div>'
                                      '.*?<div class="pinbin-category">(.*?)</div>'
                                      '.*?<div class="pinbin-copy">(.*?)</div>'
                                      '.*?</div>', re.S), html)
    

    这时候我们就要循环挨个寻找自己想要的了;

        # Each item is a 3-tuple of HTML fragments captured by the regex above:
        # (pinbin-image, pinbin-category, pinbin-copy).
        for item in items:
            if item[0].strip():
                soup = BeautifulSoup(item[0].strip(), 'html.parser')
                img = soup.find('img', attrs={'class': 'attachment-detail-image wp-post-image'})
                # Poster image URL
                # NOTE(review): img may be None if the <img> class differs -- verify.
                print("海报:" + img.get('src'))
            if item[1].strip():
                soup = BeautifulSoup(item[1].strip(), 'html.parser')
                # Every <a> in the category section is one category label.
                categorys = soup.find_all('a')
                for category in categorys:
                    print(category.get_text())
            if item[2].strip():
                soup = BeautifulSoup(item[2].strip(), 'html.parser')
                # Title link carries both the movie name and the detail-page URL.
                title = soup.find('a', attrs={'class': 'front-link'})
                print("电影名:" + title.get_text())
                print("链接地址:" + title.get('href'))
                date = soup.find('p', attrs={'class': 'pinbin-date'})
                print("日期:" + date.get_text())
                brief = soup.find_all('p')
                # NOTE(review): brief[1] raises IndexError when fewer than two
                # <p> tags are present, and .string can be None for nested markup.
                print("简介:" + brief[1].string)
    

    以上就是得到一页的数据;


    image

    如果想得到全部数据,就需要先得到总页数,然后循环获取;

    # Get the total page count
    def get_page_count(html):
        """Return the total number of pages parsed from the pagination widget.

        Args:
            html: full HTML text of a listing page.

        Returns:
            int: the last number in the <span class="pages"> text
            (presumably formatted like "1 / 27 页" -- TODO confirm).

        Raises:
            ValueError: if the pagination span is missing or holds no digits.
        """
        soup = BeautifulSoup(html, 'html.parser')
        page_span = soup.find('span', attrs={'class': 'pages'})
        if page_span is None:
            raise ValueError('pagination span <span class="pages"> not found')
        # Fix: the original fixed slice [-4:-2] silently breaks for page
        # counts that are not exactly two digits; take the last run of
        # digits in the text instead.
        numbers = re.findall(r'\d+', page_span.get_text())
        if not numbers:
            raise ValueError('no page number found in pagination text')
        return int(numbers[-1])
    

    最终代码如下:

    # -*- coding: UTF-8 -*-
    
    from selenium import webdriver
    from bs4 import BeautifulSoup
    import re
    
    # Root URL of the movie listing site being scraped.
    baseURL = "http://xwxmovie.cn/"
    # Request headers for plain-HTTP fetches.
    # NOTE(review): headers is defined but never used by any call in this file.
    headers = {
        'Host': 'xwxmovie.cn',
        # Fix: the original literal ended with a stray '"' inside the
        # User-Agent value, producing a malformed UA string.
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/64.0.3282.167 Safari/537.36'
    }
    
    
    def browser_get():
        """Open the site root in Chrome via Selenium and scrape the first page.

        NOTE(review): page_count is computed but never used -- iterating over
        all pages (as the article intends) is not implemented; only the first
        page is passed to get_page_data.
        """
        browser = webdriver.Chrome()
        try:
            browser.get(baseURL)
            # Page source after the browser has rendered the page.
            html_text = browser.page_source
            page_count = get_page_count(html_text)
            get_page_data(html_text)
        finally:
            # Fix: the driver was never released, leaking a Chrome process
            # on every run (and on any parsing exception).
            browser.quit()
    
    
    # Get the total page count
    def get_page_count(html):
        """Return the total number of pages parsed from the pagination widget.

        Args:
            html: full HTML text of a listing page.

        Returns:
            int: the last number in the <span class="pages"> text
            (presumably formatted like "1 / 27 页" -- TODO confirm).

        Raises:
            ValueError: if the pagination span is missing or holds no digits.
        """
        soup = BeautifulSoup(html, 'html.parser')
        page_span = soup.find('span', attrs={'class': 'pages'})
        if page_span is None:
            raise ValueError('pagination span <span class="pages"> not found')
        # Fix: the original fixed slice [-4:-2] silently breaks for page
        # counts that are not exactly two digits; take the last run of
        # digits in the text instead.
        numbers = re.findall(r'\d+', page_span.get_text())
        if not numbers:
            raise ValueError('no page number found in pagination text')
        return int(numbers[-1])
    
    
    def get_page_data(html):
        """Print poster URL, categories, title, link, date and brief for every
        movie entry found in a listing page's HTML.

        Args:
            html: full HTML text of a listing page.
        """
        # Carve each post into its three sections with a regex first, then let
        # BeautifulSoup parse each small fragment.
        items = re.findall(re.compile('<div id="post-.*?class="post-.*?style="position:.*?>'
                                      '.*?<div class="pinbin-image">(.*?)</div>'
                                      '.*?<div class="pinbin-category">(.*?)</div>'
                                      '.*?<div class="pinbin-copy">(.*?)</div>'
                                      '.*?</div>', re.S), html)
        for item in items:
            if item[0].strip():
                soup = BeautifulSoup(item[0].strip(), 'html.parser')
                img = soup.find('img', attrs={'class': 'attachment-detail-image wp-post-image'})
                # Poster image; fix: find() returns None when the <img> is
                # absent, which previously crashed with AttributeError.
                if img is not None:
                    print("海报:" + img.get('src'))
            if item[1].strip():
                soup = BeautifulSoup(item[1].strip(), 'html.parser')
                # Every <a> in the category section is one category label.
                for category in soup.find_all('a'):
                    print(category.get_text())
            if item[2].strip():
                soup = BeautifulSoup(item[2].strip(), 'html.parser')
                title = soup.find('a', attrs={'class': 'front-link'})
                # Title link carries both the movie name and the detail URL.
                if title is not None:
                    print("电影名:" + title.get_text())
                    print("链接地址:" + title.get('href'))
                date = soup.find('p', attrs={'class': 'pinbin-date'})
                if date is not None:
                    print("日期:" + date.get_text())
                briefs = soup.find_all('p')
                # Fix: brief[1].string raised IndexError when fewer than two
                # <p> tags were present, and .string can be None for nested
                # markup -- guard both before printing.
                if len(briefs) > 1 and briefs[1].string:
                    print("简介:" + briefs[1].string)
    
    if __name__ == '__main__':
        # Script entry point: launch the browser and scrape the listing page.
        browser_get()
    
    

    相关文章

      网友评论

        本文标题:Python学习:爬个电影资源网站

        本文链接:https://www.haomeiwen.com/subject/ycarrftx.html