美文网首页python爬虫入门看这个就够了Python五期爬虫作业
【python爬虫】第十六次 xpath整站抓取阳光电影网电影资

【python爬虫】第十六次 xpath整站抓取阳光电影网电影资

作者: 急躁的假汉子 | 来源:发表于2018-01-19 17:11 被阅读10次

    一、解析电影url

    请求15题构造出的每个电影菜单的分页url,解析出每个电影url

    二、xpath解析电影资源

    对第一题解析出来的电影url进行请求,解析出最终的电影名称 电影下载地址

    import requests
    from lxml import etree

    root_url = 'http://www.ygdy8.com'
    # Fetch the site's front page; pages are served in the gb2312 encoding.
    home_resp = requests.get(root_url)
    status_code = home_resp.status_code
    home_resp.encoding = 'gb2312'
    home_sel = etree.HTML(home_resp.text)
    # First nine navigation-menu anchors on the front page.
    menu_links = home_sel.xpath('//div[@class="contain"]/ul/li[position()<10]/a')
    for menu_link in menu_links:
        menu_text = menu_link.xpath('text()')[0]
        # Skip the "classic films" menu entry.
        if menu_text == '经典影片':
            continue
        menu_url = root_url + menu_link.xpath('@href')[0]
        section_resp = requests.get(menu_url)
        section_resp.encoding = 'gb2312'
        section_sel = etree.HTML(section_resp.text)
        # Pagination banner text looks like "共N页/...": extract N as an int.
        banner = section_sel.xpath('//div[@class="co_content8"]/div[@class="x"]//text()')[1]
        total_pages = int(banner.split('/')[0].replace('共', '').replace('页', '').strip())
        # The href of the page-2 link minus "2.html" is the listing filename stem.
        stem = section_sel.xpath('//div[@class="co_content8"]/div[@class="x"]//a/@href')[0].replace('2.html', '')
        section_base = menu_url.replace('index.html', '')
        listing_urls = [section_base + stem + str(n) + '.html' for n in range(1, total_pages + 1)]
        for listing_url in listing_urls:
            listing_resp = requests.get(listing_url)
            listing_resp.encoding = 'gb2312'
            listing_sel = etree.HTML(listing_resp.text)
            # Relative URLs of every movie detail page on this listing page.
            detail_hrefs = listing_sel.xpath('//div[@class="co_content8"]/ul//a/@href')
            for detail_href in detail_hrefs:
                detail_resp = requests.get(root_url + detail_href)
                detail_resp.encoding = 'gb2312'
                detail_sel = etree.HTML(detail_resp.text)
                movie_name = detail_sel.xpath('//div[@class="co_area2"]//h1/font/text()')[0]
                movie_download_url = detail_sel.xpath('//div[@class="co_content8"]//table//a/@href')[0].split('/[')[0]
                print(movie_name, movie_download_url)
    

    结果如下

    image.png

    三、将代码封装成函数

    import requests
    from lxml import etree
    from multiprocessing import Pool
    #获取阳光电影菜单栏
    def get_menu_url(url):
        """Fetch the site front page at *url* and return the menu-bar URLs.

        Reads the first nine navigation anchors, skips the "经典影片"
        (classic films) entry, and returns each remaining href resolved
        against *url* as an absolute URL.

        :param url: site root, e.g. ``http://www.ygdy8.com``
        :return: list of absolute menu URLs (str)
        """
        resp = requests.get(url)
        # Pages on this site are served in gb2312.
        resp.encoding = 'gb2312'
        selector = etree.HTML(resp.text)
        links = selector.xpath('//div[@class="contain"]/ul/li[position()<10]/a')
        menu_urls = []
        for link in links:
            text = link.xpath('text()')[0]
            if text == '经典影片':
                continue
            # BUG FIX: the original concatenated the module-level global
            # ``root_url`` here, ignoring the *url* parameter; resolve the
            # relative href against the URL that was actually requested.
            menu_urls.append(url + link.xpath('@href')[0])
        return menu_urls
    
    def get_page_url(urls):
        """For each menu URL in *urls*, return every listing-page URL.

        Each section page shows a pagination banner like "共N页/...";
        this builds the URLs for pages 1..N of every section.

        :param urls: iterable of absolute menu URLs
        :return: flat list of listing-page URLs (str)
        """
        page_urls = []
        for menu_url in urls:
            resp = requests.get(menu_url)
            resp.encoding = 'gb2312'
            sel = etree.HTML(resp.text)
            # Extract the page count N from the "共N页/..." banner text.
            banner = sel.xpath('//div[@class="co_content8"]/div[@class="x"]//text()')[1]
            total = int(banner.split('/')[0].replace('共', '').replace('页', '').strip())
            # The page-2 href minus "2.html" yields the listing filename stem.
            stem = sel.xpath('//div[@class="co_content8"]/div[@class="x"]//a/@href')[0].replace('2.html', '')
            base = menu_url.replace('index.html', '')
            page_urls.extend(base + stem + str(i) + '.html' for i in range(1, total + 1))
        return page_urls
    
    def get_movie_url(url):
        """Crawl one listing page and print each movie's name and download link.

        Requests every movie detail page linked from the listing page at
        *url* and prints ``name download_url`` for each; returns ``None``.

        :param url: absolute URL of one listing page
        """
        root_url = 'http://www.ygdy8.com'
        listing_resp = requests.get(url)
        listing_resp.encoding = 'gb2312'
        listing_sel = etree.HTML(listing_resp.text)
        for href in listing_sel.xpath('//div[@class="co_content8"]/ul//a/@href'):
            detail_resp = requests.get(root_url + href)
            detail_resp.encoding = 'gb2312'
            detail_sel = etree.HTML(detail_resp.text)
            name = detail_sel.xpath('//div[@class="co_area2"]//h1/font/text()')[0]
            download = detail_sel.xpath('//div[@class="co_content8"]//table//a/@href')[0]
            print(name, download)
    
    
    
    if __name__ == '__main__':
        root_url = 'http://www.ygdy8.com'
        # Navigation-menu URLs scraped from the front page.
        menu_urls = get_menu_url(root_url)
        # All listing-page URLs under those menus.
        page_urls = get_page_url(menu_urls)
        # Crawl listing pages with 4 worker processes. FIX: the original
        # created Pool(4) and never closed/joined it; the context manager
        # terminates the pool once map() completes.
        with Pool(4) as p:
            p.map(get_movie_url, page_urls)
    

    结果如下图所示

    image.png

    相关文章

      网友评论

        本文标题:【python爬虫】第十六次 xpath整站抓取阳光电影网电影资

        本文链接:https://www.haomeiwen.com/subject/kizooxtx.html