美文网首页
爬取梨视频2018-11-02

爬取梨视频2018-11-02

作者: Mr_Du_Biao | 来源:发表于2018-11-02 17:22 被阅读0次

    爬虫文件

    # -*- coding: utf-8 -*-
    import scrapy
    import time
    from selenium import webdriver
    from lxml import etree
    import re
    from liVedioPro.items import LivedioproItem
    class LidemoSpider(scrapy.Spider):
        """Spider for pearvideo.com: renders the category page in a headless
        browser so the JS-appended "load more" videos are present, extracts each
        video's detail-page URL, then downloads the raw video file.

        NOTE(review): PhantomJS support was removed from recent Selenium
        releases; consider migrating to headless Chrome/Firefox.
        """
        name = 'liDemo'
        #allowed_domains = ['www.xxx.com']
        start_urls = ['http://www.pearvideo.com/category_6']

        def getpageSource(self):
            """Render the category page, scroll to trigger lazy loading, click
            the "load more" button if present, and return the resulting HTML.
            """
            bro = webdriver.PhantomJS(
                executable_path=r'C:\Users\Administrator\Desktop\12期爬虫授课\part_one\4.selenium&phantomjs\phantomjs-2.1.1-windows\bin\phantomjs.exe')
            try:
                bro.get(url=self.start_urls[0])
                time.sleep(3)
                # Scroll to the bottom twice so lazy-loaded entries render.
                js = "window.scrollTo(0,document.body.scrollHeight)"
                bro.execute_script(js)
                time.sleep(2)
                bro.execute_script(js)
                time.sleep(2)
                # find_element_by_id raises NoSuchElementException when the id
                # is absent, so the original `if a:` guard could never run on a
                # missing button; use the plural form and test the list instead.
                more_buttons = bro.find_elements_by_id("listLoadMore")
                if more_buttons:
                    more_buttons[0].click()
                # Page source now contains the additional videos loaded above.
                return bro.page_source
            finally:
                # Always release the browser process, even if rendering failed.
                bro.quit()

        def start_requests(self):
            """Entry point: render the page with Selenium, parse out the
            detail-page URLs, and schedule a request for each one."""
            page_text = self.getpageSource()
            for url in self.myParse(page_text):
                yield scrapy.Request(url=url, callback=self.getSecondPage)

        def getVideoData(self, response):
            """Final callback: package the raw video bytes into an item."""
            item = LivedioproItem()
            item['videoData'] = response.body
            # The last URL segment (e.g. "xxx.mp4") serves as the file name.
            item['name'] = response.url.split('/')[-1]
            yield item

        def getSecondPage(self, response):
            """Detail-page callback: the real video URL is embedded in an
            inline script as srcUrl="...", so extract it with a regex and
            request the video file itself."""
            page_text = response.text
            video_url = re.findall('srcUrl="(.*?)",', page_text, re.S)[0]
            yield scrapy.Request(url=video_url, callback=self.getVideoData)

        def myParse(self, pageText):
            """Extract every video detail-page URL from the rendered list page."""
            tree = etree.HTML(pageText)
            urls_list = []
            for li in tree.xpath('//li[@class="categoryem"]'):
                href = li.xpath('./div/a/@href')
                # Skip list entries without a link (e.g. ad placeholders).
                if not href:
                    continue
                urls_list.append("http://www.pearvideo.com/" + href[0])
            return urls_list
    

    item 文件

    
    import scrapy
    
    
    class LivedioproItem(scrapy.Item):
        """Item carrying one downloaded video: its file name and raw bytes."""
        # define the fields for your item here like:
        name = scrapy.Field()       # file name, taken from the last URL segment
        videoData = scrapy.Field()  # raw video bytes (response.body)
    

    管道文件

    import os
    class LivedioproPipeline(object):
        """Pipeline that writes each item's video bytes into PearVideo/<name>."""

        def open_spider(self, spider):
            # Create the output folder once at spider startup. exist_ok avoids
            # the race between the existence check and mkdir in the original.
            os.makedirs('PearVideo', exist_ok=True)

        def process_item(self, item, spider):
            # item['name'] is the video file name; item['videoData'] is raw bytes.
            filePath = os.path.join('PearVideo', item['name'])
            with open(filePath, 'wb') as fp:
                fp.write(item['videoData'])
                print(filePath + " 下载成功!")
            return item
    

    相关文章

      网友评论

          本文标题:爬取梨视频2018-11-02

          本文链接:https://www.haomeiwen.com/subject/ivmcxqtx.html