
Python Learning Notes 1 - Jikexueyuan (极客学院) Video Crawler

Author: zenos876 | Published 2019-03-06 22:07
    Background:

    The Jikexueyuan site has a knowledge-system map under its course tags. Since the learning paths are fairly complete, I bought a one-month VIP, intending to download the course series and work through them later, which is how the crawler below came about. (For personal study only, please do not use it commercially; I will take it down on request.)

    Technical documentation:

    To learn Python web crawling, you will of course need the Requests user guide and the Beautiful Soup documentation.
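
    As a quick warm-up, this is roughly how the two libraries fit together (a minimal sketch, not part of the crawler itself):

    import requests
    from bs4 import BeautifulSoup

    # fetch a page and parse it; the course-path page is just an example URL
    r = requests.get('https://www.jikexueyuan.com/path/', timeout=20)
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'html.parser')
    print(soup.title.string)   # prints the page title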

    Code:
    • config.py - basic crawler settings
    proxies = {
        'https': '42.123.125.181:8088',
    }
    headers = {
        # fill in your own cookie here
        'Cookie': '_uab_collina=xxxxx; PHPSESSID=xxxxx; jkxyid_v2=xxxx; _ga=xxxx; _gid=xxxx; gr_user_id=xxxx; uname=xxxxx; uid=xxxx; code=xxxx; authcode=xxxxx',
       
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    }
    KnowledgeSystemUrl = 'https://www.jikexueyuan.com/path/'
    

    To work around the site's anti-crawling limits on requests from a single local IP, a proxies entry with a proxy server address is added; servers can be found on domestic high-anonymity proxy lists.
    Downloading some videos requires a VIP login, so the request header is copied from a browser session that is already logged in to Jikexueyuan, and its cookie is reused to skip the login step (the header of any HTTP request can be inspected in the browser's developer tools).
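
    Before running the full crawler it is worth checking that the proxy and cookie actually work; a small sanity check (just a sketch, assuming the placeholders in config.py have been replaced with real values):

    import requests
    from config import headers, proxies, KnowledgeSystemUrl

    # request the knowledge-system page through the configured proxy, sending the saved cookie
    r = requests.get(KnowledgeSystemUrl, headers=headers, proxies=proxies, timeout=20)
    print(r.status_code)   # 200 means the proxy is reachable and the request went through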

    • crawl.py - fetch HTML
    import requests
    from config import headers
    from config import proxies
    class Crawl(object):
        """Thin wrapper around requests, shared by the other crawler classes."""

        def getText(self, url):
            # fetch a page and return its HTML text, decoded with the detected encoding
            try:
                r = requests.get(url, headers=headers, proxies=proxies, timeout=20)
                r.encoding = r.apparent_encoding
                print(r.status_code, 'request')
                self.html = r.text
                return r.text
            except requests.RequestException:
                return 'getText error'

        def getResponse(self, url):
            # fetch a page and return the raw Response object
            try:
                r = requests.get(url, headers=headers, proxies=proxies, timeout=20)
                r.encoding = r.apparent_encoding
                print(r.status_code, 'request')
                return r
            except requests.RequestException:
                return 'getResponse error'
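
    Crawl is only used as a base class later on, but it can be tried on its own once config.py is filled in (illustrative only):

    from crawl import Crawl

    c = Crawl()
    html = c.getText('https://www.jikexueyuan.com/path/')
    print(len(html))   # length of the fetched HTML, or of the string 'getText error' on failure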
    
    • KnowledgeSystem.py - get the list of knowledge paths
    from config import KnowledgeSystemUrl
    from crawl import Crawl
    from bs4 import BeautifulSoup
    class KnowledgeSystem(Crawl):
        class listData():
            nameList = []
            srcList = []
                
        def getList(self):
            try:
                html = self.getText(KnowledgeSystemUrl)
                soup = BeautifulSoup(html,'html.parser')
                print('---- Looking up the knowledge-system list ----')
                srcList = []
                nameList = []
                index = 1
                cf = soup.find_all(attrs = 'pathlist-one cf')
                for member in cf:
                    h2 = member.find('h2')
                    print('%d  '%(index) + h2.string)
                    nameList.append(h2.string)
                    srcList.append('https:' + member['href'])
                    index = index + 1
                
                ld = self.listData()
                ld.nameList = nameList
                ld.srcList = srcList
                return ld
            except:
                print('getList error')
    
        def sellect(self):
            n = input('----- Enter the number of the knowledge path you want to download -----\n')
            return int(n)
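
    A note on the parsing: soup.find_all(attrs = 'pathlist-one cf') works because Beautiful Soup treats a plain string passed to attrs as a filter on the class attribute, i.e. it behaves like class_='pathlist-one cf'. A self-contained illustration (the markup here is made up and may differ from the real page):

    from bs4 import BeautifulSoup

    demo = ('<a class="pathlist-one cf" href="//www.jikexueyuan.com/path/android/">'
            '<h2>Android</h2></a>')
    soup = BeautifulSoup(demo, 'html.parser')
    for member in soup.find_all(attrs='pathlist-one cf'):
        # same fields getList extracts: the path name and its absolute URL
        print(member.find('h2').string, 'https:' + member['href'])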
    
    • courseList.py - get the course list for each chapter
    from crawl import Crawl
    from bs4 import BeautifulSoup
    class CourseList(Crawl):
        class CourseData:
            chapterName = ''
            lessonNameList = []
            lessonSrcList = []
    
        class CourseList:
            # holds one CourseData instance per chapter
            chapterList = []
    
        def getCourse(self,url):
            print('------- Getting the course info for this series ---------')
            chapterListHtml = self.getText(url)
            chapterListSoup = BeautifulSoup(chapterListHtml,'html.parser')
    
            temp = chapterListSoup.find_all(attrs = 'pathstage mar-t30')
            self.CourseList.chapterList = []
            for each in temp:
                # get the chapter name of this stage and store it in a CourseData instance
                CD = self.CourseData()
                CD.chapterName = each.find('h2').string

                lessonInfoList = each.find_all(attrs = 'lesson-info-h2')
                index = 1
                # reset the course-name list and the course-url list
                CD.lessonNameList = []
                CD.lessonSrcList = []
                for info in lessonInfoList:
                    # get the course name and store it in the CourseData name list
                    courseName = str(index) + '.' + info.string
                    CD.lessonNameList.append(courseName)

                    # get the course url and store it in the CourseData url list
                    lessonSrc = 'https:'+ info.a['href']
                    CD.lessonSrcList.append(lessonSrc)
                    index = index + 1
                # save the finished CourseData instance in chapterList
                self.CourseList.chapterList.append(CD)
    
        def printChapterNameList(self):
            print('----- The following chapters were found in this knowledge path: -----')
            for each in self.CourseList.chapterList:
                print(each.chapterName)
    
        def printLessonNameList(self):
            index = 0
            for each in self.CourseList.chapterList:
                for lessonName in each.lessonNameList:
                    print(lessonName)
                index = index + 1
                    
        def printLessonSrcList(self):
            index = 0
            for each in self.CourseList.chapterList:
                for lessonSrc in each.lessonSrcList:
                    print(lessonSrc)
                index = index + 1
    
    • section.py - get the section (video) list for each course
    from crawl import Crawl 
    from bs4 import BeautifulSoup
    import bs4
    class Section(Crawl):
        class SectionData:
            sectionNameList = []
            sectionSrcList = []
    
        def getSection(self,url):
            print('-------- Getting the section info for this course --------')
            lessonHtml = self.getText(url)
            soup = BeautifulSoup(lessonHtml,'html.parser')
            temp = soup.find(attrs='lessonvideo-list')
            # the page sometimes comes back without the video list (failed request or
            # anti-crawling page); keep re-fetching until the tag is actually there
            while(isinstance(temp,bs4.element.Tag) == False):
                lessonHtml = self.getText(url)
                soup = BeautifulSoup(lessonHtml,'html.parser')
                print('isinstance(temp,bs4.element.Tag) == False')
                temp = soup.find(attrs='lessonvideo-list')
            aTag = temp.find_all('a')
    
            self.SectionData.sectionNameList = []
            self.SectionData.sectionSrcList = []
            for each in aTag:
                #print(each.string)
                #print('https:' + each['href']) 
                self.SectionData.sectionNameList.append(each.string)
                self.SectionData.sectionSrcList.append('https:' + each['href'])
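
    One caveat about the while loop above: if the page never contains lessonvideo-list (an expired cookie, for example), getSection will retry forever. A bounded retry helper would be safer; the function below is my own sketch, not part of the original code:

    import bs4
    from bs4 import BeautifulSoup

    def findWithRetry(crawler, url, className, maxRetries=5):
        # re-fetch the page up to maxRetries times until a tag with the wanted class shows up
        for attempt in range(maxRetries):
            soup = BeautifulSoup(crawler.getText(url), 'html.parser')
            tag = soup.find(attrs=className)
            if isinstance(tag, bs4.element.Tag):
                return tag
            print('retry %d: %s not found yet' % (attempt + 1, className))
        raise RuntimeError('gave up fetching ' + url)

    getSection could then replace its open-ended loop with temp = findWithRetry(self, url, 'lessonvideo-list').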
    
    • download.py - download the videos
    from crawl import Crawl
    from section import Section
    from bs4 import BeautifulSoup
    import bs4
    import os
    import requests
    class Download(Crawl):
        class DownloadData:
            sourceList = []
            nameList = []
        
        def findVideoSrc(self,SectionData):
            print('----- Getting the video links for this course -------')
            self.DownloadData.sourceList = []
            self.DownloadData.nameList = SectionData.sectionNameList

            for Src in SectionData.sectionSrcList:
                html = self.getText(Src)
                soup = BeautifulSoup(html,'html.parser')
                sourceTag = soup.find('source')
                # re-fetch until the page actually contains a <source> tag with the video url
                while(isinstance(sourceTag,bs4.element.Tag) == False):
                    print('isinstance(sourceTag,bs4.element.Tag) == False')
                    html = self.getText(Src)
                    soup = BeautifulSoup(html,'html.parser')
                    sourceTag = soup.find('source')
                source = sourceTag['src']
                #print(source)
                self.DownloadData.sourceList.append(source)
    
        def makeDir(self,dirName):
            print('------- Creating directory: %s ------'%dirName)
            try:
                if(os.path.exists(dirName)):
                    return dirName
                else:
                    os.mkdir(dirName)
                    return dirName
            except:
                print('The directory to be created was: '+ dirName)
                dirName = input('Creation failed, please enter a path manually: ')
                dirName = self.makeDir(dirName)
                return dirName
    
        def saveVideoFile(self,path,videoName,videoSrc):
            videoFilePath = path +'/'+ videoName + '.mp4'
            if(os.path.exists(videoFilePath)):
                print('        ' + 'Video already exists.    %s'%(videoName))
                return
            else:
                video = requests.get(videoSrc)
                print('        ' + 'Downloading video    %s'%(videoName))
                f = open(videoFilePath, 'ab')
                print('        ' + 'Saving video    %s'%(videoName))
                f.write(video.content)
                f.close()
    
        def downloadVideo(self,path):
            path = self.makeDir(path)
            for i in range(len(self.DownloadData.sourceList)):
                self.saveVideoFile(path,self.DownloadData.nameList[i],self.DownloadData.sourceList[i])
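
    saveVideoFile reads the whole video into memory via video.content before writing it to disk. For long videos a streamed download is lighter on memory; a possible variant (my own sketch using the same requests library, not the original code):

    import requests

    def saveVideoFileStreaming(path, videoName, videoSrc):
        videoFilePath = path + '/' + videoName + '.mp4'
        # stream=True defers the body; iter_content then reads it chunk by chunk
        with requests.get(videoSrc, stream=True, timeout=20) as video:
            with open(videoFilePath, 'wb') as f:
                for chunk in video.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)   # write roughly 1 MB at a time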
    
    
    • main.py - main program
    import sys
    import io
    from KnowledgeSystem import KnowledgeSystem
    from courseList import CourseList
    from section import Section
    from download import Download
    import os
    #sys.stdout=io.TextIOWrapper(sys.stdout.buffer,encoding='utf8')   # uncomment to force UTF-8 console output
    
    if __name__ == '__main__':
        # instantiate the classes
        KS = KnowledgeSystem()
        KSLD = KS.listData()

        # get the list of knowledge paths
        KSLD = KS.getList()

        # let the user pick the knowledge path to download
        num = KS.sellect()

        # the name and the source url of the chosen path
        ksName = KSLD.nameList[num - 1]
        ksSrc = KSLD.srcList[num - 1]

        # get all courses of that path
        CL = CourseList()
        CL.getCourse(ksSrc)
        CL.printChapterNameList()
        
        sec = Section()
        dld = Download()
        pathTemp = './'+ksName
        pathTemp = dld.makeDir('./'+ksName)   # e.g. ./Android
        for each in CL.CourseList.chapterList:
            pathTemp2 = dld.makeDir(pathTemp + '/' + each.chapterName)   # e.g. ./Android/1.环境搭建
            for i in range(len(each.lessonSrcList)):
                path = pathTemp2 + '/' + each.lessonNameList[i]   # e.g. ./Android/1.环境搭建/1.Android 集成开发环境搭建
                sec.getSection(each.lessonSrcList[i])
                videoFilePath = path +'/'+ sec.SectionData.sectionNameList[len(sec.SectionData.sectionNameList)-1] + '.mp4'
                if(os.path.exists(videoFilePath)):
                    print('File already exists, skipping     %s'%videoFilePath)
                    pass
                else:
                    dld.findVideoSrc(sec.SectionData)
                    dld.downloadVideo(path)
    
    
        print('download successful')
    
    Result preview:
    [Screenshots: picking the desired resource, videos being downloaded, and the downloaded video files]
