美文网首页
下载36氪 播氪的音频

下载36氪 播氪的音频

作者: Do_More | 来源:发表于2017-07-26 17:26 被阅读0次
    image.png
    #coding=utf-8
    import re,urllib2,os,urllib,requests
    
    def getHtmlCode(url):
        """Fetch ``url`` and return the raw response body.

        Args:
            url: Address passed straight to ``urllib2.urlopen``.

        Returns:
            Whatever ``response.read()`` yields for the opened URL.

        Any exception raised by ``urllib2.urlopen`` or ``read`` propagates to
        the caller unchanged.
        """
        response = urllib2.urlopen(url)
        try:
            return response.read()
        finally:
            # Close explicitly: this function is called once per article, and
            # an unclosed response keeps its socket open until garbage
            # collection gets around to it.
            response.close()
    
    def getEntityId(htmlString):
        """Return every capture that precedes a ``","goods_id`` marker.

        The capture starts right after the first available double quote and
        runs up to the quote that is followed by ``,"goods_id``. It can
        therefore contain earlier text and embedded quotes; the ``__main__``
        block takes the part after the last quote as the actual id.

        Args:
            htmlString: Text to scan.

        Returns:
            A list of captured strings, empty when nothing matches.
        """
        pattern = "\"(.+?)\",\"goods_id"
        return re.findall(pattern, htmlString)
    
    def getMp3Url(htmlString):
        """Return the mp3 URLs found in ``htmlString``, without the extension.

        A URL is recognised when it follows the literal text ``mp3","url":"``
        and ends with ``.mp3``. The trailing ``.mp3`` is not part of the
        returned value; callers append it again.

        Args:
            htmlString: Text to scan.

        Returns:
            A list of URL strings (minus ``.mp3``), empty when nothing matches.
        """
        # The dot before "mp3" is escaped so that only a real ".mp3" suffix
        # ends the capture. Unescaped, any character followed by "mp3"
        # (e.g. ".../amp3/...") would cut the URL short.
        regMp3Url = re.compile(r'mp3","url":"(.+?)\.mp3')
        return regMp3Url.findall(htmlString)
    
    def getTitle(htmlString):
        """Collect the values of every ``"title":"..."`` pair in ``htmlString``.

        Each value runs from the opening quote after ``"title":`` to the next
        double quote.

        Args:
            htmlString: Text to scan.

        Returns:
            A list of title strings in order of appearance, empty when none
            are found.
        """
        titlePattern = "\"title\":\"(.+?)\""
        return [found.group(1) for found in re.finditer(titlePattern, htmlString)]
    
    def getPublishTime(htmlString):
        """Extract the leading part of every ``published_at`` value.

        The capture starts after ``published_at":"`` and stops at the first
        space, so only the text before that space is returned.

        Args:
            htmlString: Text to scan.

        Returns:
            A list of captured strings, empty when nothing matches.
        """
        publishedPattern = "published_at\":\"(.+?) "
        return re.findall(publishedPattern, htmlString)
    
    if __name__ == '__main__':
        # Script entry point: read the user page, visit every article it
        # references, and download each article's mp3 into the local "audios"
        # directory, skipping files that are already there.
        url = 'http://36kr.com/user/947181171'
        audioDir = 'audios'

        # urlretrieve opens the target file directly and will fail if the
        # directory is missing, so create it up front.
        if not os.path.isdir(audioDir):
            os.makedirs(audioDir)

        htmlCode = getHtmlCode(url)
        entityIds = getEntityId(htmlCode)
        for content in entityIds:
            # getEntityId may return leading text with embedded quotes; the
            # id itself is whatever follows the last quote.
            entityId = content.split("\"")[-1]
            contentUrl = 'http://36kr.com/p/' + entityId + '.html'
            contentHtmlCode = getHtmlCode(contentUrl)
            mp3Urls = getMp3Url(contentHtmlCode)
            titles = getTitle(contentHtmlCode)
            publishTimes = getPublishTime(contentHtmlCode)

            # Skip the article instead of crashing with IndexError when any
            # of the three pieces is missing.
            if not (mp3Urls and titles and publishTimes):
                print('skipped ' + contentUrl + ' (no audio, title or publish time found)')
                continue

            # A "/" in the title would otherwise be read as a path separator.
            fileName = (publishTimes[0] + ' ' + titles[0]).replace('/', '_')
            print(fileName)
            mp3Url = mp3Urls[0] + '.mp3'
            print(mp3Url)

            # Use one path for both the existence check and the download, so
            # the "already downloaded" test looks where files are written.
            targetPath = os.path.join(audioDir, fileName + '.mp3')
            if not os.path.isfile(targetPath):
                urllib.urlretrieve(mp3Url, targetPath)
                print(fileName + '------done')
    

    相关文章

      网友评论

          本文标题:下载36氪 播氪的音频

          本文链接:https://www.haomeiwen.com/subject/yqeokxtx.html