美文网首页
爬取百度贴吧帖子内每个人的发言

爬取百度贴吧帖子内每个人的发言

作者: yousa_ | 来源:发表于2019-09-26 13:41 被阅读0次
    import re
    # Regular-expression module, used to extract post text from the page HTML
    import urllib.request
    # 用来获取HTML源码
    import sys
    import os
    import re
    from bs4 import BeautifulSoup
    
    
    def geturls(path):
        """Read the URL list file at ``path`` and return the URLs it contains.

        Every line is expected to start with a three-character prefix that is
        not part of the URL (presumably an index or marker -- confirm against
        the input file). The prefix and the trailing line terminator are
        removed.

        Args:
            path: Path of a UTF-8 encoded text file holding one entry per line.

        Returns:
            A list with exactly one string per line of the file, in file
            order. Lines shorter than the prefix produce an empty string, so
            list positions always match line positions.
        """
        with open(path, 'r', encoding='utf-8') as f1:
            # Strip only the line terminator (not other whitespace) so the
            # returned URL does not carry a stray '\n' at its end.
            urls = [line.rstrip('\r\n')[3:] for line in f1]
        print('get urls list ready!')
        return urls
    
    def getHtml(urls, start=546):
        """Lazily download the given URLs and yield each raw response body.

        Args:
            urls: Sequence of URL strings.
            start: Non-negative index of the first URL to fetch. The default
                of 546 is the offset that used to be hard-coded here
                (presumably the point where an earlier run stopped -- confirm).

        Yields:
            bytes: The undecoded body of every URL that could be fetched, in
            list order. URLs that fail to open or read are reported on stdout
            and skipped.
        """
        for num, url in enumerate(urls[start:], start):
            print('第'+str(num)+'条')
            try:
                # The context manager closes the response even if read() fails.
                with urllib.request.urlopen(url) as page:
                    html = page.read()
            except (OSError, ValueError) as e:
                # URLError/HTTPError are OSError subclasses; ValueError covers
                # malformed or empty URLs. An exception escaping a generator
                # ends it for good, so handle the failure here to keep going.
                print(' error~, continue')
                print(e)
                continue
            yield html
        print('get htmls list ready!')
    
    
    
    
    def getcontents(htmls, dir):
        """Extract post text from each page and append it to ``dir/contents.txt``.

        For every page, the inner text of each ``<div id="post_content_...">``
        element is collected. Posts whose text matches the filter pattern
        (nested ``div`` markup or image references) are dropped; the remaining
        posts of a page are written back to back, followed by one newline.

        Args:
            htmls: Iterable of UTF-8 encoded page bodies (bytes).
            dir: Output directory; created, including parents, if missing.

        Returns:
            None. Pages that cannot be decoded as UTF-8 are reported on stdout
            and skipped without writing anything.
        """
        os.makedirs(dir, exist_ok=True)

        # Compile both patterns once instead of once per page.
        contgre = re.compile(r'<div|<.jpg|.png|div|img')
        contentre = re.compile(r'<div id="post_content_.*?>(.*?)</div>')

        # Write UTF-8 explicitly: the text is decoded as UTF-8, and the platform
        # default encoding may be unable to represent it.
        with open(os.path.join(dir, 'contents.txt'), 'a', encoding='utf-8') as f1:
            # A plain for-loop visits every page exactly once and ends cleanly
            # when the iterable is exhausted.
            for html in htmls:
                try:
                    contentList = contentre.findall(html.decode('utf-8'))
                except UnicodeDecodeError as e:
                    print(' error~, continue')
                    print(e)
                    continue
                for content in contentList:
                    if not contgre.search(content):
                        f1.write(content)
                f1.write('\n')
                print('共' + str(len(contentList)) + '条文本')
    
    
    
    
    def main():
        """Run the pipeline: read the URL list, fetch the pages, save the text."""
        # The page generator is lazy; downloads happen while getcontents iterates.
        page_stream = getHtml(geturls('_防诈骗.txt'))
        getcontents(page_stream, 'content')
    
    
    # Start the scraper only when this file is executed directly as a script.
    if __name__ == "__main__":
        main()
    

    相关文章

      网友评论

          本文标题:爬取百度贴吧帖子内每个人的发言

          本文链接:https://www.haomeiwen.com/subject/ekbuyctx.html