Scraping every user's replies from a Baidu Tieba thread

Author: yousa_ | Published 2019-09-26 13:41
import os
import re                  # regular expressions, used to extract post content from the HTML
import urllib.request      # used to fetch the HTML source of each thread page


def geturls(path):
    urls = []
    with open(path, 'r', encoding='utf-8') as f1:
        for url in f1.readlines():
            # skip the 3-character prefix on each line and strip the trailing newline
            urls.append(url[3:].strip())
    print('get urls list ready!')
    return urls
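
# A minimal sketch of what each line of the input URL file is assumed to look
# like (the 3-character prefix is an inference from the url[3:] slice above):
#
#   1. https://tieba.baidu.com/p/1234567890
#   2. https://tieba.baidu.com/p/2345678901
#
# geturls() then returns the bare thread URLs with the prefixes stripped.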

def getHtml(urls):
    # start index for resuming an interrupted crawl; set to 0 to start over
    num = 546
    for url in urls[num:]:
        print('item ' + str(num))
        page = urllib.request.urlopen(url)
        html = page.read()
        num += 1
        yield html
    print('get htmls list ready!')
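
# Usage sketch: getHtml is a generator, so nothing is downloaded until it is
# consumed; one urlopen call happens per iteration. Note that with num = 546,
# a URL list shorter than 547 entries yields nothing.
#
#   pages = getHtml(urls)   # no network traffic yet
#   html = next(pages)      # the HTTP request for urls[546] happens here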


def getcontents(htmls, out_dir):
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # crude filter: drop any reply that still contains markup or image references
    contgre = re.compile(r'<div|<.jpg|.png|div|img')
    # each reply on the page sits inside a <div id="post_content_..."> element
    contentre = re.compile(r'<div id="post_content_.*?>(.*?)</div>')

    with open(out_dir + '/contents.txt', 'a', encoding='utf-8') as f1:
        # consume the generator with a for loop, so every page is processed
        # exactly once and exhaustion ends the loop cleanly
        for html in htmls:
            try:
                contentList = re.findall(contentre, html.decode('utf-8'))
                for content in contentList:
                    if not re.findall(contgre, content):
                        f1.write(content)
                f1.write('\n')
                print(str(len(contentList)) + ' posts in this page')
            except Exception as e:
                print(' error~, continue')
                print(e)

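# Illustrative Tieba markup the content regex is written against (a hand-made
# sample, not captured from a live page):
#
#   <div id="post_content_12345" class="d_post_content j_d_post_content">大家注意防范电信诈骗</div>
#
# re.findall(contentre, html) captures the inner text of each such div; the
# contgre filter then rejects captures that still contain div/img markup.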

def main():
    content_dir = 'content'
    urls = geturls('_防诈骗.txt')
    htmls = getHtml(urls)
    getcontents(htmls, content_dir)


if __name__ == '__main__':
    main()
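
In short, the pipeline is: geturls() reads the thread URLs from _防诈骗.txt, getHtml() lazily downloads each page (resuming from index 546), and getcontents() extracts every post_content div into content/contents.txt with a regex, skipping replies that still contain markup or image references.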
