Using the Python 3 built-in library urllib with BeautifulSoup

Author: leslie_aLIE | Published 2018-06-28 11:18

    # coding=utf-8

    '''
    download_html: takes a url, returns the html and a BeautifulSoup instance
    spider:        takes html/soup, returns the url queue and title queue, or the text data
    process_data:  cleans the strings and saves the data
    controller:    drives the process and calls the functions above
    '''

    __author__ = 'Leslie'

    from urllib.request import urlopen
    from bs4 import BeautifulSoup
    import re, collections, os

    # Take a url, return the html and a BeautifulSoup instance
    def download_html(url):
        html = urlopen(url).read().decode('utf-8')  # fetch the page data
        soup = BeautifulSoup(html, 'lxml')          # instantiate BeautifulSoup
        return html, soup
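    # Aside (not part of the original post): 'lxml' is a third-party parser and has
    # to be installed separately (pip install lxml). If it is missing, the standard
    # library parser is a drop-in replacement here:
    #
    #     soup = BeautifulSoup(html, 'html.parser')
    #
    # Note also that urlopen() raises urllib.error.URLError / HTTPError on network
    # problems, which this script does not catch.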

    # spider: given only soup, return the url queue and title queue; given html and soup, return the text data
    def spider(html=False, soup=False):
        # Scrape the urls and titles from the index page
        if not html and soup:
            queue_url = collections.deque()    # queue of urls
            queue_title = collections.deque()  # queue of chapter titles
            # Locate the elements and pull the href and title attributes off the <a> tags
            for item in soup.find_all("div", {"class": "box"}):
                for Alabel in item.find_all("a"):
                    queue_url.append(Alabel["href"])
                    # Strip the redundant characters from the title string
                    Str1 = Alabel["title"]
                    Str2 = '_盗墓笔记9在线阅读_盗墓笔记全集'
                    if Str2 in Str1:
                        Str1 = Str1.replace(Str2, '')
                    index = Str1.index(']')
                    Str1 = Str1[index + 1:].strip()
                    queue_title.append(Str1)
            return queue_url, queue_title
        # Scrape the chapter text
        if html and soup:
            all_p_label = soup.find("div", class_="content-body").find_all("p")
            return all_p_label
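    # Aside (illustration only, not part of the original code): the two deques above
    # behave as FIFO queues, so chapters are visited in the order they appear on the
    # index page:
    #
    #     q = collections.deque()
    #     q.append('chapter 1')
    #     q.append('chapter 2')
    #     q.popleft()   # -> 'chapter 1'
    #
    # If a chapter page has no div.content-body, soup.find() returns None and the
    # chained find_all() raises AttributeError; the script assumes the markup is
    # always present.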

    # Clean the strings and save the data
    def process_data(Data, title):
        # Strip characters that are not allowed in file names: [\/?:*<>"|]
        while '\\' in title:
            index = title.index('\\')
            title = title[:index] + title[index + 1:]
        matchList = re.findall('[/?:*<>"|]*', title)
        matchStr = ''.join(matchList)  # e.g. '?><'
        title = list(title)
        for j in matchStr:
            title.remove(j)
        title = ''.join(title)
        # Absolute path of the file to save
        abspath = os.path.join(os.path.abspath(r'.\daomubiji1'), title)
        # Strip watermark strings from the text, e.g. www.seputu.com
        CMP = re.compile(r"(http://)?([a-zA-Z]+\.)+com")  # compiled regular expression object
        for i in Data:
            each_string = str(i.string).replace(" ", "").strip()
            if each_string != "None":
                Match = re.search(CMP, each_string)  # look for a watermark
                # Append the text to the txt file
                with open(abspath, 'a', encoding='utf-8') as fp:
                    if Match is not None:
                        Newstring = each_string[:Match.span()[0]]
                        fp.write(Newstring + '\n')
                    else:
                        fp.write(each_string + '\n')
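    # Aside (alternative sketch, not the author's code): the character-by-character
    # removal above can be written as a single substitution that strips every
    # character Windows forbids in file names:
    #
    #     title = re.sub(r'[\\/?:*<>"|]', '', title)
    #
    # The watermark pattern (http://)?([a-zA-Z]+\.)+com matches strings such as
    # 'www.seputu.com'; the slice each_string[:Match.span()[0]] keeps only the text
    # that comes before the match.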

    # Drive the whole process
    def controller(url):
        # Get the url queue and the file-name titles to scrape
        html, soup = download_html(url)
        queue_url, queue_title = spider(soup=soup)
        # Keep popping urls until the queue is empty
        while queue_url:
            url = queue_url.popleft()
            title = queue_title.popleft() + '.txt'
            print(title, url)
            html, soup = download_html(url)
            text_data = spider(html, soup)
            process_data(text_data, title)

    url = r'http://www.seputu.com/'
    os.mkdir(os.path.abspath(r'.\daomubiji1'))
    controller(url)
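One practical note: os.mkdir raises FileExistsError when .\daomubiji1 already exists, so rerunning the script fails immediately. A small guard (a sketch, not part of the original post) avoids that:

    save_dir = os.path.abspath(r'.\daomubiji1')
    os.makedirs(save_dir, exist_ok=True)  # create the folder only if it is missing
    controller(r'http://www.seputu.com/')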
