Using Python 3's built-in urllib library with BeautifulSoup

Author: leslie_aLIE | Published 2018-06-28 11:18

# coding=utf-8

'''
download_html: takes a url and returns the html and a BeautifulSoup instance
spider: takes the html and returns urls and data
process_data: cleans strings and saves the data
controller: drives the crawl and calls the other functions
'''

__author__ = 'Leslie'

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re, collections, os

# Take a url, return the html and a BeautifulSoup instance
def download_html(url):
    html = urlopen(url).read().decode('utf-8')  # fetch the page data
    soup = BeautifulSoup(html, 'lxml')  # instantiate BeautifulSoup
    return html, soup
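# Note: the 'lxml' parser depends on the third-party lxml package; if it is not
# installed, the built-in parser (BeautifulSoup(html, 'html.parser')) should also
# work here, since the script only uses basic tag and attribute lookups.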

# spider takes the html and returns a url queue and a title queue, or the chapter data
def spider(html=False, soup=False):
    # Crawl the urls and titles from the index page
    if not html and soup:
        queue_url = collections.deque()    # queue of urls
        queue_title = collections.deque()  # queue of titles
        # Locate the elements and extract the href and title attributes of the <a> tags
        for item in soup.find_all("div", {"class": "box"}):
            for Alabel in item.find_all("a"):
                queue_url.append(Alabel["href"])
                # Strip the redundant characters from the title string
                Str1 = Alabel["title"]
                Str2 = '_盗墓笔记9在线阅读_盗墓笔记全集'
                if Str2 in Str1:
                    Str1 = Str1.replace(Str2, '')
                index = Str1.index(']')
                Str1 = Str1[index + 1:].strip()
                queue_title.append(Str1)
        return queue_url, queue_title

    # Crawl the chapter text
    if html and soup:
        all_p_label = soup.find("div", class_="content-body").find_all("p")
        return all_p_label

# Clean the strings and save the data
def process_data(Data, title):
    # Remove characters that are invalid in file names: [\/?:*<>"|]
    while '\\' in title:
        index = title.index('\\')
        title = title[:index] + title[index + 1:]
    matchList = re.findall('[/?:*<>"|]*', title)
    matchStr = ''.join(matchList)  # e.g. '?><'
    title = list(title)
    for j in matchStr:
        title.remove(j)
    title = ''.join(title)

    # Absolute path of the file to save
    abspath = os.path.join(os.path.abspath(r'.\daomubiji1'), title)

    # Strip redundant strings from the text, e.g. www.seputu.com
    CMP = re.compile(r"(http://)?([a-zA-Z]+\.)+com")  # compiled regular expression object
    for i in Data:
        each_string = str(i.string).replace(" ", "").strip()
        if each_string != "None":
            Match = re.search(CMP, each_string)  # look for a site address in the line
            # Save the text to a txt file
            with open(abspath, 'a', encoding='utf-8') as fp:
                if Match is not None:
                    Newstring = each_string[:Match.span()[0]]
                    fp.write(Newstring + '\n')
                else:
                    fp.write(each_string + '\n')
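# Note: every paragraph is appended to the chapter file in 'a' mode, so re-running
# the script appends duplicate text, and the os.mkdir call at the bottom raises
# FileExistsError if .\daomubiji1 already exists.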

# Control and drive the crawl
def controller(url):
    # Get the url queue and the titles to crawl (the titles become the file names)
    html, soup = download_html(url)
    queue_url, queue_title = spider(soup=soup)
    # Keep crawling urls until the queue is empty
    while queue_url:
        url = queue_url.popleft()
        title = queue_title.popleft() + '.txt'
        print(title, url)
        html, soup = download_html(url)
        text_data = spider(html, soup)
        process_data(text_data, title)

url = r'http://www.seputu.com/'
os.mkdir(os.path.abspath(r'.\daomubiji1'))
controller(url)
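The two string-cleaning steps are easy to sanity-check offline. The snippet below is a minimal sketch, assuming a made-up chapter title and a made-up paragraph line; it only mirrors the invalid-character removal and the url-stripping regex used above, not the full script.

import re

# Hypothetical chapter title containing characters that Windows forbids in file names
title = '七星鲁王宫 第一章:血尸?'
for ch in '\\/?:*<>"|':
    title = title.replace(ch, '')
print(title + '.txt')  # -> 七星鲁王宫 第一章血尸.txt

# Hypothetical paragraph line that ends with the site address
line = '他们用的是洛阳铲。www.seputu.com'
CMP = re.compile(r"(http://)?([a-zA-Z]+\.)+com")
Match = re.search(CMP, line)
print(line[:Match.span()[0]] if Match else line)  # -> 他们用的是洛阳铲。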
