Using Python 3's built-in urllib library with BeautifulSoup

Author: leslie_aLIE | Published 2018-06-28 11:18

# coding=utf-8

'''
download_html: takes a url and returns the html and a BeautifulSoup instance
spider: takes the html and returns urls and data
process_data: cleans strings and saves the data
controller: drives the crawl and calls the other functions
'''

__author__ = 'Leslie'

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re, collections, os

# Take a url, return the html and a BeautifulSoup instance
def download_html(url):
    html = urlopen(url).read().decode('utf-8')  # fetch the page data
    soup = BeautifulSoup(html, 'lxml')  # instantiate BeautifulSoup
    return html, soup
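# Note: the 'lxml' parser depends on the third-party lxml package; if it is not
# installed, the built-in parser (BeautifulSoup(html, 'html.parser')) should also
# work here, since the script only uses basic tag and attribute lookups.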

# spider takes the html and returns a url queue and a title queue, or the chapter data
def spider(html=False, soup=False):
    # Crawl the urls and titles from the index page
    if not html and soup:
        queue_url = collections.deque()    # queue of urls
        queue_title = collections.deque()  # queue of titles
        # Locate the elements and extract the href and title attributes of the <a> tags
        for item in soup.find_all("div", {"class": "box"}):
            for Alabel in item.find_all("a"):
                queue_url.append(Alabel["href"])
                # Strip the redundant characters from the title string
                Str1 = Alabel["title"]
                Str2 = '_盗墓笔记9在线阅读_盗墓笔记全集'
                if Str2 in Str1:
                    Str1 = Str1.replace(Str2, '')
                index = Str1.index(']')
                Str1 = Str1[index + 1:].strip()
                queue_title.append(Str1)
        return queue_url, queue_title

    # Crawl the chapter text
    if html and soup:
        all_p_label = soup.find("div", class_="content-body").find_all("p")
        return all_p_label

# Clean the strings and save the data
def process_data(Data, title):
    # Remove characters that are invalid in file names: [\/?:*<>"|]
    while '\\' in title:
        index = title.index('\\')
        title = title[:index] + title[index + 1:]
    matchList = re.findall('[/?:*<>"|]*', title)
    matchStr = ''.join(matchList)  # e.g. '?><'
    title = list(title)
    for j in matchStr:
        title.remove(j)
    title = ''.join(title)

    # Absolute path of the file to save
    abspath = os.path.join(os.path.abspath(r'.\daomubiji1'), title)

    # Strip redundant strings from the text, e.g. www.seputu.com
    CMP = re.compile(r"(http://)?([a-zA-Z]+\.)+com")  # compiled regular expression object
    for i in Data:
        each_string = str(i.string).replace(" ", "").strip()
        if each_string != "None":
            Match = re.search(CMP, each_string)  # look for a site address in the line
            # Save the text to a txt file
            with open(abspath, 'a', encoding='utf-8') as fp:
                if Match is not None:
                    Newstring = each_string[:Match.span()[0]]
                    fp.write(Newstring + '\n')
                else:
                    fp.write(each_string + '\n')
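# Note: every paragraph is appended to the chapter file in 'a' mode, so re-running
# the script appends duplicate text, and the os.mkdir call at the bottom raises
# FileExistsError if .\daomubiji1 already exists.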

# Control and drive the crawl
def controller(url):
    # Get the url queue and the titles to crawl (the titles become the file names)
    html, soup = download_html(url)
    queue_url, queue_title = spider(soup=soup)
    # Keep crawling urls until the queue is empty
    while queue_url:
        url = queue_url.popleft()
        title = queue_title.popleft() + '.txt'
        print(title, url)
        html, soup = download_html(url)
        text_data = spider(html, soup)
        process_data(text_data, title)

url = r'http://www.seputu.com/'
os.mkdir(os.path.abspath(r'.\daomubiji1'))
controller(url)
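The two string-cleaning steps are easy to sanity-check offline. The snippet below is a minimal sketch, assuming a made-up chapter title and a made-up paragraph line; it only mirrors the invalid-character removal and the url-stripping regex used above, not the full script.

import re

# Hypothetical chapter title containing characters that Windows forbids in file names
title = '七星鲁王宫 第一章:血尸?'
for ch in '\\/?:*<>"|':
    title = title.replace(ch, '')
print(title + '.txt')  # -> 七星鲁王宫 第一章血尸.txt

# Hypothetical paragraph line that ends with the site address
line = '他们用的是洛阳铲。www.seputu.com'
CMP = re.compile(r"(http://)?([a-zA-Z]+\.)+com")
Match = re.search(CMP, line)
print(line[:Match.span()[0]] if Match else line)  # -> 他们用的是洛阳铲。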
