# coding=utf-8
'''
download_html: takes a url, returns the html and a BeautifulSoup instance
spider:        takes html/soup, returns urls and data
process_data:  cleans the strings and saves the data
controller:    drives the crawl and calls the other functions
'''
__author__ = 'Leslie'
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re, collections, os
# Takes a url, returns the html and a BeautifulSoup instance
def download_html(url):
    html = urlopen(url).read().decode('utf-8')  # fetch the page content
    soup = BeautifulSoup(html, 'lxml')          # parse it into a BeautifulSoup object
    return html, soup
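# Usage sketch (not executed here, just an illustration): fetching the index page
# returns both the raw HTML string and the parsed soup, e.g.
#   html, soup = download_html('http://www.seputu.com/')
#   print(soup.title.string)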
# spider: with only soup, returns the url queue and title queue from the index page;
# with both html and soup, returns the chapter text
def spider(html=False, soup=False):
    # Scrape the urls and titles from the index page
    if not html and soup:
        queue_url = collections.deque()    # queue of chapter urls
        queue_title = collections.deque()  # queue of chapter titles
        # Locate the elements and pull the href and title attributes from the <a> tags
        for item in soup.find_all("div", {"class": "box"}):
            for Alabel in item.find_all("a"):
                queue_url.append(Alabel["href"])
                # Remove the redundant characters from the title string
                Str1 = Alabel["title"]
                Str2 = '_盗墓笔记9在线阅读_盗墓笔记全集'
                if Str2 in Str1:
                    Str1 = Str1.replace(Str2, '')
                index = Str1.index(']')
                Str1 = Str1[index + 1:].strip()
                queue_title.append(Str1)
        return queue_url, queue_title
    # Scrape the chapter text
    if html and soup:
        all_p_label = soup.find("div", class_="content-body").find_all("p")
        return all_p_label
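# Usage sketch (not executed here): spider() is used in two modes. Passing only
# `soup` scrapes the index page and returns the chapter url/title queues; passing
# both `html` and `soup` returns the <p> tags of a chapter page, e.g.
#   queue_url, queue_title = spider(soup=soup)   # index page
#   paragraphs = spider(html, soup)              # chapter page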
# Clean the strings and save the data
def process_data(Data, title):
    # Remove characters that are not allowed in file names: \ / ? : * < > " |
    while '\\' in title:
        index = title.index('\\')
        title = title[:index] + title[index + 1:]
    matchList = re.findall('[/?:*<>"|]*', title)
    matchStr = ''.join(matchList)  # e.g. '?><'
    title = list(title)
    for j in matchStr:
        title.remove(j)
    title = ''.join(title)
    # Absolute path of the file to save
    abspath = os.path.join(os.path.abspath(r'.\daomubiji1'), title)
    # Strip redundant strings such as www.seputu.com from the text
    CMP = re.compile(r"(http://)?([a-zA-Z]+\.)+com")  # compile the regular expression
    for i in Data:
        each_string = str(i.string).replace(" ", "").strip()
        if each_string != "None":
            Match = re.search(CMP, each_string)  # search for a watermark url
            # Append the text to the txt file
            with open(abspath, 'a', encoding='utf-8') as fp:
                if Match != None:
                    Newstring = each_string[:Match.span()[0]]
                    fp.write(Newstring + '\n')
                else:
                    fp.write(each_string + '\n')
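# Note (an alternative sketch, not part of the original): the character-by-character
# title clean-up above could also be done with a single substitution, and the
# watermark url is simply cut off at the start of the regex match, e.g.
#   title = re.sub(r'[\\/?:*<>"|]', '', title)
#   clean = each_string[:Match.span()[0]] if Match else each_string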
# Drive the crawl: call the functions above
def controller(url):
    # Get the url queue and the file-name titles to crawl
    html, soup = download_html(url)
    queue_url, queue_title = spider(soup=soup)
    # Crawl each url in turn until the queue is empty
    while queue_url:
        url = queue_url.popleft()
        title = queue_title.popleft() + '.txt'
        print(title, url)
        html, soup = download_html(url)
        text_data = spider(html, soup)
        process_data(text_data, title)
url = r'http://www.seputu.com/'
os.mkdir(os.path.abspath(r'.\daomubiji1'))
controller(url)
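# Note: the three lines above run at module level, so importing or running this
# file creates .\daomubiji1 and starts the crawl immediately. A more defensive
# variant (an assumption, not part of the original) would tolerate an existing
# output directory and report failures instead of crashing, e.g.
#   os.makedirs(os.path.abspath(r'.\daomubiji1'), exist_ok=True)
#   try:
#       controller(url)
#   except Exception as err:
#       print('crawl aborted:', err)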