This is my first crawler, written to scrape the images inside Jianshu articles. I'm jotting it down as a short note, and I'd appreciate advice from more experienced readers so I can learn Python better. Thanks!
spider_image.py handles the scheduling.
from html_parser import HtmlParser
from download import Download
from url_manager import UrlManger
from save_results import SaveResult


class SpiderImages():
    # initialize all component instances
    def __init__(self):
        self.download = Download()
        self.htmlparser = HtmlParser()
        self.urlmanager = UrlManger()
        self.saveresult = SaveResult()

    def run(self, urls):
        i = 1
        for url in urls:
            # the article id at the end of the URL becomes the directory name
            file_dir = url.split('/')[-1]
            self.urlmanager.add_new_url(url)
            while self.urlmanager.has_new_url():
                new_url = self.urlmanager.get_new_url()
                html_cont = self.download.download(new_url)
                if html_cont is None:
                    # skip URLs that failed to download
                    continue
                new_urls, name, html_cont, t = self.htmlparser.parser(html_cont)
                self.urlmanager.add_new_urls(new_urls)
                self.saveresult.save(html_cont, file_dir, name, t)
                print("{} {}".format(i, new_url))
                i += 1

    def main(self, urls):
        self.run(urls)


if __name__ == '__main__':
    urls = ["https://www.jianshu.com/p/cafdb41e186a",
            "https://www.jianshu.com/p/d2a1490c785c",
            "https://www.jianshu.com/p/cce86949fc9a"]
    spider = SpiderImages()
    spider.main(urls)
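The same while loop handles two kinds of URLs: the article page itself (parsed successfully, t == 1) and the image URLs the parser feeds back (which fail to parse, t == 0, and get written to disk instead). With the __main__ guard above, the spider can also be driven from another script; a minimal sketch, with a placeholder article URL:

from spider_image import SpiderImages

# placeholder list; replace with real Jianshu article URLs
articles = ["https://www.jianshu.com/p/xxxxxxxxxxxx"]
SpiderImages().main(articles)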
download.py handles downloading page content.
import requests


class Download():
    def download(self, url):
        try:
            # the last URL segment doubles as the image file name
            imagename = url.split('/')[-1]
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36'}
            res = requests.get(url, headers=headers)
            res.raise_for_status()
            res.encoding = "utf-8"
            # return both the decoded text (for HTML pages) and the raw bytes (for images)
            return {'contents': res.text, 'imagename': imagename, 'binary': res.content}
        except requests.RequestException as e:
            print("download error: {}".format(e))
            return None
html_parser.py handles parsing the page content.
from bs4 import BeautifulSoup


class HtmlParser():
    def parser(self, html_content):
        try:
            soup = BeautifulSoup(html_content['contents'], "html.parser")
            # the author's name becomes the top-level directory name
            name = soup.find("div", class_="author").find_all(class_='info')[0].find('a').text
            images = soup.find("div", class_="show-content-free").find_all('img')
            new_images = []
            for image in images:
                # data-original-src holds a protocol-relative URL, so prepend the scheme
                new_images.append("http:" + image['data-original-src'])
            # t == 1 marks a successfully parsed article page
            return new_images, name, html_content, 1
        except (AttributeError, KeyError, TypeError):
            # non-article content (e.g. image bytes) fails to parse and falls through here
            return '', '', html_content, 0
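Since the selectors are tied to Jianshu's markup (lazily loaded images keep their real URL in data-original-src), it is worth sanity-checking them against a handcrafted snippet. The markup below is a simplified assumption of the real page structure, not a captured page:

from html_parser import HtmlParser

# simplified, assumed structure of a Jianshu article page
sample = {'contents': '''
<div class="author"><span class="info"><a>someauthor</a></span></div>
<div class="show-content-free">
  <img data-original-src="//upload-images.jianshu.io/upload_images/demo.png">
</div>
'''}
new_urls, name, _, t = HtmlParser().parser(sample)
print(t, name, new_urls)
# expected: 1 someauthor ['http://upload-images.jianshu.io/upload_images/demo.png']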
url_manager.py handles URL management.
class UrlManger():
    def __init__(self):
        self.wait_urls = set()
        self.downloaded_urls = set()

    def add_new_url(self, url):
        if url and self.checkaddwaiturl(url) and self.checkaddurldownload(url):
            self.wait_urls.add(url)

    def add_new_urls(self, urls):
        if urls:
            for url in urls:
                self.add_new_url(url)

    def has_new_url(self):
        return len(self.wait_urls) != 0

    def get_new_url(self):
        download_url = self.wait_urls.pop()
        self.downloaded_urls.add(download_url)
        return download_url

    def checkaddwaiturl(self, url):
        return url not in self.wait_urls

    def checkaddurldownload(self, url):
        return url not in self.downloaded_urls
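Together, the two sets guarantee each URL is fetched at most once: add_new_url silently drops anything already waiting or already downloaded. A quick check with hypothetical URLs:

from url_manager import UrlManger

manager = UrlManger()
manager.add_new_url("https://www.jianshu.com/p/abc")
manager.add_new_url("https://www.jianshu.com/p/abc")  # duplicate, ignored
print(manager.has_new_url())   # True: one URL waiting
url = manager.get_new_url()    # moves it into downloaded_urls
manager.add_new_url(url)       # already downloaded, ignored
print(manager.has_new_url())   # False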
save_results.py handles saving the images to the local disk.
import os


class SaveResult():
    root = './download'

    def saveimage(self, res):
        imagename = '/'.join([self.__root.strip('/'), res['imagename']])
        imagename = self.__checkfiletype(imagename)
        if self.__checkfile(imagename):
            print('{} already exists'.format(imagename))
        else:
            try:
                with open(imagename, "wb") as f:
                    f.write(res['binary'])
            except OSError:
                print("save image fail")

    def createDir(self, path):
        # target directory: ./download/<author>/<article id>
        self.__root = '/'.join([self.root, path])
        print(self.__root)
        if os.path.exists(self.__root):
            return
        os.makedirs(self.__root)

    def __checkfile(self, filename):
        return os.path.exists(filename)

    def __checkfiletype(self, filename):
        # fall back to a .jpg extension when the URL carries none
        return filename if filename.rfind('.') > 0 else '.'.join([filename, 'jpg'])

    def save(self, contents, file_dir, author, t):
        # t == 1: an article page, create its directory; otherwise an image, write it out
        if t == 1:
            self.createDir('/'.join([author, file_dir]))
        else:
            self.saveimage(contents)
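Note that saveimage depends on createDir having run first, because self.__root is only assigned there; in the spider this ordering always holds, since each article page (t == 1) is processed before any of its images (t == 0). A minimal standalone use, with a hypothetical directory name and fake image bytes:

from save_results import SaveResult

s = SaveResult()
s.createDir('someauthor/abc123')           # sets the target directory
s.saveimage({'imagename': 'demo.png',      # hypothetical payload
             'binary': b'\x89PNG fake bytes'})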