# Import the packages we will use during crawling
import os
import urllib.request

from bs4 import BeautifulSoup
def getSrc(url):
    # Fetch the page we want to crawl, sending a browser User-Agent header
    headers = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    res = opener.open(url)  # use the opener so the User-Agent header is actually sent
    html = res.read()
    # Build a BeautifulSoup object
    soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
    # Find all <img class="BDE_Image"> tags
    result = soup.find_all('img', class_="BDE_Image")
    links = []
    for content in result:
        s = content.get('src')  # .get() returns None instead of raising when 'src' is missing
        if s is None:
            continue
        links.append(s)
    # Report how many matching image URLs were found
    print("Found " + str(len(links)) + " images in total")
    return links
def save(path, links=None):
    if not links:
        print('No matching images found')
        return
    # Create the local photo directory if it does not already exist
    if not os.path.exists(path):
        os.makedirs(path)
    # Loop over the crawled image URL list and download each file into the photo directory
    i = 0
    for link in links:
        i += 1
        filename = path + '/photo' + str(i) + '.jpg'
        # urlretrieve() creates and writes the file itself; no separate open() is needed
        urllib.request.urlretrieve(link, filename)
        print('Downloading: ' + str(link))
    print('Image download finished')
def doCrew(url, path='photo'):
    links = getSrc(url)
    save(path, links)
if __name__ == "__main__":
    # The address of the page to crawl
    url = 'http://tieba.baidu.com/p/5698856079'
    path = 'photo'
    doCrew(url, path)
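
One caveat in the script above: the custom User-Agent header is only sent with the page request, because urllib.request.urlretrieve() downloads each image through the default opener. If the image host also rejects the default agent, a minimal fix (a sketch, assuming the same header should go out with every request) is to install the opener globally before downloading:

import urllib.request

headers = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
# After this call, both urlopen() and urlretrieve() send the header above.
urllib.request.install_opener(opener)

With the opener installed, getSrc() could go back to plain urllib.request.urlopen(url) and save() needs no changes.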
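
As written, a single failed download raises an exception and aborts the whole loop. Below is a hedged variant of save() that skips failures and keeps going; the skip-and-report policy is my assumption, not part of the original script:

import os
import urllib.error
import urllib.request

def save(path, links=None):
    # Download every URL in links into path, skipping any that fail.
    if not links:
        print('No matching images found')
        return
    os.makedirs(path, exist_ok=True)  # no-op if the directory already exists
    for i, link in enumerate(links, start=1):
        filename = os.path.join(path, 'photo' + str(i) + '.jpg')
        try:
            urllib.request.urlretrieve(link, filename)
            print('Downloading: ' + link)
        except urllib.error.URLError as err:
            # Report the failure and move on instead of crashing the run.
            print('Failed to download ' + link + ': ' + str(err))
    print('Image download finished')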