第一次做爬虫小项目,代码很粗糙,而且我只抓取了第一页上的所有图片集
其实也可以翻页抓,只是觉得太麻烦,学精了之后再抓吧,再说了抓那么多浪费电脑存储空间,妹子图什么的哪里有实战好,有那个时间看黄图,不如撩妹,谈个女朋友天天实战。
废话到此截止:
爬取网址为:https://www.mzitu.com/tag/youhuo/
我们先打开网站主页:
妹子很多,很好看,我不挑,给我哪个我都要。
按部就班,打开开发页面:
蓝色部分就是我们要提取的第一个图集的地址,下面的href依次为第二个第三个.......,代码如下:
import requests
from requests.exceptions import RequestException

# The Referer header is mandatory: mzitu.com rejects requests that lack it
# (anti-crawler measure).  The extra fields are harmless.
headers = {
    'If-None-Match': 'W/"5cc2cd8f-2c58"',
    "Referer": "http://www.mzitu.com/all/",
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 SafarMozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
}


def get_page(url):
    """Download *url*; print and return its HTML on HTTP 200, else None."""
    try:
        response = requests.get(url, headers=headers)
    except RequestException:
        print('获取索引页失败')
        return None
    if response.status_code != 200:
        return None
    print(response.text)
    return response.text


def main():
    get_page('https://www.mzitu.com/tag/youhuo/')


if __name__ == '__main__':
    main()
很容易我们就获得了输出:
分析一下得到的这段HTML,红框框显然就是我们要得到的每一个图集的地址。
我们引"靓汤",开始提取图集地址:
import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup

# The Referer header is mandatory: mzitu.com rejects requests that lack it
# (anti-crawler measure).
headers = {
    'If-None-Match': 'W/"5cc2cd8f-2c58"',
    "Referer": "http://www.mzitu.com/all/",
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 SafarMozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
}


def get_page(url):
    """Download *url*; print and return its HTML on HTTP 200, else None."""
    try:
        response = requests.get(url, headers=headers)
    except RequestException:
        print('获取索引页失败')
        return None
    if response.status_code != 200:
        return None
    print(response.text)
    return response.text


def parse_page(html):
    """Print the link of every album listed under the #pins element."""
    soup = BeautifulSoup(html, 'lxml')
    for entry in soup.select('#pins li'):
        print(entry.a['href'])


def main():
    parse_page(get_page('https://www.mzitu.com/tag/youhuo/'))


if __name__ == '__main__':
    main()
BeautifulSoup库很强大,我决定多多学习,靓汤的输出是一个list,必须先遍历才能输出,输出如下:
我们得到了每一个图集的地址,打开一个图集看看:
每个图集的详情页里只有一张图,不过下边有一个索引,这时就需要考虑一下详情页中的翻页问题:
图集中第三幅图的地址
不同图集存在相同的地址排列方式,我们就可以创建一个循环列表来进行每个图集的同图片的详情页获取:
import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup

# The Referer header is mandatory: mzitu.com rejects requests that lack it
# (anti-crawler measure).
headers = {
    'If-None-Match': 'W/"5cc2cd8f-2c58"',
    "Referer": "http://www.mzitu.com/all/",
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 SafarMozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
}


def get_page(url):
    """Download the index page *url*; print and return its HTML, or None."""
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            print(response.text)
            return response.text
        return None
    except RequestException:
        print('获取索引页失败')
        return None


def parse_page(html):
    """Extract each album link under #pins and walk its detail pages."""
    soup = BeautifulSoup(html, 'lxml')
    items = soup.select('#pins li')
    for link in items:
        get_detail_page(link.a['href'])


def get_detail_page(href):
    """Print every existing detail-page URL of one album (href/1 .. href/99).

    Stops at the first URL that does not answer HTTP 200 (i.e. past the
    album's last page).  NOTE: the range bound caps discovery at 99 pages.

    BUG FIX: the original ended with a stray ``response = requests.get()``
    (no arguments) after the loop, which raised TypeError for any album
    with 99 or more pages; that dead line has been removed.
    """
    for i in range(1, 100):
        detail_url = href + '/' + str(i)
        if requests.get(detail_url, headers=headers).status_code == 200:
            print(detail_url)
        else:
            print('已至末尾页')
            return None


def main():
    url = 'https://www.mzitu.com/tag/youhuo/'
    html = get_page(url)
    parse_page(html)


if __name__ == '__main__':
    main()
我稍微解释一下,我设置的是最多循环到第99页(range(1, 100)),因为有的图集只有46页嘛,我就加了一个网页状态码判断,如果返回200,就是正常网址,不是200我就终止网址获取,这样我们就可以将每个图片集内,所有详情页的网址取出来了:
下一步就是通过网址获得网址对应的html:
import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup

# The Referer header is mandatory: mzitu.com rejects requests that lack it
# (anti-crawler measure).
headers = {
    'If-None-Match': 'W/"5cc2cd8f-2c58"',
    "Referer": "http://www.mzitu.com/all/",
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 SafarMozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
}


def get_page(url):
    """Download the index page; return its HTML text or None."""
    try:
        response = requests.get(url, headers=headers)
    except RequestException:
        print('获取索引页失败')
        return None
    return response.text if response.status_code == 200 else None


def parse_page(html):
    """Visit every album linked in the #pins list of the index page."""
    soup = BeautifulSoup(html, 'lxml')
    for entry in soup.select('#pins li'):
        get_detail_page(entry.a['href'])


def get_detail_page(href):
    """Iterate album pages href/1 .. href/99, stopping at the first non-200."""
    for page_no in range(1, 100):
        detail_url = '{}/{}'.format(href, page_no)
        if requests.get(detail_url, headers=headers).status_code != 200:
            print('已至末尾页')
            return None
        parse_detail_page(detail_url)


def parse_detail_page(detail_url):
    """Fetch one detail page and print its HTML."""
    try:
        response = requests.get(detail_url, headers=headers)
    except RequestException:
        print('获取详情页失败')
        return None
    if response.status_code == 200:
        print('获取详情页成功')
        print(response.text)
    return None


def main():
    parse_page(get_page('https://www.mzitu.com/tag/youhuo/'))


if __name__ == '__main__':
    main()
获得每个详情页的html:
再用美丽汤解析出即可:
import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup

# The Referer header is mandatory: mzitu.com rejects requests that lack it
# (anti-crawler measure).
headers = {
    'If-None-Match': 'W/"5cc2cd8f-2c58"',
    "Referer": "http://www.mzitu.com/all/",
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 SafarMozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
}


def get_page(url):
    """Download the index page; return its HTML text or None."""
    try:
        response = requests.get(url, headers=headers)
    except RequestException:
        print('获取索引页失败')
        return None
    return response.text if response.status_code == 200 else None


def parse_page(html):
    """Visit every album linked in the #pins list of the index page."""
    soup = BeautifulSoup(html, 'lxml')
    for entry in soup.select('#pins li'):
        get_detail_page(entry.a['href'])


def get_detail_page(href):
    """Iterate album pages href/1 .. href/99, stopping at the first non-200."""
    for page_no in range(1, 100):
        detail_url = '{}/{}'.format(href, page_no)
        if requests.get(detail_url, headers=headers).status_code != 200:
            print('已至末尾页')
            return None
        parse_detail_page(detail_url)


def parse_detail_page(detail_url):
    """Fetch one detail page and extract its main image URL."""
    try:
        response = requests.get(detail_url, headers=headers)
    except RequestException:
        print('获取详情页失败')
        return None
    if response.status_code == 200:
        print('获取详情页成功')
        get_image(response.text)
    return None


def get_image(detail_html):
    """Return the src of the first .main-image <img>, or None if absent."""
    soup = BeautifulSoup(detail_html, 'lxml')
    for node in soup.select('.main-image'):
        return node.img['src']


def main():
    parse_page(get_page('https://www.mzitu.com/tag/youhuo/'))


if __name__ == '__main__':
    main()
下面就是保存图片了,下面是完整的代码,代码的模块化不高,后期再改进。
import requests
import os
from hashlib import md5
from requests.exceptions import RequestException
from bs4 import BeautifulSoup

# The Referer header is mandatory: mzitu.com rejects page and image requests
# that lack it (anti-crawler measure); the other fields are harmless extras.
headers = {
    'If-None-Match': 'W/"5cc2cd8f-2c58"',
    "Referer": "http://www.mzitu.com/all/",
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 SafarMozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
}


def get_page(url):
    """Return the HTML of the index page *url*, or None on any failure."""
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('获取索引页失败')
        return None


def parse_page(html):
    """Crawl every album linked under the #pins list of the index page."""
    soup = BeautifulSoup(html, 'lxml')
    for item in soup.select('#pins li'):
        get_detail_page(item.a['href'])


def get_detail_page(href):
    """Visit the numbered pages href/1, href/2, ... of one album.

    Stops at the first page that is missing or fails.  NOTE: capped at 99
    pages per album by the range bound.

    FIX: the original fetched every detail URL twice — once here purely for
    the status code and again inside parse_detail_page.  parse_detail_page
    now reports success, so a single request per page suffices.
    """
    for i in range(1, 100):
        detail_url = href + '/' + str(i)
        if not parse_detail_page(detail_url):
            print('已至末尾页')
            return None


def parse_detail_page(detail_url):
    """Fetch one detail page and save its main image.

    Returns True when the page exists and was processed, False otherwise.
    (The original always returned None; no caller used the value, so the
    boolean is backward compatible.)
    """
    try:
        response = requests.get(detail_url, headers=headers)
    except RequestException:
        print('获取详情页失败')
        return False
    if response.status_code != 200:
        return False
    print('获取详情页成功')
    get_image(response.text)
    return True


def get_image(detail_html):
    """Extract the main image URL(s) from a detail page and save each one."""
    soup = BeautifulSoup(detail_html, 'lxml')
    for item in soup.select('.main-image'):
        save_image(item.img['src'])


def save_image(image):
    """Download *image* and store it in the CWD, named by its MD5 hash.

    Hash-based naming makes identical images collapse to one file, so a
    pre-existing file is simply skipped.
    """
    response = requests.get(image, headers=headers)
    if response.status_code == 200:
        data = response.content
        file_path = os.path.join(os.getcwd(), md5(data).hexdigest() + '.jpg')
        print(file_path)
        if not os.path.exists(file_path):
            # FIX: the original called f.close() inside the `with` block —
            # redundant, because `with` already closes the file on exit.
            with open(file_path, 'wb') as f:
                f.write(data)
            print('保存成功')
    else:
        print('保存失败')
    return None


def main():
    url = 'https://www.mzitu.com/tag/youhuo/'
    html = get_page(url)
    parse_page(html)


if __name__ == '__main__':
    main()
讲实话,我个人比较排斥YY,想实战,哈哈哈
网友评论