Some sites, comic sites in particular, place no login check in front of their image files, and the image URLs follow a regular pattern, so the images can be downloaded with a simple scraper. Below is an example program I wrote to download from 哈哈漫画; interested readers can take a look.
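The "regularity" here means that every image lives at a predictable numeric path, roughly <site>/files/<album>/<chapter>/<page>.jpg, so a scraper only needs to count. Before writing the full loop it is worth probing one guessed URL by hand; a minimal sketch, assuming the same path layout as the script below (the concrete numbers are just examples):

import requests

# Probe a single guessed image URL; a 2xx status code suggests
# the guessed pattern is right.
url = "有意思的网址/files/80606/27956/1.jpg"
print(requests.head(url, allow_redirects=True).status_code)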
# Method 1: scraping images from such a site
import os

import requests

prefix = "有意思的网址/files/"
base_url = "有意思的网址/files/80606/"
begin_page = 27956
end_page = 63655

for page_id in range(begin_page, end_page + 1):
    url = base_url + str(page_id) + "/"
    # Mirror the remote directory layout locally.
    dir_path = url.replace(prefix, "")
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    # Pages inside one chapter are numbered 1.jpg, 2.jpg, ...;
    # keep requesting until the server stops answering OK.
    index = 1
    while True:
        url_path = url + str(index) + ".jpg"
        response = requests.get(url_path)
        if not response.ok:
            break
        filename = url_path.replace(prefix, "")
        print(url_path, filename)
        with open(filename, "wb") as f:
            for chunk in response.iter_content(128):
                f.write(chunk)
        index += 1
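One weakness of the loop above is that a transient network error is indistinguishable from "chapter finished", so a download can stop early. A hedged variant, not part of the original program, mounts retries on a requests.Session and adds a timeout, so only a genuinely missing page ends the loop:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# A session that retries transient server errors with backoff.
session = requests.Session()
retry = Retry(total=3, backoff_factor=0.5,
              status_forcelist=[429, 500, 502, 503, 504])
session.mount("http://", HTTPAdapter(max_retries=retry))
session.mount("https://", HTTPAdapter(max_retries=retry))

def fetch(url_path):
    # Hypothetical helper: returns the response when the image exists,
    # None when the server definitively says it does not (e.g. 404).
    response = session.get(url_path, timeout=10)
    return response if response.ok else None

Replacing requests.get in the loop with fetch keeps the same stop condition but survives flaky connections.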
There is also a better solution.
# Method 2: scraping images from such a site
"""
Actually used to fetch image URLs from 不可描述的网站.
"""
import os
import time

import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

def downloading_images(prefix, url):
    # Derive the local directory from the image URL and create it.
    filename = url.replace(prefix, '')
    basename = os.path.basename(filename)
    dirname = filename.replace(basename, '')
    dirurl = url.replace(basename, '')
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    # Pages in one gallery are numbered 1.jpg, 2.jpg, ...;
    # stop at the first request that fails.
    index = 1
    while True:
        url_path = dirurl + str(index) + ".jpg"
        response = requests.get(url_path)
        if not response.ok:
            break
        filename = dirname + str(index) + ".jpg"
        print(url_path, filename)
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(128):
                f.write(chunk)
        index += 1
if __name__ == "__main__":
    prefix = "不可描述的域名"
    down_loading_urls = [
        ['不可描述的网站', 49],
    ]
    for down_loading_url, count in down_loading_urls:
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        browser = webdriver.Chrome(options=chrome_options)
        try:
            print(down_loading_url)
            browser.get(down_loading_url)
            time.sleep(4)
            for num in range(1, count + 1):
                # Open the num-th gallery link on the listing page.
                browser.find_element(
                    By.XPATH, '/html/body/div[2]/div[3]/div[2]/ul/li[%d]/a' % num
                ).click()
                time.sleep(4)
                # The real image URL is lazy-loaded into data-original.
                value = browser.find_element(
                    By.XPATH, '/html/body/div[2]/article/div/div/img[1]'
                ).get_attribute('data-original')
                downloading_images(prefix, value)
                browser.back()
                time.sleep(4)
        except Exception as e:
            print("global", e)
        finally:
            browser.close()
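The fixed time.sleep(4) calls are the fragile part of this script: too slow on fast pages, too short on slow ones. A sketch of the same image lookup using Selenium's explicit waits instead (the XPath is the one used above):

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Wait up to 10 s for the lazy-loaded image element to appear,
# then read its real URL from the data-original attribute.
img = WebDriverWait(browser, 10).until(
    EC.presence_of_element_located(
        (By.XPATH, '/html/body/div[2]/article/div/div/img[1]')
    )
)
value = img.get_attribute('data-original')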
Once everything has finished downloading, it is most convenient to merge the images into a single folder and copy that folder to a phone for reading.
# Merge the downloaded images into a single folder
import os
import shutil

dirname = "files/80606"
new_dirname = dirname + "all"
if not os.path.exists(new_dirname):
    os.makedirs(new_dirname)

# Chapter directories are named by number; walk them in numeric order
# and renumber every page consecutively. Note that os.listdir returns
# entries in arbitrary order, so pages are sorted numerically as well.
dirs = sorted(int(value) for value in os.listdir(dirname))
index = 1
for dir_ in dirs:
    dir_path = os.path.join(dirname, str(dir_))
    for file in sorted(int(f.split(".")[0]) for f in os.listdir(dir_path)):
        filename = os.path.join(dir_path, str(file) + ".jpg")
        shutil.copy(filename, os.path.join(new_dirname, str(index) + ".jpg"))
        index += 1
import os
import shutil

dirname = "files/80648"
new_dirname = dirname + "all"
if not os.path.exists(new_dirname):
    os.makedirs(new_dirname)

# This album has one extra nesting level: the first numeric directory
# contains the real chapter directories. Move those up one level and
# remove the now-empty wrapper before merging.
dirs = sorted(int(value) for value in os.listdir(dirname))
t_path = os.path.join(dirname, str(dirs[0]))
mark = False
for f in os.listdir(t_path):
    t_t_dir = os.path.join(t_path, f)
    if os.path.isdir(t_t_dir):
        mark = True
        shutil.move(t_t_dir, os.path.join(dirname, f))
if mark:
    os.rmdir(t_path)

# Copy every page over in numeric order, renumbering from 1.
index = 1
dirs = sorted(int(value) for value in os.listdir(dirname))
for dir_ in dirs:
    path_dir = os.path.join(dirname, str(dir_))
    pages = sorted(int(value.split(".")[0]) for value in os.listdir(path_dir))
    for file in pages:
        filename = os.path.join(path_dir, str(file) + ".jpg")
        new_filename = os.path.join(new_dirname, str(index) + ".jpg")
        print(filename, new_filename)
        shutil.copy(filename, new_filename)
        index += 1
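The two merge snippets above differ only in the extra flattening step; the renumbering logic itself can be shared. A sketch, where consolidate is a made-up helper name:

import os
import shutil

def consolidate(dirname, new_dirname=None):
    # Copy every <chapter>/<page>.jpg under dirname into one flat
    # folder, renumbered 1.jpg, 2.jpg, ... in reading order.
    new_dirname = new_dirname or dirname + "all"
    os.makedirs(new_dirname, exist_ok=True)
    index = 1
    for chapter in sorted(os.listdir(dirname), key=int):
        chapter_path = os.path.join(dirname, chapter)
        for page in sorted(os.listdir(chapter_path),
                           key=lambda f: int(f.split(".")[0])):
            shutil.copy(os.path.join(chapter_path, page),
                        os.path.join(new_dirname, str(index) + ".jpg"))
            index += 1

# consolidate("files/80606") reproduces the first snippet's result.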