Download an entire album's images from Douban
Scrape free high-anonymity proxy IPs from 西祠代理 (xicidaili.com)
import os, time, random
import urllib.request

import requests
from bs4 import BeautifulSoup

# --- Scrape proxy IPs ---
def get_proxy(num):
    """Scrape `num` pages of proxies from xicidaili and append them to host.txt."""
    os.chdir(r'C:\Users\xxxx\Desktop\Python')
    xiciurl = 'http://www.xicidaili.com/nn/{}'
    header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/'
                            '537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    fp = open('host.txt', 'a+', encoding='utf-8')
    for i in range(1, num + 1):                       # page numbering starts at 1
        api = xiciurl.format(i)
        response = requests.get(url=api, headers=header)
        soup = BeautifulSoup(response.text, 'lxml')
        container = soup.find_all(name='tr', attrs={'class': 'odd'})
        for tag in container:
            try:
                td_list = tag.find_all('td')          # tag is already a soup node, no re-parse needed
                ip = td_list[1].get_text()
                port = td_list[2].get_text()
                fp.write(ip + '\t' + port + '\n')     # one "ip<TAB>port" per line
            except Exception:
                print('No IP!')
        time.sleep(1)                                 # be polite between pages
    fp.close()

get_proxy(5)  # scrape 5 pages of proxy IP addresses
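One caveat: xicidaili's listing table usually alternates its row classes, so matching only class 'odd' can miss every other entry. A minimal sketch that walks all data rows instead, assuming the table carries the id 'ip_list' (worth checking against the live page):

# Sketch: collect both odd and even rows; the table id 'ip_list' is an assumption
def parse_all_rows(html):
    soup = BeautifulSoup(html, 'lxml')
    table = soup.find('table', attrs={'id': 'ip_list'})
    pairs = []
    for tr in table.find_all('tr')[1:]:               # skip the header row
        tds = tr.find_all('td')
        if len(tds) >= 3:
            pairs.append((tds[1].get_text(), tds[2].get_text()))
    return pairs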
Verify that the scraped IPs are usable
# --- Verify IPs ---
def verify_proxy():
    """Hit baidu.com through each proxy and print the status code."""
    n = 1
    os.chdir(r'C:\Users\xxxx\Desktop\Python')
    url = 'http://www.baidu.com'
    fp = open('host.txt', 'r')
    ips = fp.readlines()
    fp.close()
    proxys = list()
    for p in ips:
        ip = p.strip('\n').split('\t')
        print(ip)
        proxy = 'http://' + ip[0] + ':' + ip[1]       # scheme://ip:port
        proxies = {'http': proxy}                     # requests expects the scheme name as the key
        proxys.append(proxies)
    for pro in proxys:
        try:
            s = requests.get(url, proxies=pro, timeout=5)   # timeout so dead proxies don't hang
            print('Proxy #{}: {} status {}'.format(n, pro, s.status_code))
        except Exception as e:
            print(e)
        n += 1

verify_proxy()
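Printing the status codes only shows which proxies answered; the downloader is better served by a cleaned-up list. A rough sketch that keeps only the proxies returning HTTP 200 (the filename verified_host.txt and the 5-second timeout are my own choices, not from the original):

# Sketch: write only working proxies to a new file (verified_host.txt is hypothetical)
def filter_proxy():
    os.chdir(r'C:\Users\xxxx\Desktop\Python')
    url = 'http://www.baidu.com'
    with open('host.txt', 'r') as fp, open('verified_host.txt', 'w') as out:
        for line in fp:
            ip = line.strip('\n').split('\t')
            pro = {'http': 'http://' + ip[0] + ':' + ip[1]}
            try:
                if requests.get(url, proxies=pro, timeout=5).status_code == 200:
                    out.write(line)                   # keep this proxy
            except Exception:
                pass                                  # dead proxy, drop it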
Build a pool of proxy IPs
# --- Build the proxy IP pool ---
def proxypool(num):
    """Return a list of at least `num` proxy dicts, recycling host.txt as needed."""
    n = 1
    os.chdir(r'C:\Users\xxxx\Desktop\Python')
    fp = open('host.txt', 'r')
    ips = fp.readlines()
    fp.close()
    proxys = list()
    while n < num:                                    # cycle through the file until the pool is big enough
        for p in ips:
            ip = p.strip('\n').split('\t')
            proxy = 'http://' + ip[0] + ':' + ip[1]
            proxys.append({'http': proxy})
            n += 1
    return proxys
Scrape the Douban album 再见台湾 (Goodbye Taiwan)
# --- Download the Douban album ---
def download_album(pages, proxys):
    """Walk the album page by page (18 photos per page) and save every image."""
    os.chdir(r'C:\Users\xxxx\Desktop\Python\Douban')
    download_dir = 'C:\\Users\\xxxx\\Desktop\\Python\\Douban'
    url = 'https://www.douban.com/photos/album/1634496188/?start='
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/'
                             '537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    x = 1
    for i in range(0, pages):
        print('Page {}'.format(i + 1))
        url_cur = url + str(i * 18)                   # the album shows 18 photos per page
        try:
            response = requests.get(url_cur, headers=headers, proxies=random.choice(proxys))
            time.sleep(2)
            while response.status_code != 200:        # retry through another random proxy
                response = requests.get(url_cur, headers=headers, proxies=random.choice(proxys))
                time.sleep(2)
            soup = BeautifulSoup(response.text, 'lxml')
            imgs = soup.find_all(name='div', attrs={'class': 'photo_wrap'})
            y = 0
            for img in imgs:
                imgurls = img.find_all('img')
                time.sleep(2)
                for u in imgurls:
                    img_url = u.get('src')
                    img_dir = download_dir + '\\'
                    z = str(x) + '_' + str(y)         # file name: <page>_<index>.jpg
                    print('Image {}'.format(y + 1))
                    urllib.request.urlretrieve(img_url, '{}{}.jpg'.format(img_dir, z))
                    y = y + 1
                    time.sleep(2)
        except Exception:
            time.sleep(5)                             # back off, then try the next page
            continue
        x = x + 1
        time.sleep(5)
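Note that urllib.request.urlretrieve opens its own connection and knows nothing about the proxies dict passed to requests, so the images above are fetched over your real IP. If the image traffic should also go through the pool, one option is to download the bytes with requests instead; a minimal sketch (save_image is a hypothetical helper, not part of the original):

# Sketch: fetch image bytes through a random proxy (save_image is not in the original)
def save_image(img_url, path, proxys, headers):
    r = requests.get(img_url, headers=headers, proxies=random.choice(proxys), timeout=10)
    if r.status_code == 200:
        with open(path, 'wb') as f:
            f.write(r.content)                        # raw JPEG bytes

Inside the inner loop, the urlretrieve call would then become save_image(img_url, '{}{}.jpg'.format(img_dir, z), proxys, headers).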
start = time.time()
proxyPool = proxypool(100)
download_album(17, proxyPool)
end = time.time()
timeUse = int(end - start)
print('Elapsed: {}s'.format(timeUse))