Every now and then I feel like collecting images to use as material for video design, but Baidu image search is low quality and the resolution is terrible. I found a few sites online that host portrait photography, and since browsing by hand was too slow, I spent an hour writing two crawler scripts. In my own tests they run reasonably fast; I scraped several thousand photos, so now I can sift through them at leisure for the shots I need. (@_@)
Scraping Fengniao (bbs.fengniao.com)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2018-01-16 18:34:12
# @Author  : bb (317716008@qq.com)
# @Word    : python can change world!
# @Version : python3.6
import requests
from bs4 import BeautifulSoup
import time
import os

# Sample thread URL (unused below; main() builds the listing-page URLs itself).
url = "http://bbs.fengniao.com/forum/10344999_1.html"
agents = {'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0; Baiduspider-ads) Gecko/17.0 Firefox/17.0"}
def get_url(url, agents):
    """Fetch a page and return it as a parsed BeautifulSoup object."""
    res = requests.get(url, headers=agents)
    soup = BeautifulSoup(res.text, "html.parser")
    return soup
def return_html(soup):
    """Collect links to individual threads from a forum listing page."""
    html_list = []
    for link in soup.find_all('a', attrs={"target": "_blank"}):
        # Keep /forum/... links, skipping hrefs that contain '_' (sub-pages/duplicates).
        if str(link['href'])[1:6] == 'forum' and '_' not in str(link['href']):
            html = 'http://bbs.fengniao.com' + link['href']
            html_list.append(html)
            # print("Found thread", html)
    print("----------------------------")
    print("Threads found on this page ------->>>>>>>", len(html_list))
    print("----------------------------")
    time.sleep(3)
    print('Counting down......' + '\n' * 3)
    return html_list
def guolv(html_list):
    """Deduplicate thread URLs by thread id (the part before '_')."""
    new_list = []
    seen = set()
    for i in html_list:
        thread_id = str(i).split('_')[0].split('/')[-1]
        if thread_id not in seen:
            new_list.append(i)
            seen.add(thread_id)
    print('Filtering done!')
    for i in new_list:
        print("Found thread", i)
    return new_list
def img_save(soup):
    """Download every full-size image on a thread page."""
    for link in soup.find_all('img'):
        # Full-size photos end their src with '...1.jpg'; use the
        # second-to-last character as a crude filter.
        if str(link['src'])[-2] == '1':
            img_src = link['src']
            print('Image URL:', img_src)
            name = img_src.split('.jpg')[0].split('/')[-1]
            filename = name + '.jpg'
            img = requests.get(img_src)
            with open(filename, 'wb') as f:  # media files must be written in binary mode
                f.write(img.content)         # the binary payload lives in .content
def main():
    page = 1        # first page to scrape
    while page < 2:  # last page (exclusive)
        print("Page %s!" % page)
        url = 'http://bbs.fengniao.com/forum/forum_101_' + str(page) + '_execpost.html'  # "featured" listing
        soup = get_url(url, agents)
        html_list = return_html(soup)  # thread URLs
        # new_list = guolv(html_list)
        for i in html_list:
            soup = get_url(i, agents)
            img_save(soup)
        with open('output.txt', 'a') as f:
            print("Page %s done!" % page, file=f)
        print("Page %s done!" % page)
        print('\n' * 5)
        page = page + 1
    # The original assigned to a variable named `time`, shadowing the module;
    # use a separate name for the timestamp instead.
    now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # ostime = '2018-01-17 17:00:00'
    # if now < ostime:
    #     os.system("shutdown -h now")  # scheduled shutdown
def test():
    url = 'http://bbs.fengniao.com/forum/forum_101_2_execpost.html'
    soup = get_url(url, agents)
    html_list = return_html(soup)
    print(html_list)
if __name__ == '__main__':
    main()
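One weak spot in img_save() above is that requests.get() runs with no timeout and no retry, so a single stalled image can hang the whole crawl. Below is a minimal hardened sketch; the Referer header, the 10-second timeout, and the retry count are my assumptions, not something the original script or bbs.fengniao.com is confirmed to require.

import os
import requests

def download_image(img_src, out_dir='.', retries=3):
    # Hedged sketch: header values and timeout are assumptions, tune for the real site.
    headers = {
        'User-Agent': 'Mozilla/5.0',
        # Assumption: some image CDNs reject requests without a Referer.
        'Referer': 'http://bbs.fengniao.com/',
    }
    filename = os.path.join(out_dir, img_src.rstrip('/').split('/')[-1])
    for attempt in range(retries):
        try:
            resp = requests.get(img_src, headers=headers, timeout=10)
            resp.raise_for_status()
            with open(filename, 'wb') as f:  # binary mode for image bytes
                f.write(resp.content)
            return filename
        except requests.RequestException:
            if attempt == retries - 1:
                raise  # out of retries, surface the error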
Scraping Jandan (jandan.net/ooxx)
import threading
import requests
import urllib.request
import re
import random
from queue import Queue, Empty
from bs4 import BeautifulSoup

agents = [
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0; Baiduspider-ads) Gecko/17.0 Firefox/17.0",
    "Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Avant Browser/1.2.789rel1 (http://www.avantbrowser.com)",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5"]
class jandanspridy(threading.Thread):
    """Worker thread: pull image URLs off the queue and download them."""
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self._queue = queue

    def run(self):
        while True:
            try:
                # get_nowait() can raise Empty when several workers drain the
                # queue at once, so don't rely on empty() alone.
                each = self._queue.get_nowait()
            except Empty:
                break
            filename = each.split('/')[-1]
            req = urllib.request.urlopen(each)
            with open(filename, 'wb') as f:
                f.write(req.read())
def getProxyIp():
    """Scrape a couple of listing pages on xicidaili.com for proxy candidates."""
    proxy = []
    header = {'User-Agent': random.choice(agents)}
    for i in range(1, 3):
        url = 'http://www.xicidaili.com/nn/' + str(i)
        req = requests.get(url, headers=header)
        soup = BeautifulSoup(req.text, "html.parser")
        ips = soup.findAll('tr')
        for x in range(1, len(ips)):
            tds = ips[x].findAll("td")
            ip_temp = tds[1].contents[0] + ":" + tds[2].contents[0]
            proxy.append(ip_temp)
    return proxy
def get_proxies():
    """Try random candidates until one can reach baidu.com, then return it."""
    proxy = getProxyIp()
    print('Verifying proxy IPs......')
    while proxy:
        ipchoice = random.choice(proxy)
        proxies = {'http': ipchoice}
        try:
            res = requests.get("https://www.baidu.com/", proxies=proxies, timeout=2)
            if res.status_code == 200:
                print(proxies, 'is up')
                return proxies
            else:
                print(ipchoice + ' is unreachable')
                proxy.remove(ipchoice)
        except requests.RequestException:
            # Dead proxy: drop it so the loop can terminate once the list is empty.
            proxy.remove(ipchoice)
    return None
def get_page():
    """Read the current (highest) page number from the ooxx front page."""
    url = "http://jandan.net/ooxx"
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0')
    response = urllib.request.urlopen(req)  # was urlopen(url), which silently dropped the header
    html = response.read().decode('utf-8')
    page_list = re.findall(r'<span class="current-comment-page">\[(.*?)\]</span>', html)
    return page_list[0]
def eachpageurl():
    global queue
    # proxies = get_proxies()
    # print(proxies, 'is usable')
    page = int(get_page())
    f = open('output1.txt', 'w+')
    html_list = []
    queue = Queue()
    while page > 200:  # walk backwards from the newest page down to 201
        print("This is page %s!" % page)
        f.write("This is page %s!\n" % page)
        url = 'http://jandan.net/ooxx/page-' + str(page) + '#comments'
        page = page - 1
        try:
            # The get() must sit inside the try, or the proxy exceptions below
            # can never be caught. If the proxies are unstable, run without
            # the proxies argument.
            req = requests.get(url)
            html = req.text
            html2 = re.findall('</a></span><p><a href="(.*?)" target=', html)
            for i in html2:
                each = 'http:' + str(i)
                f.write(each + '\n')
                html_list.append(each)
                queue.put(each)
                print(each)
            # return html_list
        except requests.exceptions.ConnectTimeout:
            print('Proxy timed out! Switching to another one, hold on')
            eachpageurl()
        except requests.exceptions.ProxyError:
            print('Proxy is unstable! Switching to another one, hold on')
            eachpageurl()
    f.close()
def main():
    threads = []
    thread_count = 10
    eachpageurl()  # fills the global queue with image URLs
    for i in range(thread_count):
        threads.append(jandanspridy(queue))
    for t in threads:
        t.start()
    for t in threads:
        t.join()

if __name__ == '__main__':
    main()
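The global queue plus get_nowait() pattern works, but queue.Queue also supports a cleaner shutdown protocol with task_done()/join() and sentinel values, which avoids both the global and the empty-queue race entirely. Here is a sketch of that alternative; the function names and default thread count are mine, not from the original script.

import threading
import queue
import urllib.request

def worker(q):
    while True:
        each = q.get()
        if each is None:   # sentinel: no more work for this thread
            q.task_done()
            break
        filename = each.split('/')[-1]
        with urllib.request.urlopen(each) as resp, open(filename, 'wb') as f:
            f.write(resp.read())
        q.task_done()      # mark this URL as finished

def download_all(urls, thread_count=10):
    q = queue.Queue()
    for u in urls:
        q.put(u)
    threads = [threading.Thread(target=worker, args=(q,)) for _ in range(thread_count)]
    for t in threads:
        t.start()
    for _ in threads:
        q.put(None)        # one sentinel per worker so every thread exits
    q.join()               # block until every URL has been processed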