Every now and then I feel like collecting images to use as material for video design, but Baidu image search is low quality and the resolution is terrible. I found a few sites online that host portrait photography, and since browsing by hand was too slow, I spent an hour writing two crawler scripts. In my own tests they run reasonably fast; I scraped several thousand photos, so now I can sift through them at leisure for the shots I need. (@_@)
Scraping Fengniao (bbs.fengniao.com)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2018-01-16 18:34:12
# @Author  : bb (317716008@qq.com)
# @Word    : python can change world!
# @Version : python3.6
import requests
from bs4 import BeautifulSoup
import time
import os

# Sample thread URL (unused below; main() builds the listing-page URLs itself).
url = "http://bbs.fengniao.com/forum/10344999_1.html"
agents = {'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0; Baiduspider-ads) Gecko/17.0 Firefox/17.0"}
def get_url(url, agents):
    """Fetch a page and return it as a parsed BeautifulSoup object."""
    res = requests.get(url, headers=agents)
    soup = BeautifulSoup(res.text, "html.parser")
    return soup
def return_html(soup):
    """Collect links to individual threads from a forum listing page."""
    html_list = []
    for link in soup.find_all('a', attrs={"target": "_blank"}):
        # Keep /forum/... links, skipping hrefs that contain '_' (sub-pages/duplicates).
        if str(link['href'])[1:6] == 'forum' and '_' not in str(link['href']):
            html = 'http://bbs.fengniao.com' + link['href']
            html_list.append(html)
            # print("Found thread", html)
    print("----------------------------")
    print("Threads found on this page ------->>>>>>>", len(html_list))
    print("----------------------------")
    time.sleep(3)
    print('Counting down......' + '\n' * 3)
    return html_list
def guolv(html_list):
    """Deduplicate thread URLs by thread id (the part before '_')."""
    new_list = []
    seen = set()
    for i in html_list:
        thread_id = str(i).split('_')[0].split('/')[-1]
        if thread_id not in seen:
            new_list.append(i)
            seen.add(thread_id)
    print('Filtering done!')
    for i in new_list:
        print("Found thread", i)
    return new_list
def img_save(soup):
    """Download every full-size image on a thread page."""
    for link in soup.find_all('img'):
        # Full-size photos end their src with '...1.jpg'; use the
        # second-to-last character as a crude filter.
        if str(link['src'])[-2] == '1':
            img_src = link['src']
            print('Image URL:', img_src)
            name = img_src.split('.jpg')[0].split('/')[-1]
            filename = name + '.jpg'
            img = requests.get(img_src)
            with open(filename, 'wb') as f:  # media files must be written in binary mode
                f.write(img.content)         # the binary payload lives in .content
def main():
    page = 1        # first page to scrape
    while page < 2:  # last page (exclusive)
        print("Page %s!" % page)
        url = 'http://bbs.fengniao.com/forum/forum_101_' + str(page) + '_execpost.html'  # "featured" listing
        soup = get_url(url, agents)
        html_list = return_html(soup)  # thread URLs
        # new_list = guolv(html_list)
        for i in html_list:
            soup = get_url(i, agents)
            img_save(soup)
        with open('output.txt', 'a') as f:
            print("Page %s done!" % page, file=f)
        print("Page %s done!" % page)
        print('\n' * 5)
        page = page + 1
    # The original assigned to a variable named `time`, shadowing the module;
    # use a separate name for the timestamp instead.
    now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # ostime = '2018-01-17 17:00:00'
    # if now < ostime:
    #     os.system("shutdown -h now")  # scheduled shutdown
def test():
    url = 'http://bbs.fengniao.com/forum/forum_101_2_execpost.html'
    soup = get_url(url, agents)
    html_list = return_html(soup)
    print(html_list)
if __name__ == '__main__':
    main()
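One weak spot in img_save() above is that requests.get() runs with no timeout and no retry, so a single stalled image can hang the whole crawl. Below is a minimal hardened sketch; the Referer header, the 10-second timeout, and the retry count are my assumptions, not something the original script or bbs.fengniao.com is confirmed to require.

import os
import requests

def download_image(img_src, out_dir='.', retries=3):
    # Hedged sketch: header values and timeout are assumptions, tune for the real site.
    headers = {
        'User-Agent': 'Mozilla/5.0',
        # Assumption: some image CDNs reject requests without a Referer.
        'Referer': 'http://bbs.fengniao.com/',
    }
    filename = os.path.join(out_dir, img_src.rstrip('/').split('/')[-1])
    for attempt in range(retries):
        try:
            resp = requests.get(img_src, headers=headers, timeout=10)
            resp.raise_for_status()
            with open(filename, 'wb') as f:  # binary mode for image bytes
                f.write(resp.content)
            return filename
        except requests.RequestException:
            if attempt == retries - 1:
                raise  # out of retries, surface the error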
Scraping Jandan (jandan.net/ooxx)
import threading
import requests
import urllib.request
import re
import random
from queue import Queue, Empty
from bs4 import BeautifulSoup

agents = [
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0; Baiduspider-ads) Gecko/17.0 Firefox/17.0",
    "Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Avant Browser/1.2.789rel1 (http://www.avantbrowser.com)",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5"]
class jandanspridy(threading.Thread):
    """Worker thread: pull image URLs off the queue and download them."""
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self._queue = queue

    def run(self):
        while True:
            try:
                # get_nowait() can raise Empty when several workers drain the
                # queue at once, so don't rely on empty() alone.
                each = self._queue.get_nowait()
            except Empty:
                break
            filename = each.split('/')[-1]
            req = urllib.request.urlopen(each)
            with open(filename, 'wb') as f:
                f.write(req.read())
def getProxyIp():
    """Scrape a couple of listing pages on xicidaili.com for proxy candidates."""
    proxy = []
    header = {'User-Agent': random.choice(agents)}
    for i in range(1, 3):
        url = 'http://www.xicidaili.com/nn/' + str(i)
        req = requests.get(url, headers=header)
        soup = BeautifulSoup(req.text, "html.parser")
        ips = soup.findAll('tr')
        for x in range(1, len(ips)):
            tds = ips[x].findAll("td")
            ip_temp = tds[1].contents[0] + ":" + tds[2].contents[0]
            proxy.append(ip_temp)
    return proxy
def get_proxies():
    """Try random candidates until one can reach baidu.com, then return it."""
    proxy = getProxyIp()
    print('Verifying proxy IPs......')
    while proxy:
        ipchoice = random.choice(proxy)
        proxies = {'http': ipchoice}
        try:
            res = requests.get("https://www.baidu.com/", proxies=proxies, timeout=2)
            if res.status_code == 200:
                print(proxies, 'is up')
                return proxies
            else:
                print(ipchoice + ' is unreachable')
                proxy.remove(ipchoice)
        except requests.RequestException:
            # Dead proxy: drop it so the loop can terminate once the list is empty.
            proxy.remove(ipchoice)
    return None
def get_page():
    """Read the current (highest) page number from the ooxx front page."""
    url = "http://jandan.net/ooxx"
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0')
    response = urllib.request.urlopen(req)  # was urlopen(url), which silently dropped the header
    html = response.read().decode('utf-8')
    page_list = re.findall(r'<span class="current-comment-page">\[(.*?)\]</span>', html)
    return page_list[0]
def eachpageurl():
    global queue
    # proxies = get_proxies()
    # print(proxies, 'is usable')
    page = int(get_page())
    f = open('output1.txt', 'w+')
    html_list = []
    queue = Queue()
    while page > 200:  # walk backwards from the newest page down to 201
        print("This is page %s!" % page)
        f.write("This is page %s!\n" % page)
        url = 'http://jandan.net/ooxx/page-' + str(page) + '#comments'
        page = page - 1
        try:
            # The get() must sit inside the try, or the proxy exceptions below
            # can never be caught. If the proxies are unstable, run without
            # the proxies argument.
            req = requests.get(url)
            html = req.text
            html2 = re.findall('</a></span><p><a href="(.*?)" target=', html)
            for i in html2:
                each = 'http:' + str(i)
                f.write(each + '\n')
                html_list.append(each)
                queue.put(each)
                print(each)
            # return html_list
        except requests.exceptions.ConnectTimeout:
            print('Proxy timed out! Switching to another one, hold on')
            eachpageurl()
        except requests.exceptions.ProxyError:
            print('Proxy is unstable! Switching to another one, hold on')
            eachpageurl()
    f.close()
def main():
    threads = []
    thread_count = 10
    eachpageurl()  # fills the global queue with image URLs
    for i in range(thread_count):
        threads.append(jandanspridy(queue))
    for t in threads:
        t.start()
    for t in threads:
        t.join()

if __name__ == '__main__':
    main()
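The global queue plus get_nowait() pattern works, but queue.Queue also supports a cleaner shutdown protocol with task_done()/join() and sentinel values, which avoids both the global and the empty-queue race entirely. Here is a sketch of that alternative; the function names and default thread count are mine, not from the original script.

import threading
import queue
import urllib.request

def worker(q):
    while True:
        each = q.get()
        if each is None:   # sentinel: no more work for this thread
            q.task_done()
            break
        filename = each.split('/')[-1]
        with urllib.request.urlopen(each) as resp, open(filename, 'wb') as f:
            f.write(resp.read())
        q.task_done()      # mark this URL as finished

def download_all(urls, thread_count=10):
    q = queue.Queue()
    for u in urls:
        q.put(u)
    threads = [threading.Thread(target=worker, args=(q,)) for _ in range(thread_count)]
    for t in threads:
        t.start()
    for _ in threads:
        q.put(None)        # one sentinel per worker so every thread exits
    q.join()               # block until every URL has been processed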