Python Learning Notes 1

Author: 夏秋之萌 | Published 2017-12-20 16:56
    Download every image in a photo album from Douban.

    Scrape free high-anonymity proxy IPs from the Xici proxy site (西祠代理).

    import os, time, random
    import requests
    from bs4 import BeautifulSoup
    import urllib.request

    # Get IP: scrape proxy IP/port pairs and append them to host.txt #
    def get_proxy(num):
        os.chdir(r'C:\Users\xxxx\Desktop\Python')
        xiciurl = 'http://www.xicidaili.com/nn/{}'
        header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/'
                                '537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
        fp = open('host.txt', 'a+', encoding='utf-8')
        for i in range(1, num + 1):  # page numbering starts at /nn/1
            api = xiciurl.format(i)
            response = requests.get(url=api, headers=header)
            soup = BeautifulSoup(response.text, 'lxml')
            # note: the page marks only alternating rows with class 'odd',
            # so this keeps roughly half of the listed proxies
            container = soup.find_all(name='tr', attrs={'class': 'odd'})
            for tag in container:
                try:
                    td_list = tag.find_all('td')
                    ip = td_list[1].get_text()
                    port = td_list[2].get_text()
                    fp.write(ip + '\t' + port + '\n')
                except Exception:
                    print('No IP!')
            time.sleep(1)  # be polite between page requests
        fp.close()

    get_proxy(5)  # scrape 5 pages of proxy IP addresses
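
    Because only alternating table rows carry class 'odd', the loop above misses about half the listed proxies. A minimal sketch of a variant that walks every data row instead, assuming the listing table still carries id 'ip_list' (get_proxy_all_rows is a hypothetical name, not from the original post):

    def get_proxy_all_rows(num):
        header = {'User-Agent': 'Mozilla/5.0'}
        pairs = []
        for i in range(1, num + 1):
            response = requests.get('http://www.xicidaili.com/nn/{}'.format(i), headers=header)
            soup = BeautifulSoup(response.text, 'lxml')
            table = soup.find('table', id='ip_list')  # assumed table id
            if table is None:
                continue
            for tr in table.find_all('tr')[1:]:  # skip the header row
                tds = tr.find_all('td')
                if len(tds) > 2:
                    pairs.append((tds[1].get_text(strip=True), tds[2].get_text(strip=True)))
            time.sleep(1)
        return pairs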
    

    Verify that the scraped IPs are usable.

    # Verify IP: request a page through each proxy and report the status #
    def verify_proxy():
        n = 1
        os.chdir(r'C:\Users\xxxx\Desktop\Python')
        url = 'http://www.baidu.com'
        fp = open('host.txt', 'r')
        ips = fp.readlines()
        fp.close()
        proxys = list()
        for p in ips:
            ip = p.strip('\n').split('\t')
            print(ip)
            # requests expects {'scheme': 'scheme://host:port'}
            proxy = 'http://' + ip[0] + ':' + ip[1]
            proxies = {'http': proxy}
            proxys.append(proxies)
        for pro in proxys:
            try:
                # timeout keeps a dead proxy from hanging the loop
                s = requests.get(url, proxies=pro, timeout=5)
                print('Proxy #{} {}: status {}'.format(n, pro, s.status_code))
            except Exception as e:
                print(e)
            n += 1

    verify_proxy()
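
    verify_proxy only prints each proxy's status. To also keep the ones that work, a small filtering sketch along these lines would do (check_proxies and the 5-second timeout are assumptions, not part of the original post):

    def check_proxies(proxys, url='http://www.baidu.com', timeout=5):
        good = []
        for pro in proxys:
            try:
                if requests.get(url, proxies=pro, timeout=timeout).status_code == 200:
                    good.append(pro)
            except Exception:
                pass  # dead or slow proxy: drop it
        return good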
    

    Build a pool of proxy IPs.

    # Build IP pool: read host.txt and return `num` proxy dicts #
    def proxypool(num):
        n = 1
        os.chdir(r'C:\Users\xxxx\Desktop\Python')
        fp = open('host.txt', 'r')
        ips = fp.readlines()
        fp.close()
        proxys = list()
        if not ips:
            return proxys  # nothing scraped yet
        while n < num:
            for p in ips:
                if n >= num:
                    break
                ip = p.strip('\n').split('\t')
                proxy = 'http://' + ip[0] + ':' + ip[1]
                proxies = {'http': proxy}
                proxys.append(proxies)
                n += 1
        return proxys
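
    When num exceeds the number of lines in host.txt, proxypool simply repeats entries until the pool is full. An alternative sketch that reads each line once and lets callers sample with random.choice (load_proxies is a hypothetical helper name):

    def load_proxies(path='host.txt'):
        proxys = []
        with open(path, 'r', encoding='utf-8') as fp:
            for line in fp:
                ip, port = line.strip().split('\t')
                proxys.append({'http': 'http://{}:{}'.format(ip, port)})
        return proxys

    # usage: requests.get(url, proxies=random.choice(load_proxies()))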
    

    Scrape the Douban album 再见台湾 ("Goodbye, Taiwan").

    def download_album(pages, proxys):
        os.chdir(r'C:\Users\xxxx\Desktop\Python\Douban')
        download_dir = 'C:\\Users\\xxxx\\Desktop\\Python\\Douban'
        url = 'https://www.douban.com/photos/album/1634496188/?start='
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/'
                                 '537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
        x = 1
        for i in range(0, pages):
            print('Page {}'.format(i + 1))
            url_cur = url + str(i * 18)  # the album shows 18 photos per page
            try:
                response = requests.get(url_cur, headers=headers, proxies=random.choice(proxys))
                time.sleep(2)
                # retry with a different random proxy until the page loads
                while response.status_code != 200:
                    response = requests.get(url_cur, headers=headers, proxies=random.choice(proxys))
                    time.sleep(2)
                soup = BeautifulSoup(response.text, 'lxml')
                imgs = soup.find_all(name='div', attrs={'class': 'photo_wrap'})
                y = 0
                for img in imgs:
                    imgurls = img.find_all('img')
                    time.sleep(2)
                    for u in imgurls:
                        img_url = u.get('src')
                        img_dir = download_dir + '\\'
                        z = str(x) + '_' + str(y)
                        print('Image {}'.format(y + 1))
                        urllib.request.urlretrieve(img_url, '{}{}.jpg'.format(img_dir, z))
                        y = y + 1
                        time.sleep(2)
            except Exception:
                time.sleep(5)  # back off, then move on to the next page
                continue
            x = x + 1
        time.sleep(5)

    start = time.time()
    proxyPool = proxypool(100)
    download_album(17, proxyPool)
    end = time.time()
    timeUse = int(end - start)
    print('Elapsed: {}s'.format(timeUse))
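
    One caveat: urllib.request.urlretrieve does not see the proxies passed to requests.get, so the image downloads themselves go out over the direct connection. A minimal sketch that routes the image fetch through the same pool (save_image is a hypothetical helper, not from the original post):

    def save_image(img_url, path, proxys):
        r = requests.get(img_url, proxies=random.choice(proxys), timeout=10)
        with open(path, 'wb') as f:
            f.write(r.content)  # write the raw image bytes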
