Python下载网络图片

作者: DevSpoon | 来源:发表于2017-10-28 16:44 被阅读8次

Python下载网络图片
Python基础知识全网最全6(网络爬虫)
制作you-get视频下载器单文件版
下载各个网站的视频
python下载图片
python网络操作和有道翻译api破解
python爬虫-图片下载
iOS 基础知识大全之网络篇（可供零基础学习）
python一些有趣的库（摘自微信公众号）
Python网络数据采集3-数据存到CSV以及MySql

#!/usr/bin/python3
# -*- coding: UTF-8 -*-
from urllib import request
from bs4 import BeautifulSoup
import re
import time
import gevent
from gevent import monkey

# Patch the standard library so that blocking calls (sockets, etc.) yield to
# other gevent greenlets instead of blocking the whole process.
# NOTE(review): gevent recommends patching as early as possible, before other
# modules are imported; here it runs after urllib is imported — confirm this
# ordering is acceptable for the targets being fetched.
monkey.patch_all()


# Extract the image tags of interest from a listing page.
def parser(html):
    """Return the ``<img>`` tags in *html* whose ``src`` matches the photo path.

    The markup is handed to BeautifulSoup's ``html.parser`` backend with
    ``gbk`` given as the source encoding. The matching tags are echoed to
    stdout before being returned.

    Returns:
        The result of ``find_all`` on success, or ``None`` if anything
        raised (the error is reported on stdout).
    """
    src_pattern = re.compile(r'/d/file/\d+/\w+\.jpg')
    try:
        document = BeautifulSoup(html, 'html.parser', from_encoding='gbk')
        matches = document.find_all('img', src=src_pattern)
        print(matches)
    except Exception as err:
        print('in parser error=%s' % err)
        return None
    return matches


# Save one downloaded image to disk.
def save_imgs(path, data):
    """Write the bytes *data* to the file at *path*.

    Args:
        path: Destination file name; it is echoed to stdout first.
        data: Image content as bytes, or ``None`` when the download failed.

    Nothing is opened or written when *data* is ``None``, so a failed
    download does not leave an empty file behind. Errors raised while
    writing are reported on stdout rather than propagated.
    """
    print(path)
    # Check before open(): opening in 'wb' mode creates/truncates the file
    # even if the subsequent write fails.
    if data is None:
        print('in save_imgs error=no data to write')
        return
    try:
        with open(path, 'wb') as f:
            f.write(data)
    except Exception as e:
        print('in save_imgs error=%s' % e)


# Downloader.
def download(url, timeout=10):
    """Fetch *url* and return the response body as bytes.

    Args:
        url: Address to request.
        timeout: Seconds passed to ``urlopen`` as its timeout (default 10).

    Returns:
        The raw body, or ``None`` if building or sending the request
        raised (the error is reported on stdout).
    """
    # Request headers: present a desktop-browser User-Agent.
    header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Safari/604.1.38'
    }
    try:
        # Build the request object.
        req = request.Request(url=url, headers=header)
        # Send it; the context manager closes the response even when
        # read() raises, so the connection is not leaked.
        with request.urlopen(req, timeout=timeout) as response:
            return response.read()
    except Exception as e:
        print('in download error=%s' % e)
        return None


# Build the local file name for one image tag.
def _image_filename(img):
    """Return ``<alt>.jpg`` for *img*, falling back to the name in its ``src``.

    The alt text comes from the remote page, so path separators are replaced
    to keep the file inside the working directory.
    """
    label = img.get('alt') or img['src'].rsplit('/', 1)[-1].rsplit('.', 1)[0]
    for separator in ('/', '\\'):
        label = label.replace(separator, '_')
    return '%s.jpg' % label


# Download one image and store it; intended to run inside a greenlet.
def _fetch_and_save(img):
    """Fetch the image behind *img* and save it, skipping failed downloads."""
    data = download("http://www.xiaohuar.com%s" % img['src'])
    if data is not None:
        save_imgs(_image_filename(img), data)


# Main crawl routine.
def spider():
    """Collect image tags from the listing pages, then fetch and save them.

    Pages ``list-1-0.html`` through ``list-1-9.html`` are downloaded and
    parsed one after another. Each image found is then fetched and written
    to ``<alt>.jpg`` in its own greenlet, and the elapsed time for that
    phase is printed. If no images were collected, "网络错误" is printed.
    """
    imgs = []
    first_url = "http://www.xiaohuar.com/list-1-%s.html"
    for i in range(10):
        html = download(first_url % i)
        if not html:
            # A failed page must not re-add the previous page's results.
            continue
        found = parser(html)
        # parser() returns None on error and an empty result on no match.
        if found:
            imgs += found
    s_time = time.time()
    if imgs:
        print(len(imgs))
        # Download inside the greenlets so the network waits overlap; doing
        # the fetch here in the loop would serialize every request.
        glist = [gevent.spawn(_fetch_and_save, img) for img in imgs]
        gevent.joinall(glist)
        e_time = time.time()
        print('耗费%s 秒' % (e_time - s_time))
    else:
        print("网络错误")


# Start the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    spider()

# Fetch one listing page with the headers the browser sent.
# --compressed requests a compressed reply and decodes it, so readable HTML
# reaches stdout; a hand-written Accept-Encoding header alone would leave the
# body compressed. GET is curl's default method, so -XGET is not needed.
curl 'http://www.xiaohuar.com/list-1-1.html' \
--compressed \
-H 'Referer: http://www.xiaohuar.com/hua/' \
-H 'Host: www.xiaohuar.com' \
-H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' \
-H 'Connection: keep-alive' \
-H 'Accept-Language: zh-cn' \
-H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Safari/604.1.38' \
-H 'Cookie: a2513_times=2; bdshare_firstime=1509174867647; Hm_lpvt_0dfa94cc970f5368ddbe743609970944=1509174858; Hm_lvt_0dfa94cc970f5368ddbe743609970944=1509117400; BDTUJIAID=3d773a98f1f848519918522c2e8b8ba3' \
-H 'Upgrade-Insecure-Requests: 1'