Python Web Scraping in Practice: meizitu

Author: 以此怀念 | Published 2019-11-30 15:52
# -*- coding: utf-8 -*-
'''
Life is short, I use Python.
'''
### 以此怀念 ###
import time
import requests
import re
import lxml  # only needed as the parser backend for BeautifulSoup
import os
from bs4 import BeautifulSoup

###################### Global variables ##############################
url = 'https://www.mzitu.com/all'  # index page listing all galleries to crawl
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0',
           'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
           'Accept-Encoding': 'gzip',
           "Referer": "https://www.mzitu.com/all"
           }  # request headers
##############################################################
def get(url):  # send an HTTP GET request and return the page HTML
    a = requests.get(url, headers=headers)
    html = a.text
    return html

def main():
    soup = BeautifulSoup(get(url), 'lxml')  # parse the index page
    all_url = soup.find('div', class_='all').find_all('a')  # collect every gallery link into all_url
    for mulu in all_url:  # iterate over the gallery links
        if mulu.get_text() == '早期图片':  # skip the "early pictures" archive entry
            continue
        else:
            dict_mulu = {
                'title': mulu.get_text(),
                'link': mulu.get('href'),
                'ID': re.findall(r'\d+', mulu.get('href'))
            }  # gallery title, link and numeric ID
        mulu_id = dict_mulu['ID']   # check the ID against the list of finished downloads
        with open('已下载列表.txt', 'a+') as file:  # '已下载列表.txt' ("downloaded list") stores finished gallery IDs
            file.seek(0)
            txt = file.read().splitlines()
            aa = list(txt)
            wancheng = [True for a in mulu_id if a not in aa]
            if wancheng:
                mulu_url = dict_mulu['link']
                print('Downloading gallery:', mulu_url)
                soup2 = BeautifulSoup(get(mulu_url), 'lxml')  # parse the gallery page
                img_mulu = soup2.find("div", {"class": "main-image"}).find("img")['src']  # first image URL
                page = soup2.find_all("span")[9]  # the tenth <span> holds the page count
                max_page = page.get_text()
                os.chdir(img_dir)
                new_dir(dict_mulu['title'])
                for j in range(1, int(max_page) + 1):
                    next_img_page = mulu_url + '/' + str(j)
                    img_html = BeautifulSoup(get(next_img_page), 'lxml')
                    # image URL
                    img_url = img_html.find("div", {"class": "main-image"}).find("img")['src']
                    # image file name
                    img_name = dict_mulu['title'] + str(j)
                    # download the image
                    down(img_name, img_url)
                    print('Image URL: ', img_url)
                    time.sleep(yanshi)
                get_end(str(dict_mulu['ID']))
            else:
                print(str(dict_mulu['ID']) + ' already downloaded, skipping...')

def down(name, image):  # download one image into the current directory
    img = requests.get(image, headers=headers)
    if img.status_code == 200:
        print('Downloading image...', end='')
        with open(name + '.jpg', 'wb') as f:
            f.write(img.content)

def new_dir(name):  # create a folder (if needed) and switch into it
    if os.path.exists(name):
        print('"%s" folder already exists' % name)
        os.chdir(name)
    else:
        print('Creating folder: {}'.format(name))
        os.mkdir(name)
        os.chdir(name)

def get_end(id):        # append a finished gallery ID to the downloaded list
    os.chdir(img_dir)
    with open('已下载列表.txt', 'a+') as ftxt:
        txt = id.strip("[']")   # the caller passes str(['12345']); strip the brackets and quotes
        ftxt.write(txt + '\n')

if __name__ == '__main__':
    print("####################################################################")
    print("# 开始执行脚本...                                                   #")
    print("# 支持断点重下,重新执行脚本即可...                                   #")
    print("#                                                       2019.11.30 #")
    print("####################################################################")
    img_dir = r'f:\学习资料'  # folder where downloaded images are stored
    new_dir(img_dir)
    yanshi = 0.3  # delay between image downloads (0.3 seconds)
    main()

############################## End 2019.11.30 ######################################
