美文网首页
爬妹子图

爬妹子图

作者: 交易狗二哈 | 来源:发表于2017-03-24 20:41 被阅读92次

    利用 BeautifulSoup + Requests 爬取 妹子图

    import requests
    import re, time, os
    from bs4 import BeautifulSoup
    
    # Listing pages 1 through 30 (the site had 92 listing pages when this was written).
    urls = list(map("http://www.meizitu.com/a/list_1_{}.html".format, range(1, 31)))
    
    # Request headers for pages served by www.meizitu.com (copied from a browser session).
    # NOTE(review): the Cookie and the conditional If-Modified-Since / If-None-Match
    # values are snapshots of one browser visit - confirm they are still wanted.
    headers1 = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'bdshare_firstime=1477909035413; Hm_lvt_a01ff1f91d0b936673f038453940cdb9=1477909035,1477909103; safedog-flow-item=F05CF6535242D231B430A78792F9D78D; CNZZDATA30056528=cnzz_eid%3D318212343-1488381613-null%26ntime%3D1488449174',
        'Host': 'www.meizitu.com',
        'If-Modified-Since': 'Tue, 21 Feb 2017 15:45:20 GMT',
        'If-None-Match': "6470d82598cd21:196c",
        'Referer': 'http://www.meizitu.com/a/list_1_1.html',
        # Was misspelled 'pgrade-Insecure-Requests', which sent a meaningless header.
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    }
    
    # The image files are hosted on a different site (mm.howkuai.com) that has its own
    # anti-scraping checks, so image downloads use a separate set of request headers.
    headers2 = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': '__jsluid=a0a09999bc1cd95bb78e3cfc51c0b9d8; safedog-flow-item=2676F109CF0E6A11F1AB1ADC63D76F97',
        'Host': 'mm.howkuai.com',
        'If-Modified-Since': 'Sat, 19 Nov 2016 20:12:20 GMT',
        'If-None-Match': "16808f3ba142d21:1527",
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    }
    
    def Get_url(url):
        """Collect the gallery links and gallery titles found on one listing page.

        Args:
            url: Address of a listing page; it is fetched with ``headers1``.

        Returns:
            A ``(links, titles)`` tuple of two equally long lists, where
            ``links[k]`` is the gallery page address and ``titles[k]`` is the
            ``alt`` text of that gallery's cover image with any ``<b>`` / ``</b>``
            markup removed.
        """
        links = []
        titles = []
        web_data = requests.get(url, headers=headers1)
        web_data.encoding = 'gb2312'    # decode the response text as GB2312
        soup = BeautifulSoup(web_data.text, 'lxml')
        # Every gallery on a listing page is wrapped in its own <div class="pic">.
        for entry in soup.find_all("div", class_="pic"):
            anchor = entry.find(target="_blank")    # the link to the gallery page
            cover = entry.find("img")                # its alt text carries the title
            if anchor is None or cover is None:
                continue
            href = anchor.get("href")
            alt = cover.get("alt")
            if href is None or alt is None:
                # Skip malformed entries as a pair so links and titles stay aligned.
                continue
            links.append(href)
            # Some titles are wrapped in <b>...</b>.  str.strip() removes a *set of
            # characters*, so it also ate leading/trailing 'b', '<', '>' and '/'
            # that belonged to the title; remove the literal tags instead.
            titles.append(re.sub(r'</?b>', '', alt))
        return links, titles
    
    def Get_picture_link(website, filename):
        """Download every image of one gallery page into a folder named *filename*.

        Args:
            website: Address of the gallery page that holds the images.
            filename: Gallery title; used as the name of the folder (created in
                the current working directory) that receives the images.

        Returns:
            None. Each image is written as ``<alt text>.jpg`` inside the folder;
            images whose download does not answer with HTTP 200 are skipped.
        """
        # exist_ok lets the script be re-run without crashing on folders that
        # an earlier run already created.
        os.makedirs(filename, exist_ok=True)
        time.sleep(4)                   # throttle requests to avoid an IP ban
        web_data = requests.get(website, headers=headers1)
        web_data.encoding = 'gb2312'
        soup = BeautifulSoup(web_data.text, 'lxml')
        # All gallery images live inside the single element with id="picture".
        container = soup.find(id="picture")
        images = container.find_all("img") if container is not None else []
        for index, image in enumerate(images, start=1):
            src = image.get("src")
            if not src:
                continue
            # Fall back to the position when an image carries no alt text.
            title = image.get("alt") or str(index)
            time.sleep(2)
            picture = requests.get(src, headers=headers2)
            if picture.status_code != 200:
                # Previously the write ran even on failure, reusing a stale (or
                # undefined) path and clobbering the previous image.
                continue
            path = os.path.join(os.getcwd(), filename, title + '.jpg')
            with open(path, 'wb') as image_file:
                image_file.write(picture.content)
        print('完成了一个文件夹')
    
    
    if __name__ == '__main__':
        # Pipeline: listing page -> gallery page addresses -> image links.
        for listing_url in urls:
            gallery_links, gallery_titles = Get_url(listing_url)
            for gallery_link, gallery_title in zip(gallery_links, gallery_titles):
                Get_picture_link(gallery_link, gallery_title)
    
    
    

    相关文章

      网友评论

          本文标题:爬妹子图

          本文链接:https://www.haomeiwen.com/subject/xkafottx.html