Scraping images from wall.alphacoders.com


Author: EZ | Published 2020-05-26 10:43

    Result:


    requests turned out to be pretty fast.

    issue: I planned to download with Scrapy, but it produced no errors and no results; adding the cookie to the header in settings did not help either, while the same request succeeded with the Requests library on its own.
    solution: most likely because the URL contains a ? that I had not escaped in the link pattern...

    learned: declaring global variables; keep studying BS4

    1. Fetching images with requests

    Plain requests needs quite a lot of code; a framework really is more comfortable.
    The demo below uses the Natsume-related wallpaper listing 'https://wall.alphacoders.com/tags.php?tid=45523&page=1'.

    issue: not all images were downloaded
    solution: I had baked 'jpg' into the search regex, but some images have a png extension

    issue: requests are issued sequentially, one at a time
    solution: learn to use multiprocessing (see section 2)

    issue: only the images loaded in the page are downloaded; the higher-resolution version sits behind a download button
    solution: the file obtained by clicking download manually has the same size as the one downloaded with requests, so the rewritten URLs already point at the full-size files (a quick way to verify this is sketched below). A Scrapy version is attempted in section 3.
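
    One quick way to back up that size comparison, without re-downloading, is to ask the server for the Content-Length of the full-size URL and compare it with the file already on disk. A minimal sketch, assuming the requests library and the HEADERS dict defined below; pic_url and local_file are placeholders:

    import os
    import requests

    def check_size(pic_url, local_file, headers):
        # HEAD request: fetch only the response headers, not the image body.
        resp = requests.head(pic_url, headers=headers, allow_redirects=True)
        remote = int(resp.headers.get('Content-Length', 0))
        local = os.path.getsize(local_file)
        print('remote:', remote, 'bytes / local:', local, 'bytes',
              'match' if remote == local else 'differ')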

    # coding: utf-8
    #usage: python script
    
    import requests,re,os
    from lxml import etree
    from multiprocessing import pool
    from bs4 import BeautifulSoup
    
    HEADERS = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'Cookie': '__cfduid=d863900ab9b493ee3cefa677797c43c121589890737; cookieconsent_status=allow; wa_session=so9sub6dc51vsemee397th2ncttu15sq59vt3f59skc84a810os4d5cv0dk76q4649s8at2mlhalrrc24cbhpjqlhqk8923hntjmh20'
    }
    
    def get_all_page_url():  # collect the URLs of all listing pages
        first_page = input("please input the first page url, as below \n https://wall.alphacoders.com/tags.php?tid=45523&page=1 \n : "
                        )
        pages_num = input("please input the max page num shown on the page, or any smaller number you like \n: ")
    
        main_url = '='.join(first_page.split("=")[:-1])  # the URL contains two '=' signs, so rejoin everything except the last piece
        all_pages_url = [main_url + "=" +str(p+1) for p in range(int(pages_num))]  # build each listing-page URL
        #print(all_pages_url)
        return all_pages_url
    
    def get_html(url):  # request a listing page
        try:
            res = requests.get(url,headers=HEADERS)
            print(res.status_code)
            res.raise_for_status()
            res.encoding = res.apparent_encoding  # had misspelled 'apparent' here earlier, which took a while to track down
            return res.text
        except Exception:
            print("can't get html")
            return ""   
            
    def parse_page(ilt,html):  # extract image URLs with a regex
        try:
            srcs = re.findall(r'data-src="https://images[\d]?\.alphacoders.com/.*\..+?"',html)
            for i in srcs:
                src = i.split("=")[1].strip("\"")  #get pic thumb url 
                src = src.replace('thumb-350-','')  #get hd pic url
                #print(src)
                ilt.append(src)
        except Exception:
            print("failed to extract image urls")
        else:
            print("process success")
                
    
    def down_pic(ilt):
        j = 0
        for i in ilt:
            print('requesting', i)
            html = requests.get(i,headers=HEADERS)
            title = i.split('/')[-1]
            print("downloading image", str(j+1), ":", title)
            with open(path+title,'wb') as f:
                f.write(html.content)
            j+=1
        print('total images downloaded:', j)
    
    def main():    # entry point for the sequential version
        path1 = input('Please input the save path, e.g. \n "C:\\Users\\Acer\\Desktop\\xxmu" \n (on Windows you can copy it from the folder address bar) \n: ')
        global path
        path = path1.strip("\"") + "\\"  # strip surrounding quotes and make sure the path ends with a backslash
        print("files will be saved to", str(path), '\n')
        ilt = []
        all_pages_url = get_all_page_url()
        all_pages_url_num = len(all_pages_url)
        
        for single_page_url in all_pages_url:
            page_number = single_page_url.split("=")[-1]
            html = get_html(single_page_url)
            parse_page(ilt,html)
        all_pics_number = len(ilt)
        print(str(all_pics_number),"pics are found")
        down_pic(ilt)  # comment this out first to check that the number of links looks right
            
    if __name__ == '__main__':
        import time
        st = time.time()
        main()
        time.sleep(1)
        et = time.time()
        print("用时",str(et-st),"s")
    

    2. Requests + multiprocessing

    The second argument to pool.map must be an iterable.
    87 images, about 423 MB, took roughly 155 s.
    issue: declaring a global variable (for the save path) failed
    solution: not figured out yet (one possible cause and a workaround are sketched below)
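
    A likely cause: on Windows, multiprocessing starts its worker processes with the "spawn" method, so a global assigned at run time in the parent (for example from input()) simply does not exist in the children. One workaround is to stop relying on the global and pass the path explicitly with functools.partial. A sketch under that assumption; the save_dir and headers parameters are additions, not part of the original script:

    import os
    import functools
    import multiprocessing as mp
    import requests

    def down_single_pic(single_pic_url, save_dir, headers):
        # Same job as down_single_pic below, but the target directory and headers
        # are passed in as arguments instead of being read from module globals.
        title = single_pic_url.split('/')[-1]
        resp = requests.get(single_pic_url, headers=headers)
        with open(os.path.join(save_dir, title), 'wb') as f:
            f.write(resp.content)

    def down_pic(ilt, save_dir, headers):
        with mp.Pool() as pool:
            # functools.partial binds the extra arguments, so pool.map still hands
            # each worker a single URL from the iterable.
            pool.map(functools.partial(down_single_pic, save_dir=save_dir, headers=headers), ilt)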

    #coding: utf-8
    #usage: python script
    
    import requests,re,os
    from bs4 import BeautifulSoup
    import multiprocessing as mp
    
    HEADERS = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'Cookie': '__cfduid=d863900ab9b493ee3cefa677797c43c121589890737; cookieconsent_status=allow; wa_session=so9sub6dc51vsemee397th2ncttu15sq59vt3f59skc84a810os4d5cv0dk76q4649s8at2mlhalrrc24cbhpjqlhqk8923hntjmh20'
    }
    # getting the folder path: see the commented-out block under __main__ below
    
    def get_all_page_url():  # collect the URLs of all listing pages
        first_page = input("please input the first page url, as below \n https://wall.alphacoders.com/tags.php?tid=45523&page=1 \n : "
                        )
        pages_num = input("please input the max page num shown on the page, or any smaller number you like \n: ")
    
        main_url = '='.join(first_page.split("=")[:-1])  # the URL contains two '=' signs, so rejoin everything except the last piece
        all_pages_url = [main_url + "=" +str(p+1) for p in range(int(pages_num))]  # build each listing-page URL
        #print(all_pages_url)
        return all_pages_url
    
    def get_html(url):  # request a listing page
        try:
            res = requests.get(url,headers=HEADERS)
            print(res.status_code)
            res.raise_for_status()
            res.encoding = res.apparent_encoding  # had misspelled 'apparent' here earlier, which took a while to track down
            return res.text
        except Exception:
            print("can't get html")
            return ""   
            
    def parse_page(ilt,html):  # extract image URLs with a regex
        try:
            srcs = re.findall(r'data-src="https://images[\d]?\.alphacoders.com/.*\..+?"',html)
            for i in srcs:
                src = i.split("=")[1].strip("\"")  #get pic thumb url 
                src = src.replace('thumb-350-','')  #get hd pic url
                #print(src)
                ilt.append(src)
        except Exception:
            print("failed to extract image urls")
        else:
            print("all pics url got")
                
    
    def down_single_pic(single_pic_url):
        print('requesting download:', single_pic_url)
        html = requests.get(single_pic_url,headers=HEADERS)
        title = single_pic_url.split('/')[-1]
        
        with open(title,'wb') as f:  # saved into the current working directory
            f.write(html.content)
    
        
                
    def down_pic(ilt):
        pool = mp.Pool()
        pool.map(down_single_pic, ilt)  # the second argument to map must be an iterable
    
    def main():    # entry point for the multiprocessing version
        ilt = []
        all_pages_url = get_all_page_url()
        all_pages_url_num = len(all_pages_url)
        
        for single_page_url in all_pages_url:
            page_number = single_page_url.split("=")[-1]
            html = get_html(single_page_url)
            parse_page(ilt,html)
        all_pics_number = len(ilt)
        print(str(all_pics_number),"pics are found")
        down_pic(ilt)  # comment this out to check that the number of links looks right
            
    if __name__ == '__main__':
        # path1 = input('Please input the save path, e.g. \n "C:\\Users\\Acer\\Desktop\\xxmu" \n: ')
        # global path
        # path = path1.strip("\"") + "\\"
        # print("files will be saved to", str(path), '\n')
        # (declaring the save path as a global here never reached the worker processes; see the note and sketch above)
    
        import time
        st = time.time()
        main()
        time.sleep(1)
        et = time.time()
        print("用时",str(et-st),"s")
    
    

    3. Downloading with Scrapy

    Following the video tutorial earlier worked, but writing it myself still hit quite a few problems. 121 images, 64 MB, finished in under 2 minutes.
    issue: extracting the URL attribute of the img tags returned an empty list
    solution: the attribute was not in the response; inspect the tag first to see whether it actually carries that attribute and whether it appears in the page source at all
    The CrawlSpider workflow and the pipeline are shown below.


    (Figure: CrawlSpider workflow)

    1. Create the project and files

    scrapy startproject xxmu
    cd xxmu                                        # enter the project folder, then generate the spider
    scrapy genspider --list                        # list the available spider templates
    scrapy genspider -t crawl xxmu_spider wall.alphacoders.com
    

    2. Spider configuration
    Running it at first produced no results: the requests need to carry the login cookies (see the settings in section 3).

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    
    
    class XxmuSpiderSpider(CrawlSpider):
        name = 'xxmu_spider'
        allowed_domains = ['wall.alphacoders.com']
        start_urls = ['https://wall.alphacoders.com/by_sub_category.php?id=266516&name=Natsume%27s+Book+of+Friends+Wallpapers']
    
        rules = (
            # Escape the '.', '?' and '+' in the link pattern; the unescaped '?' was
            # why the spider silently produced nothing (see the note at the top).
            Rule(LinkExtractor(allow=r'https://wall\.alphacoders\.com/by_sub_category\.php\?id=266516&name=Natsume%27s\+Book\+of\+Friends\+Wallpapers.*'),
                 callback='parse_xxmu',
                 follow=True),
    
        )
    
    
        def parse_xxmu(self, response):
            # If this comes back empty, check the page source first: the attribute
            # may be data-src rather than src (see the issue noted above).
            srcs = response.xpath('//div[@class="thumb-container"]//a/img/@src').getall()
            for src in srcs:
                print(src)
    

    3. Settings configuration
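
    The notes imply three settings changes: send the login Cookie header with every request, register an images pipeline, and tell it where to store files. A rough sketch using Scrapy's standard setting names; the pipeline class XxmuImagesPipeline (see section 4) and the storage path are assumptions, and the cookie value is whatever the browser shows for the logged-in session:

    # settings.py (sketch)
    ROBOTSTXT_OBEY = False
    COOKIES_ENABLED = False          # send the raw Cookie header below instead of managed cookies
    DEFAULT_REQUEST_HEADERS = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'Cookie': 'wa_session=...',  # copy the session cookie from the browser
    }
    ITEM_PIPELINES = {
        'xxmu.pipelines.XxmuImagesPipeline': 300,
    }
    IMAGES_STORE = r'C:\Users\Acer\Desktop\xxmu'   # where ImagesPipeline saves the files

    With that in place the spider is run from the project folder with scrapy crawl xxmu_spider.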

    4. Pipeline


    return request_objcs: the requests collected in request_objcs are returned and issued one by one.

    The storage path and file name used in file_path are computed rather than hard-coded.
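
    Those two notes match the two methods usually overridden on Scrapy's stock ImagesPipeline: get_media_requests returns one Request per image URL, and file_path computes the name under which the file is stored. A minimal sketch; the class name XxmuImagesPipeline and the item field image_urls are assumptions rather than something shown in the post:

    # pipelines.py (sketch)
    from scrapy import Request
    from scrapy.pipelines.images import ImagesPipeline

    class XxmuImagesPipeline(ImagesPipeline):

        def get_media_requests(self, item, info):
            # Build and return one download request per image URL found by the spider;
            # Scrapy then issues these requests one by one.
            return [Request(url) for url in item.get('image_urls', [])]

        def file_path(self, request, response=None, info=None, *, item=None):
            # Compute the stored filename from the URL, e.g. .../1059510.jpg -> 1059510.jpg
            return request.url.split('/')[-1]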
