爬图片

作者: Vincy_ivy | 来源:发表于2019-08-06 13:11 被阅读0次

    根据 txt 文件中的名字列表爬取图片

    import os
    from icrawler.builtin import BingImageCrawler
    # Crawl Bing images for the star names listed in starName.txt (one name per
    # line). Each name gets its own storage directory under `path`.
    path = r'D:\pycharm_1\Image'
    # Read the names inside a `with` block so the file handle is always closed.
    with open('starName.txt', 'r') as f:
        lines = f.readlines()
    for i, line in enumerate(lines):
        # Only line indices 0..25 are processed.
        if i > 25:
            break
        # Line index 1 (the second line of the file) is deliberately skipped.
        if i == 1:
            continue

        name = line.strip('\n')
        # A blank line would otherwise be crawled with an empty keyword.
        if not name:
            continue
        file_path = os.path.join(path, name)
        bing_storage = {'root_dir': file_path}
        bing_crawler = BingImageCrawler(parser_threads=2, downloader_threads=4, storage=bing_storage)
        bing_crawler.crawl(keyword=name, max_num=50)
        print('第{}位明星:{}'.format(i, name))
    

     

    进入网站,根据输入的明星名字爬取图片

    import requests
    import re
    import os
    from pypinyin import pinyin, lazy_pinyin
    
    def getHTMLText(url):
        """Fetch `url` and return the response body as text.

        The response encoding is set from `apparent_encoding` before the text
        is decoded.

        Args:
            url: Address of the page to download.

        Returns:
            The page text, or None when the request fails (connection error,
            invalid URL, 30-second timeout, or an HTTP error status).
        """
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r.text
        except requests.RequestException as err:
            # Only request failures are handled here, so KeyboardInterrupt and
            # programming errors are no longer swallowed. The failure is
            # reported instead of printing an empty line.
            print("请求失败: {} ({})".format(url, err))
            return None
    
    
    def getPageUrls(text, name):
        """Find the album links on a listing page whose image alt text starts with `name`.

        Args:
            text: HTML of the listing page.
            name: Literal prefix expected at the start of each image's alt text.

        Returns:
            A list of (page_url, image_src) tuples, one per matching link, in
            document order.
        """
        # `name` is matched literally, so regex metacharacters in it are escaped.
        # [^"]+ keeps each capture inside its own attribute value, so several
        # links on the same line are not merged into one match.
        re_pageUrl = r'href="([^"]+)">\s*<img src="([^"]+)" alt="' + re.escape(name)
        return re.findall(re_pageUrl, text)
    
    
    def downPictures(text, root, name):
        """Download every album linked from one listing page.

        Each album found by getPageUrls is paired with the title taken from the
        matching image alt text (the part after `name`). The album's pictures
        are saved as 1.jpg, 2.jpg, ... in a directory named after the title
        under `root`. An album whose directory already contains files is
        skipped.

        Args:
            text: HTML of the listing page.
            root: Directory prefix (ending with a separator) for album folders.
            name: Star name used to locate album links and titles in `text`.

        Raises:
            IndexError: if an album page lacks the picture count, the download
                link, or the "next picture" link the scraper expects.
        """
        pageUrls = getPageUrls(text, name)
        # `name` is matched literally, so regex metacharacters in it are escaped.
        titles = re.findall(r'alt="' + re.escape(name) + r'(.+)" ', text)
        # zip stops at the shorter list, so a title/link count mismatch cannot
        # index past the end of either one.
        for pageInfo, title in zip(pageUrls, titles):
            pageUrl = pageInfo[0]
            path = root + title + "//"
            os.makedirs(path, exist_ok=True)
            # A non-empty directory is treated as an album already downloaded.
            if os.listdir(path):
                continue
            pageText = getHTMLText(pageUrl)
            if not pageText:
                # The album page could not be fetched; move on to the next album.
                continue
            # The original pattern had a stray ')' and raised re.error on use.
            totalPics = int(re.findall(r'<em>(.+?)</em>', pageText)[0])
            for cnt in range(1, totalPics + 1):
                downUrl = re.findall(r'href="(.+?)" class="">下载图片', pageText)[0]
                picPath = path + str(cnt) + ".jpg"
                r = requests.get(downUrl)
                # The with-block closes the file; no explicit close() is needed.
                with open(picPath, 'wb') as f:
                    f.write(r.content)
                print('{} - 第{}张下载已完成\n'.format(title, cnt))
                if cnt == totalPics:
                    # Last picture saved: do not request a further page.
                    break
                nextPageUrl = re.findall(r'href="(.+?)">下一张', pageText)[0]
                pageText = getHTMLText(nextPageUrl)
                if not pageText:
                    # The next picture page could not be fetched; stop this album.
                    break
        return
    
    
    def main():
        """Ask for a star's name and download all of that star's albums.

        The name is converted to pinyin to build the listing URL on
        win4000.com. Albums are saved under D://pycharm//<name>//. Listing
        pages are followed through their "next" link until none is left.
        """
        name = input("请输入你喜欢的明星的名字:")
        nameUrl = "http://www.win4000.com/mt/" + ''.join(lazy_pinyin(name)) + ".html"
        text = getHTMLText(nameUrl)
        # A failed fetch (None) is checked explicitly rather than relying on a
        # TypeError from re.findall. A "暂无...!" page now reports the same
        # message instead of exiting silently.
        if text is None or re.findall(r'暂无(.+)!', text):
            print("不好意思,没有{}的照片".format(name))
            return
        root = "D://pycharm//" + name + "//"
        # makedirs also creates D://pycharm when it does not exist yet.
        os.makedirs(root, exist_ok=True)
        # Non-greedy, so the capture stops at the end of the href value.
        nextLink = r'next" href="(.+?)"'
        try:
            downPictures(text, root, name)
            nextPages = re.findall(nextLink, text)
            while nextPages:
                nextText = getHTMLText(nextPages[0])
                if nextText is None:
                    # The next listing page could not be fetched; stop paging.
                    break
                downPictures(nextText, root, name)
                nextPages = re.findall(nextLink, nextText)
        except IndexError:
            # downPictures indexes regex results with [0], so a page missing an
            # expected link raises IndexError. As before, that ends the run.
            pass
        print("已全部下载完毕")
        return
    
    
    # Run the interactive downloader only when this file is executed as a script.
    if __name__ == '__main__':
        main()
    

    相关文章

      网友评论

          本文标题:爬图片

          本文链接:https://www.haomeiwen.com/subject/sfkbdctx.html