Python Crawler: Scraping Girl Pictures from Jandan.net

Author: weizujie | Published 2017-08-17 23:20

    Preface

    Thanks to my senior @Cstances for the help.

    Here's the code:

    import re
    import os
    import requests
    from bs4 import BeautifulSoup
    
    '''
    Step 1: get the image links on a single page
    Step 2: get the current page number
    Step 3: collect the image links from all pages
    Step 4: save the images
    '''
    
    def get_images(url):
        """Get the image links on a single page."""
        headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Mobile Safari/537.36'}
        html = requests.get(url, headers=headers)
        html.encoding = 'utf-8'
        soup = BeautifulSoup(html.text, 'lxml')

        single_page_imgurls = []  # image links found on the current page
        div_a = soup.find('ol', {'class': 'commentlist'}).find_all('a', href=re.compile(r'//(.*?\.jpg)'))  # <a> tags pointing at .jpg files
        for a_tag in div_a:
            single_page_imgurls.append('http:' + a_tag['href'])  # hrefs are protocol-relative, so prepend "http:"
        return single_page_imgurls
    
    def get_pages(url):
        """Get the current (newest) page number from the front page."""
        headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Mobile Safari/537.36'}
        html = requests.get(url, headers=headers)
        html.encoding = 'utf-8'
        soup = BeautifulSoup(html.text, 'lxml')

        # The pager shows the current page as <span class="current-comment-page">[N]</span>
        pattern = re.compile(r'<span class="current-comment-page">\[(.*)\]</span>')
        pages_now = pattern.search(str(soup)).groups()[0]
        return pages_now
    
    def get_all_images(max_pages):
        """Collect the image links from the newest max_pages pages."""
        all_images_url = []  # image links from every page crawled
        page_num = int(get_pages('http://jandan.net/ooxx'))
        for page in range(page_num, page_num - max_pages, -1):  # pages are crawled from newest to oldest
            url = 'http://jandan.net/ooxx/page-' + str(page) + '#comments'
            all_images_url.extend(get_images(url))  # append this page's links to all_images_url
        return all_images_url
    
    def save_images(url, dir_name='ooxx'):
        """Download one image into dir_name, named after the last segment of its URL."""
        if not os.path.exists(dir_name):
            os.mkdir(dir_name)
        with open(dir_name + os.sep + url.split('/')[-1], 'wb') as fp:
            fp.write(requests.get(url).content)
    
    
    
    def main():
        end_page = int(input('How many pages to download: '))
        all_images_url = get_all_images(end_page)
        for img_url in all_images_url:
            save_images(img_url)
    
    if __name__ == '__main__':
        main()
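
    One caveat about `save_images` above: it fetches each image without a User-Agent header or any error handling, so a single failed request (or a server that rejects bare requests) stops the whole run. Below is a minimal, more defensive sketch; it is my own variation rather than part of the original script, and the header string and timeout value are just reasonable defaults:

    import os
    import requests

    HEADERS = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Mobile Safari/537.36'}

    def save_images_safe(url, dir_name='ooxx'):
        """Download one image, skipping (rather than crashing on) failed requests."""
        os.makedirs(dir_name, exist_ok=True)  # no error if the directory already exists
        file_path = os.path.join(dir_name, url.split('/')[-1])
        try:
            resp = requests.get(url, headers=HEADERS, timeout=10)  # reuse the crawler's User-Agent
            resp.raise_for_status()  # treat 4xx/5xx responses as errors
        except requests.RequestException as exc:
            print('skipping {}: {}'.format(url, exc))
            return
        with open(file_path, 'wb') as fp:
            fp.write(resp.content)

    Swapping this in for `save_images` inside `main()` leaves the rest of the script unchanged.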
    
    Result screenshot (image not reproduced here)
