A simple crawler: scrape girl pictures to your heart's content

Author: 宋西厚 | Published 2018-11-08 16:54
    import os
    import re
    import redis
    import requests
    from hashlib import md5
    from urllib.parse import urljoin
    
    class My_Flie:
        # url: the start page; path: directory where downloaded images are saved;
        # redis_host/port/password: connection details for the Redis server
        def __init__(self, url, path=r'E:\img', redis_host='192.168.16.117', port=6379, password=None):
            self.redis_client = redis.Redis(host=redis_host, port=port, password=password)
            self.start_url = url
            self.request = requests.request
            self.headers = {
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3514.0 Safari/537.36'
            }
            self.path = path
        # Download a page or an image and return the response
        def download_html(self, url, method='get'):
            response = self.request(url=url, method=method, headers=self.headers)
            return response

        # Parse the page: extract page links and .jpg image links and push new ones
        # onto a Redis list; a Redis set is used for de-duplication
        def parse_html(self, response, source_url):
            page_url_list = re.findall(r'(?<=href=").*?(?=")', response.text)
            img_url_list = re.findall(r'(?<=src=").*?jpg(?=")', response.text)
    
            # Page links and image links get the same handling, so process them in one loop
            for url in page_url_list + img_url_list:
                if not url.startswith('http'):
                    url = urljoin(source_url, url)
                if self.fliter_request(url):
                    self.redis_client.lpush('my_flie:request_url_list', url)
    
    
        # Save the image under an MD5-based file name
        def save_img(self, response, url):
            with open(os.path.join(self.path, f'{self.md5_url(url)}.jpg'), 'wb') as f:
                f.write(response.content)
    
        # sadd returns 1 if the URL has not been seen before and 0 otherwise,
        # so the return value doubles as a "should this URL be crawled?" flag
        def fliter_request(self, url):
            flag = self.redis_client.sadd('my_flie:set', url)
            return flag
    
        def run(self):
            response = self.download_html(self.start_url)
            self.parse_html(response, self.start_url)
            count = 1
            while True:
                try:
                    raw_url = self.redis_client.rpop('my_flie:request_url_list')
                    if raw_url is None:
                        # The queue is empty, so the crawl is finished
                        break
                    url = raw_url.decode()
                    response = self.download_html(url)
                    if url.endswith('html'):
                        self.parse_html(response, url)
                        print(f"Downloaded page {url}")
                    else:
                        self.save_img(response, url)
                        print(f"Downloaded image {count}")
                        count += 1
                except Exception as e:
                    print(e)
    
    
        # Use the MD5 hex digest of the URL as a stable file name
        def md5_url(self, url):
            m = md5()
            m.update(url.encode())
            return m.hexdigest()

        # Remove the request queue and the de-duplication set from Redis
        def close(self):
            self.redis_client.delete('my_flie:request_url_list')
            self.redis_client.delete('my_flie:set')
    
    
    if __name__ == "__main__":
        url = 'http://www.meizitu.com'   # Replace this with the site you want to crawl
        me = My_Flie(url)
        me.run()
    
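The `close` method is defined above but never called in the entry point, so the Redis request queue and the seen-URL set linger between runs. Below is a minimal usage sketch of a tidier entry point; the local Redis host `127.0.0.1` and the Windows save path are assumptions for illustration, so adjust them to your own environment.

    # Minimal usage sketch (assumed Redis host and save path; adjust as needed)
    if __name__ == "__main__":
        crawler = My_Flie('http://www.meizitu.com',
                          path=r'E:\img',
                          redis_host='127.0.0.1')
        try:
            crawler.run()
        finally:
            # Clear the request queue and the seen-URL set so the next run starts fresh
            crawler.close()

Calling `close` inside a `finally` block means the cleanup also runs if the crawl is interrupted with Ctrl+C.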
