Crawler Learning 2

Author: l_b_n | Published 2017-04-17 14:36

    1. Goal

    Reuse the rough framework from Crawler Learning 1
    to scrape jokes (duanzi) from Qiushibaike (糗事百科).

    2. Implementation: rough workflow

    [Flow diagram: url_manager → downloader → parser → outputer]
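    Since the original flow diagram is not reproduced here, below is a minimal sketch of one crawl iteration. It assumes the modules from sections 2.1–2.5 are saved as url_manager.py, downloader.py, par.py and outputer.py (the names the spider in 2.4 imports); it is an illustration, not the spider itself.

    # Minimal sketch of one crawl iteration using the five modules below.
    import url_manager, downloader, par, outputer

    urls = url_manager.UrlManager()
    urls.add_new_url("http://www.qiushibaike.com/")
    while urls.has_new_url():
        page_url = urls.get_new_url()                       # take one unvisited URL
        html = downloader.DownLoader().download(page_url)   # fetch the raw page
        jokes = par.Pars_er().parse(html)                   # extract the joke texts
        for i, joke in enumerate(jokes, start=1):
            outputer.OutPuter().output_info(joke, 1, i)     # append each joke to xiubai.txt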

    2.1 url_manager

    class UrlManager(object):
        def __init__(self):
            self.new_urls = set()   # URLs waiting to be crawled
            self.old_urls = set()   # URLs already crawled

        def add_new_url(self, url):
            if url is None:
                return
            if url not in self.new_urls and url not in self.old_urls:
                self.new_urls.add(url)

        def get_new_url(self):
            new_url = self.new_urls.pop()  # take one URL from the pending set and remove it
            self.old_urls.add(new_url)
            return new_url

        def has_new_url(self):
            return len(self.new_urls) != 0

        def add_new_urls(self, urls):
            if urls is None or len(urls) == 0:
                return
            for url in urls:
                self.add_new_url(url)
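
    A quick usage sketch, assuming the class above is saved as url_manager.py (the file name section 2.4 imports):

    # Usage sketch for UrlManager; the sample URLs are only placeholders.
    from url_manager import UrlManager

    manager = UrlManager()
    manager.add_new_url("http://www.qiushibaike.com/")
    manager.add_new_urls(["http://www.qiushibaike.com/8hr/page/2/"])
    while manager.has_new_url():
        print(manager.get_new_url())   # each URL is handed out once, then marked as old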
    

    2.2 parser

    from bs4 import BeautifulSoup  # parsing is done with bs4
    import urllib
    import re  # imported, but regular expressions are not actually used

    class Pars_er(object):

        def _get_page_datas(self, soup):
            page_datas = []
            # every joke sits in a <div class="content">; find_all returns a list
            contents = soup.find_all('div', class_='content')
            for content in contents:
                page_datas.append(content.get_text())
            # the return value is a list of str, one entry per joke
            return page_datas

        def parse(self, do_content):
            if do_content is None:
                return
            soup = BeautifulSoup(do_content, 'html.parser', from_encoding='utf-8')
            new_data = self._get_page_datas(soup)
            return new_data
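
    As a quick illustration of what _get_page_datas does, here is a standalone sketch on a tiny made-up snippet (not the real Qiushibaike markup):

    # Standalone sketch: find_all('div', class_='content') on made-up HTML.
    from bs4 import BeautifulSoup

    html = '''
    <div class="content"><span>First joke text</span></div>
    <div class="content"><span>Second joke text</span></div>
    '''
    soup = BeautifulSoup(html, 'html.parser')
    texts = [div.get_text() for div in soup.find_all('div', class_='content')]
    print(texts)  # ['First joke text', 'Second joke text']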
    

    2.3 downloader

    import urllib.request
    import urllib

    class DownLoader(object):

        def download(self, url):
            # pretend to be a browser so the site does not reject the request
            user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
            headers = {'User-agent': user_agent}

            request = urllib.request.Request(url, headers=headers)
            html = urllib.request.urlopen(request)
            return html.read()   # raw bytes; BeautifulSoup decodes them later
    

    The downloader here sets a User-Agent header; this urllib.request usage is the typical Python 3.x style and differs from Python 2.
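
    For comparison, a rough sketch of what the Python 2 equivalent would look like (urllib2 was folded into urllib.request in Python 3); it is shown only to illustrate the difference and is not needed for this project:

    # Rough Python 2 equivalent, for comparison only.
    import urllib2

    headers = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    request = urllib2.Request("http://www.qiushibaike.com/", headers=headers)
    html = urllib2.urlopen(request).read()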

    2.4 xiubai_spider

    import downloader, par, outputer, url_manager
    #from parser import Pars_er
    #from outputer import OutPuter
    #from url_manager import UrlManager

    class SpiderMain(object):

        def __init__(self):
            self.urls = url_manager.UrlManager()
            self.downloader = downloader.DownLoader()
            self.parse = par.Pars_er()
            self.outpute = outputer.OutPuter()

        def crawl(self, root_url):
            count = 1
            self.urls.add_new_url(root_url)
            new_url = self.urls.get_new_url()
            number = input('the page you want to crawl? ')
            if int(number) == 1:
                phase = 0   # index of the joke within the page
                downloaded_content = self.downloader.download(new_url)
                new_datas = self.parse.parse(downloaded_content)
                print(len(new_datas))
                for new_data in new_datas:
                    phase += 1
                    self.outpute.output_info(new_data, count, phase)
            else:
                while count <= int(number):
                    phase = 0
                    downloaded_content = self.downloader.download(new_url)
                    new_datas = self.parse.parse(downloaded_content)
                    print(len(new_datas))
                    for new_data in new_datas:
                        phase += 1
                        self.outpute.output_info(new_data, count, phase)
                    count += 1
                    # pages after the first follow a fixed URL pattern (see below)
                    new_url = "http://www.qiushibaike.com/8hr/page/" + str(count) + "/?s=4974494"


    if __name__ == "__main__":
        root_url = "http://www.qiushibaike.com/"
        obj_spider = SpiderMain()
        obj_spider.crawl(root_url)
    

    Apart from the first page, the URLs of later pages follow an obvious pattern:
    "http://www.qiushibaike.com/8hr/page/"+str(count)+"/?s=4974494"
    where str(count) is the page number (greater than 1).
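
    A small illustrative sketch of generating the page URLs up front (the helper name page_urls is made up here; the s=4974494 query parameter is copied exactly as observed above, and whether it matters is not verified):

    # Build the URLs for pages 1..n; page 1 is the plain root URL,
    # later pages follow the pattern observed above.
    def page_urls(n):
        urls = ["http://www.qiushibaike.com/"]
        for count in range(2, n + 1):
            urls.append("http://www.qiushibaike.com/8hr/page/" + str(count) + "/?s=4974494")
        return urls

    print(page_urls(3))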

    2.5 outputer

    class OutPuter(object):
        def output_info(self, datas, num, duanzi):
            # num: page number, duanzi: index of the joke within that page
            file_name = 'xiubai.txt'
            with open(file_name, 'a', encoding='utf-8') as file_object:
                file_object.write(str(num).strip() + '.' + str(duanzi) + datas + '\n')
    

    One caveat on Windows: xiubai.txt must be UTF-8 encoded. Because open() is called in append mode with encoding='utf-8', it will create the file as UTF-8 if it does not exist; if you create the file yourself beforehand, make sure to save it as UTF-8.
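
    A quick check sketch, assuming the class above is saved as outputer.py (the file name section 2.4 imports): write one entry, then read it back with the same encoding.

    # Write a sample entry and read the file back to confirm the encoding.
    from outputer import OutPuter

    OutPuter().output_info('test joke text', 1, 1)
    with open('xiubai.txt', 'r', encoding='utf-8') as f:
        print(f.read())   # expected to contain a line starting with "1.1test joke text"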
