
The Five Basic Modules of a Crawler Framework

Author: 静宸丶水默含声 | Published 2017-09-30 14:38

    Source code: https://github.com/fatezxl/reptileDemo

    I. Basic Components

    • URL manager:
      keeps track of the URLs that have already been crawled and the new URLs waiting to be crawled;
    • HTML downloader:
      downloads the pages; pay attention to the page encoding at this step;
    • Data store:
      writes the extracted data to a file, or stores it in a database;
    • HTML parser:
      parses the HTML with BeautifulSoup4, extracting new URLs and the useful information on the current page;
    • Crawler scheduler:
      coordinates the four components above; a minimal sketch of the overall flow follows this list.
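
    As a rough orientation, here is a minimal sketch of how the five modules cooperate. It is not code from the repo: it reuses the class and method names implemented in section II, uses the same entry URL as the scheduler in section 5, and, unlike the real scheduler, does not cap the number of crawled pages.

    #coding:utf-8
    # Sketch of the overall control flow, assuming the interfaces defined in section II.
    from URLManager import UrlManager
    from HtmlDownloader import HtmlDownloader
    from HtmlParser import HtmlParser
    from DataOutput import DataOutput

    manager = UrlManager()
    downloader = HtmlDownloader()
    parser = HtmlParser()
    output = DataOutput()

    root_url = 'http://desk.zol.com.cn/'
    manager.add_new_url(root_url)                # seed the URL queue
    while manager.has_new_url():                 # any uncrawled URLs left?
        url = manager.get_new_url()              # take one URL and mark it crawled
        html = downloader.download(url)          # fetch the page (None on failure)
        if html is None:
            continue
        new_urls, data = parser.parser(root_url, url, html)  # extract links and data
        manager.add_new_urls(new_urls)           # feed new links back into the queue
        output.store_data(data)                  # keep the record in memory
    output.output_html()                         # finally write everything to a file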

    II. Component Source Code Analysis

    1. URL Manager

    #coding:utf-8
    class UrlManager(object):
        '''
        URL manager
        '''
        def __init__(self):
            self.new_urls = set()  # URLs not yet crawled
            self.old_urls = set()  # URLs already crawled

        def has_new_url(self):
            '''
            Check whether there are any uncrawled URLs left
            :return:
            '''
            return self.new_url_size() != 0

        def get_new_url(self):
            '''
            Take one uncrawled URL and mark it as crawled
            :return:
            '''
            new_url = self.new_urls.pop()
            self.old_urls.add(new_url)
            return new_url

        def add_new_url(self, url):
            '''
            Add a new URL to the uncrawled set
            :param url: a single URL
            :return:
            '''
            if url is None:
                return
            if url not in self.new_urls and url not in self.old_urls:
                self.new_urls.add(url)

        def add_new_urls(self, urls):
            '''
            Add a batch of new URLs to the uncrawled set
            :param urls: an iterable of URLs
            :return:
            '''
            if urls is None or len(urls) == 0:
                return
            for url in urls:
                self.add_new_url(url)

        def new_url_size(self):
            '''
            Size of the uncrawled URL set
            :return:
            '''
            return len(self.new_urls)

        def old_url_size(self):
            '''
            Size of the crawled URL set
            :return:
            '''
            return len(self.old_urls)
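
    Used on its own, the manager behaves like this (a usage sketch; the URLs below are illustrative, taken from the target site used later in this article):

    #coding:utf-8
    from URLManager import UrlManager

    manager = UrlManager()
    manager.add_new_url('http://desk.zol.com.cn/')
    manager.add_new_urls(['http://desk.zol.com.cn/fengjing/',
                          'http://desk.zol.com.cn/fengjing/'])  # the duplicate is ignored
    while manager.has_new_url():
        url = manager.get_new_url()            # moves the URL into old_urls
        print 'processing: %s' % url
    print manager.old_url_size()               # 2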
    

    2. HTML Downloader

    #coding:utf-8
    import requests

    class HtmlDownloader(object):
        '''
        HTML downloader
        '''
        def download(self, url):
            if url is None:
                return None
            user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
            headers = {
                'User-Agent': user_agent
            }
            r = requests.get(url, headers=headers)
            if r.status_code == 200:
                # r.encoding = 'utf-8'
                # re-encode the decoded text as UTF-8 bytes so the parser sees
                # a consistent encoding regardless of the page's own charset
                return r.text.encode('utf-8')
            return None
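
    On a 200 response the downloader returns the page as UTF-8 encoded bytes, otherwise None. A minimal usage sketch (the URL is the same entry page used by the scheduler at the end of this article):

    #coding:utf-8
    from HtmlDownloader import HtmlDownloader

    downloader = HtmlDownloader()
    html = downloader.download('http://desk.zol.com.cn/')  # UTF-8 bytes, or None
    if html is not None:
        print 'fetched %d bytes' % len(html)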
    

    3. Data Store

    #coding:utf-8
    import codecs

    class DataOutput(object):
        '''
        Data store
        '''
        def __init__(self):
            self.datas = []

        def store_data(self, data):
            '''
            Keep a parsed record in memory
            :param data:
            :return:
            '''
            if data is None:
                return
            self.datas.append(data)

        def output_html(self):
            '''
            Write the stored records out in the target file format
            :return:
            '''
            fout = codecs.open('baike.html', 'w', encoding='utf-8')
            fout.write('<html>')
            fout.write("<head><meta charset='utf-8'/></head>")
            fout.write("<body>")
            fout.write("<table>")
            fout.write("<tr><th>this page</th><th>title</th><th>1366_768</th><th>1680_1050</th></tr>")

            print 'Number of stored records: %d' % len(self.datas)
            print '@#' * 30
            # iterate over a copy so removing items does not skip every other record
            for data in self.datas[:]:
                fout.write("<tr>")
                if data['page_url'] is not None:
                    fout.write("<td><a href='%s'>this_page</a></td>" % data['page_url'])
                    print data['page_url']
                if data['title'] != '':
                    fout.write("<td>%s</td>" % data['title'])
                    print data['title']
                if data['imgurl_1366_768'] != '':
                    fout.write("<td><a href='%s'>1366HD</a></td>" % data['imgurl_1366_768'])
                    print data['imgurl_1366_768']
                if data['imgurl_1680_1050'] != '':
                    fout.write("<td><a href='%s'>1680HD</a></td>" % data['imgurl_1680_1050'])
                    print data['imgurl_1680_1050']
                fout.write("</tr>")
                self.datas.remove(data)
            fout.write("</table>")
            fout.write("</body>")
            fout.write('</html>')
            fout.close()
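
    For example (the record below is made up to show the expected dict shape; the real records are produced by the HTML parser in the next section):

    #coding:utf-8
    from DataOutput import DataOutput

    output = DataOutput()
    output.store_data({
        'page_url': 'http://desk.zol.com.cn/bizhi/7171_88712_2.html',
        'title': u'example title',
        'imgurl_1366_768': 'http://desk.zol.com.cn/example_1366x768.html',  # hypothetical link
        'imgurl_1680_1050': ''
    })
    output.output_html()   # writes baike.html and empties the in-memory list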
    
    

    4. HTML Parser

    #coding:utf-8
    import re
    import urlparse
    from bs4 import BeautifulSoup

    class HtmlParser(object):
        '''
        HTML parser
        '''
        def parser(self, root_url, page_url, html_cont):
            '''
            Parse the page content and extract URLs and data
            :param root_url: root URL of the site, used to resolve image links
            :param page_url: URL of the downloaded page
            :param html_cont: downloaded page content
            :return: the new URLs and the extracted data
            '''
            if page_url is None or html_cont is None:
                return
            soup = BeautifulSoup(html_cont, 'html.parser')  # passing from_encoding='utf-8' as well triggers a warning
            new_urls = self._get_new_urls(page_url, soup)
            new_data = self._get_new_data(root_url, page_url, soup)
            print '***' * 30 + 'current page:'
            print page_url
            print '--' * 30 + 'URL set'
            print new_urls
            print '--' * 30 + 'data'
            print new_data
            return new_urls, new_data
    
        def _get_new_urls(self, page_url, soup):
            '''
            Extract the set of new URLs
            :param page_url: URL of the downloaded page
            :param soup: soup
            :return: the set of new URLs
            '''
            new_urls = set()
            # Extract the <a> tags we care about.
            # Note: these rules decide what gets crawled; they are meant to match
            # paths such as /fengjing/ and /bizhi/7171_88712_2.html
            links = soup.find_all('a', href=re.compile(r'/\w+/'))
            links_2 = soup.find_all('a', href=re.compile(r'/bizhi/\d+_\d+_\d+\.html'))
            for link in links:
                # take the href attribute
                new_url = link['href']
                # join it into an absolute URL
                new_full_url = urlparse.urljoin(page_url, new_url).encode('utf-8')
                new_urls.add(new_full_url)
            for link2 in links_2:
                # take the href attribute
                new_url2 = link2['href']
                # join it into an absolute URL
                new_full_url2 = urlparse.urljoin(page_url, new_url2).encode('utf-8')
                new_urls.add(new_full_url2)
            return new_urls
    
        def _get_new_data(self, root_url, page_url, soup):
            '''
            Extract the useful data
            :param root_url: root URL of the site
            :param page_url: URL of the downloaded page
            :param soup:
            :return: the extracted data
            '''
            data = {
                'title': '',
                'page_url': '',
                'imgurl_1366_768': '',
                'imgurl_1680_1050': ''
            }
            # URL of the requested page
            data['page_url'] = page_url

            # wallpaper title
            # TODO: the filtering here still has problems
            test = soup.find(id='titleName')
            print 'title'
            print test
            # print test.string
            if test is not None:
                print 'not None'
                print test.string

                title = soup.find(id='titleName').string
                print 'title string:'
                print title
                data['title'] = title

                # image links
                # 1366x768
                if soup.find(id='1366x768') is not None:
                    imgurl_1366_768 = soup.find(id='1366x768')['href']
                    new_full_url_1366_768 = urlparse.urljoin(root_url, imgurl_1366_768)
                    data['imgurl_1366_768'] = new_full_url_1366_768.encode('utf-8')
                # 1680x1050
                if soup.find(id='1680x1050') is not None:
                    imgurl_1680_1050 = soup.find(id='1680x1050')['href']
                    new_full_url_1680_1050 = urlparse.urljoin(root_url, imgurl_1680_1050)
                    data['imgurl_1680_1050'] = new_full_url_1680_1050.encode('utf-8')

            # Reference example for a different page structure:
            # title = soup.find('i', class_='business-icon').find('img')['alt']
            # data['title'] = title.get_text()
            # summary = soup.find('div', class_='lemma-summary')
            # # get_text() returns all text inside the tag, including its
            # # descendants, as a single Unicode string
            # data['summary'] = summary.get_text()
            return data
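
    Put together with the downloader, a single page can be parsed like this (illustrative only; the title and image fields stay empty unless the page actually contains the titleName / 1366x768 / 1680x1050 elements):

    #coding:utf-8
    from HtmlDownloader import HtmlDownloader
    from HtmlParser import HtmlParser

    root_url = 'http://desk.zol.com.cn/'
    downloader = HtmlDownloader()
    parser = HtmlParser()

    html = downloader.download(root_url)
    if html is not None:
        new_urls, data = parser.parser(root_url, root_url, html)
        print 'found %d candidate links' % len(new_urls)
        print data['title']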
    

    5. Crawler Scheduler

    #coding:utf-8

    # import the modules defined above
    from DataOutput import DataOutput
    from HtmlParser import HtmlParser
    from HtmlDownloader import HtmlDownloader
    from URLManager import UrlManager

    class SpiderMan(object):
        def __init__(self):
            self.manager = UrlManager()
            self.downloader = HtmlDownloader()
            self.parser = HtmlParser()
            self.output = DataOutput()

        def crawl(self, root_url):
            # add the entry URL
            self.manager.add_new_url(root_url)
            # keep going while the URL manager still holds new URLs and
            # fewer than 100 URLs have been crawled
            while self.manager.has_new_url() and self.manager.old_url_size() < 100:
                try:
                    # take a new URL from the URL manager
                    new_url = self.manager.get_new_url()
                    # download the page with the HTML downloader
                    html = self.downloader.download(new_url)
                    # extract URLs and data with the HTML parser
                    new_urls, data = self.parser.parser(root_url, new_url, html)
                    # feed the extracted URLs back into the URL manager
                    self.manager.add_new_urls(new_urls)

                    print 'URLs still to crawl: %d' % self.manager.new_url_size()

                    # hand the record to the data store
                    self.output.store_data(data)
                    print 'Crawled %s links so far' % self.manager.old_url_size()
                except Exception as e:
                    print 'crawl failed'
                    print e
            # finally, write the stored records out in the target format
            self.output.output_html()

    if __name__ == "__main__":
        spider_man = SpiderMan()
        spider_man.crawl("http://desk.zol.com.cn/")
    
