Fetching Related Baidu Search Results

Author: g0 | Published 2018-01-18 09:28 | Read 147 times
    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    
    import re
    import sys
    import time
    import random
    import requests
    from urllib import quote
    from lxml import etree
    from pymongo import MongoClient
    
    
    
    # Date string (YYYY-MM-DD) used as the upsert key for each day's results.
    add_time = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    class MongoDB(object):
        """Thin wrapper that opens a MongoClient connection to one database."""

        def __init__(self, host='localhost', port=27017, database='bashiniandai'):
            """Constructor."""
            self.host = host
            self.port = port
            self.database = database
            self.conn = MongoClient(self.host, self.port)
            # Despite the name, self.coll is a Database object; collections are
            # selected from it later, e.g. Mongo.coll[collection_name].
            self.coll = self.conn[self.database]
      
    
    Mongo = MongoDB('127.0.0.1', 27017, 'wangzuxian')  
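
    # A minimal read-back sketch (assumptions: a local mongod on 127.0.0.1:27017,
    # and the collection/field names used by get_urls further down):
    #
    #   for doc in Mongo.coll['"澳门"'].find({"add_time": add_time}):
    #       print doc.get("results")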
    
    
    class baidu:

        '''Crawler that scrapes Baidu search results.'''
    
    
        #################################################  Initialize the URL  #######################################################
        def __init__(self, keyword, total_pages):

            self.keyword = keyword  # kept so get_urls() can pick the MongoDB collection
            self.url = u'https://www.baidu.com/baidu?wd=' + quote(keyword) + '&tn=monline_dg&ie=utf-8'

            self.m = 1  # page counter, printed by run() as a progress indicator

            self.item = []
            self.html = ''
            self.total_pages = total_pages
            self.current_page = 0
            self.next_page_url = ''
            self.timeout = 60  # default timeout: 60 seconds
    
        def set_current_url(self, url):
            '''Set the current URL.'''
            self.url = url

        def switch_url(self):
            '''Switch the current URL to the next page's URL;
               exit the program if there is no next page.'''
            if self.next_page_url == '':
                sys.exit()
            else:
                self.set_current_url(self.next_page_url)
    
    
        def is_finish(self):
            '''Return True once the requested number of pages has been crawled.'''
            return self.current_page >= self.total_pages
        #################################################  Pick a user agent at random  #######################################################
        def get_user_agent(self):
            user_agents = [
                "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
                "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0",
                "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
                "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
                "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
                "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
                "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
                "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
                "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
                "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
                "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
                "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
                "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
                "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
                "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20"]
            return random.choice(user_agents)
    
        #################################################  Resolve the real URL  #######################################################
        def get_real(self, o_url):
            '''Baidu result links are redirectors; follow one hop to find the target.'''
            try:
                r = requests.get(o_url, timeout=self.timeout, allow_redirects=False)  # do not follow redirects automatically
                if r.status_code == 302:
                    return r.headers.get('location', o_url)  # the address the redirect points to
            except requests.RequestException:
                pass
            return o_url  # fall back to the original address
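
        # Usage sketch (the link below is a schematic placeholder): Baidu wraps
        # each result in a redirect like http://www.baidu.com/link?url=..., and
        # get_real returns its Location target, or the input URL on any failure.
        #   real = self.get_real('http://www.baidu.com/link?url=...')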
    
    
        #################################################  Fetch the page  #######################################################
        def get_html(self):
            '''Fetch the page at the current URL and store its body in self.html.'''
            headersParameters = {  # HTTP headers sent with the request, to masquerade as a browser
                'Connection': 'Keep-Alive',
                'Accept': 'text/html, application/xhtml+xml, */*',
                'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
                'Accept-Encoding': 'gzip, deflate',
                'User-Agent': self.get_user_agent()  # rotate the User-Agent on each request
            }
            timeout = 5
            r = requests.get(self.url, timeout=timeout, headers=headersParameters)
            if r.status_code == 200:
                self.html = r.text
                self.current_page += 1
            else:
                self.html = u''
                print '[ERROR]', self.url, u'GET returned a non-200 status code'
    
        #################################################  Extract URLs from the page  #######################################################
        def get_urls(self):
            '''Parse the search results out of self.html and upsert them into MongoDB.'''
            title = []
            url = []
            self.get_html()  # get_html stores the page in self.html (it returns nothing)
            # Find the pager links (class="n"): page 1 has only "next",
            # later pages have both "previous" and "next".
            pager = re.findall(r' href="(/s\?wd=[\w\d%&=_\-]*?)" class="n"', self.html)
            if len(pager) == 0:
                self.next_page_url = ''  # no pager found; run() will stop via switch_url()
            elif len(pager) == 1:
                self.next_page_url = 'https://www.baidu.com' + pager[0]
            else:
                self.next_page_url = 'https://www.baidu.com' + pager[1]

            page = etree.HTML(self.html)
            links = page.xpath('//h3[@class="t"]/a')              # result titles
            for link in links:
                title.append(link.xpath('string(.)'))
            links = page.xpath('//div[@class="f13"]/a/@href')     # result links: two per entry
            for link in links:
                url.append(self.get_real(link))
            for i in range(len(title)):
                # Build a fresh dict per result; reusing one dict would leave the
                # list holding len(title) references to the same object.
                self.item.append({'title': title[i],
                                  'url': url[2*i],           # main result link
                                  'url_photo': url[2*i+1]})  # second link in the f13 block
            print self.item

            # $set expects a document, not a list, so the list is stored under one field.
            Mongo.coll[self.keyword].update({"add_time": add_time},
                                            {"$set": {"results": self.item}},
                                            upsert=True)
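
        # After one page, the stored MongoDB document looks roughly like this
        # (a sketch; field values depend on the live page):
        #   {"add_time": "2018-01-18",
        #    "results": [{"title": u"...", "url": "http://...", "url_photo": "http://..."}, ...]}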
    
        def run(self):
            while not self.is_finish():
                self.get_urls()
                self.switch_url()
                time.sleep(random.uniform(3.0, 6.0))  # random pause between pages to stay polite
                print self.m
                self.m = self.m + 1
    
    
    def start(key_word, page):
        c = baidu(key_word, page)
        c.run()

    # The embedded double quotes make Baidu treat the keyword as an exact phrase.
    start('"澳门"', 1)
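
To crawl more pages or a different keyword, only the final call needs to change. A minimal sketch (the keyword below is a placeholder; Baidu's pager markup must still match the regex in get_urls):

    # Crawl the first three result pages for a different exact-match phrase:
    start('"香港"', 3)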
    
