美文网首页pythontrivial程序员
股票数据的网站抓取(4.3)代码优化

股票数据的网站抓取(4.3)代码优化

作者: 明慢慢 | 来源:发表于2015-07-26 20:17 被阅读735次

    进一步优化

    1. 使用phantomjs替代firefox,实现无浏览器界面,加快速度
    2. 去除上版对于浏览器布局部分代码

    优化后的结果

    基本上可以在5分钟内搞定上海的股票数据下载,比过去确实有大进步。

    #coding=utf-8
    from selenium import webdriver
    import time
    import os
    import re
    import sys  
    import threading
    import Queue
    from selenium.common.exceptions import NoSuchElementException
    def myinit():
        # Python 2 only: site.py removes sys.setdefaultencoding at startup;
        # reload(sys) restores it so implicit str<->unicode conversions use
        # UTF-8 instead of the ASCII default (avoids UnicodeDecodeError on
        # the Chinese page text handled below).
        reload(sys)  
        sys.setdefaultencoding('utf8')
    
    ########################
    #生成多个浏览器实例
    def makebr(number):
        """Create `number` headless PhantomJS browser instances.

        Returns a list of selenium webdriver objects, one per worker thread.
        """
        return [webdriver.PhantomJS('phantomjs') for _ in range(number)]
    
    #关闭所有浏览器
    def closebr(brs):
        """Quit every browser instance in the pool `brs`."""
        for browser in brs:
            browser.quit()
    #######################
    #获得指定股票相关数据
    def getinfo(mydriver,gourl):
        title='//*[@id="name"]'
        code='//*[@id="code"]'
        hexinshujuxpath="/html/body/div[14]/div[1]/div[4]/div[1]"
        restr=":".decode('utf8')
        myre=re.compile(restr,re.I|re.M|re.S)
        linetext=""
        errorinfo=""
        mydriver.get(gourl)
        try:
            gupiaoming=mydriver.find_element_by_xpath(title).text
            gupiaocode=mydriver.find_element_by_xpath(code).text
            hexinshuju=mydriver.find_element_by_class_name('pad5')
            shujuhang=hexinshuju.find_elements_by_tag_name('tr')
            for i in range(len(shujuhang)-2):
                shujulie=shujuhang[i].find_elements_by_tag_name('td')
                tmpshuju=myre.split(shujulie[0].text)
                linetext=linetext+"~"+tmpshuju[1]
            shuju=myre.split(shujuhang[8].text)
            linetext=linetext+"~"+shuju[1]
            tmpshuju=myre.split(shujuhang[9].text)
            linetext=linetext+"~"+tmpshuju[1]
            linetext="%s~%s%s"%(gupiaoming,gupiaocode,linetext)
            #print "数据:",linetext
        except  NoSuchElementException,e:
            #print "不是股票"
            pass
        except Exception:
            errorinfo= "非预期错误"+gourl
            print errorinfo
        finally:
            return linetext,errorinfo
    
    #获得所有股票链接
    def geturls(br):
        """Open the eastmoney stock-list page and collect all stock links.

        Returns (sh, sz): lists of href strings for the Shanghai and
        Shenzhen listings respectively.
        """
        def hrefs_of(container):
            # Every <a> inside the container is one stock; keep its href.
            return [a.get_attribute('href')
                    for a in container.find_elements_by_tag_name('a')]
        br.get("http://quote.eastmoney.com/stocklist.html")
        shanghai=br.find_element_by_xpath("/html/body/div[9]/div[2]/div/ul[1]")
        shenzhen=br.find_element_by_xpath("/html/body/div[9]/div[2]/div/ul[2]")
        return hrefs_of(shanghai),hrefs_of(shenzhen)
    
    #多线程执行用的函数
    def thread_getinfo(br,jobslink_queue,jieguo_queue,errorinfo_queue):
        while True:
            try:
                #获得队列里的地址
                url=jobslink_queue.get(False)  #False =Don't wait
            except Queue.Empty:
                print "完成退出"
                #br.quit()
                return
            #print url
            if(url!=None):
                (linetext,errorinfo)=getinfo(br,url)
                if(linetext!=""):
                    jieguo_queue.put(linetext)
                if(errorinfo!=""):
                    errorinfo_queue.put(errorinfo)
                    
    
    #######################
    #多线程控制函数
    def saveinfoabc(info_filename,error_filename,urllist):
    
        jobslink=Queue.Queue(0)
        jieguo=Queue.Queue(0)
        errorsinfo=Queue.Queue(0)
        
        for x in urllist[200:250]:#为测试方便这里只取了50,如果要全下载,取消[]就好
        #for x in urllist:
            jobslink.put(x)
        
        #启动线程
        for x in range(THREAD_NUM):
            t=threading.Thread(target=thread_getinfo,args=(brs[x],jobslink,jieguo,errorsinfo))
            t.start()
        
        f=open(info_filename,'w')
        e=open(error_filename,'w')
        mycount=0
        
        while (threading.activeCount()>1) or (not jobslink.empty()):
            while jieguo.qsize()>0 or errorsinfo.qsize()>0:
                if(jieguo.qsize()>0):
                    jieguotxt=jieguo.get()
                    f.write(jieguotxt+"\n")
                if(errorsinfo.qsize()>0):
                    error=errorsinfo.get()
                    e.write(error+"\n")
            mycount=mycount+1
            if(mycount%100)==0:
                print "%d:  活动线程:%d,剩余连接数:%d,结果剩余条数:%d,错误剩余条数:%d"%(mycount,threading.activeCount(),jobslink.qsize(),jieguo.qsize(),errorsinfo.qsize())
            time.sleep(0.01)
        e.close()
        f.close()
        
        print "数据下载完成"
    
    ######################
    # Script entry: set the UTF-8 default encoding, then collect every stock
    # link using a single temporary browser.
    myinit()
    
    #br=webdriver.Firefox()
    br=webdriver.PhantomJS('phantomjs')
    print "获得所有链接地址"
    (sh,sz)=geturls(br)
    br.quit()
    
    #info_filename='shinfo.txt'
    #error_filename='sherror.txt'
    # Browser pool shared by the worker threads -- saveinfoabc reads both
    # THREAD_NUM and brs as module-level globals.
    THREAD_NUM=10
    brs=makebr(THREAD_NUM)
    
    # Download Shanghai data, then Shenzhen data, then shut all browsers down.
    saveinfoabc('shinfo_for_008.txt','sherror_for_008.txt',sh)
    saveinfoabc('szinfo.txt','szerror.txt',sz)
    closebr(brs)
    
    

    相关文章

      网友评论

        本文标题:股票数据的网站抓取(4.3)代码优化

        本文链接:https://www.haomeiwen.com/subject/gkjmqttx.html