美文网首页
2018-07-16-spider1

2018-07-16-spider1

作者: 加勒比海带_4bbc | 来源:发表于2018-07-16 00:05 被阅读0次

    coding=utf-8

    import requests
    from bs4 import BeautifulSoup
    import re
    import os
    import pymongo
    import json
    import pandas as pd
    import numpy as np
    import xlrd
    import datetime
    from pyecharts import Line,Grid,EffectScatter,Overlap

    def getPriceSoup_table(spiderDay):
    soup_table=BeautifulSoup('',"lxml")
    for m in range(1,4):
    url = "http://www.hfzgncp.com.cn/index.php?m=content&c=index&a=lists&catid=59&sendtime="+str(spiderDay)+"&page="+str(m)
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0)' +
    'Gecko/20100101 Firefox/61.0'}
    r = requests.get(url,headers=headers)
    html = r.text.encode(encoding='utf_8').decode()
    soup = BeautifulSoup(html,"lxml")
    table = soup.find('table', attrs={'class' :{'h_list_table r_list_table'}})
    soup_table.append(table)
    if soup_table.find('td') == None:
    print('!!!!!!!!!!!!!!!!!!!!!!'+str(spiderDay)+'这一天没有水产品价格数据!!!!!!!!!!!!!!!!!!!!!!')
    return
    print(str(spiderDay)+'这一天有水产品价格数据')
    return soup_table

    def soup_table2DataFrame(soup_table): #构造dataframe 准备存储表格
    #收集表头
    columns = [x.text for x in soup_table.tr.findAll('th')]
    columns = [x.replace('\xa0','') for x in columns]
    #print('水产品表头列表:'+str(columns))
    #表的宽度
    width = len(columns)
    print('水产品表头数目:'+str(width))
    #height = len(soup_table.findAll(lambda tag:tag.name=='tr' and len(tag.findAll('td'))>=1))
    #print('height:'+height)
    rows=[]
    for row in soup_table.findAll('tr'):
    if row not in rows and row.find('td')!=None:
    rows.append(row)
    height = len(rows) #水产品种类数目
    if height <=0:
    return
    print('水产品种类数目:'+str(height))

    for i in range(height):

    cells = rows[i].findAll('td')

        #print(cells)
    df = pd.DataFrame(data = np.full((height,width),'',dtype = 'U'),columns = columns)
    #逐行分析表格
    for i in range(height):
        cells = rows[i].findAll('td')
        print(cells)
        if len(cells) == width:
            df.iloc[i] = [cell.text.replace(' ','').replace('\n','') for cell in cells]  #去点空格和换行
        else:
            w=len(cells)
            df.iloc[i,width-w:] = [cell.text.replace(' ','').replace('\n','') for cell in cells]
    return df
    

    def onedayPriceSpider(spiderDay):

    #查看表格数据行数
    #height = len(table.findAll(lambda tag:tag.name=='tr' and len(tag.findAll('td'))>=1))
    #rows = [row for row in table.findAll('tr') if row.find('td')!=None]
    #print('rows:'+str(len(rows)))
    
    #sendtime = soup.find('input', attrs={'id' :{'sendtime'}})['value'].rstrip('/-') #获取数据时间
    #sendtimeStr=re.sub("\-","",sendtime)
    

    def getPriceLine(product_name,days):
    price_data = []
    price_date = []
    for i in range(days):
    spiderDay = datetime.date.today()- datetime.timedelta(days=i+1)
    spiderDayStr =str(spiderDay) #2018-07-11格式
    sendtimeStr=re.sub("-","",spiderDayStr) #20180711格式
    #outputfilePath="D:/xlsx/"+sendtimeStr+".水产品价格xlsx"
    if os.path.exists(outputfilePath):
    ExcelFile = xlrd.open_workbook(outputfilePath)
    sheet = ExcelFile.sheet_by_index(0)
    columnIndex = None
    rowIndex = None
    for j in range(sheet.ncols):
    for i in range(sheet.nrows):
    if sheet.cell_value(i, j) == '平均价':
    columnIndex = j
    break
    if sheet.cell_value(i, j) == product_name:
    rowIndex = i
    break
    if not (rowIndex == None) and not (columnIndex == None):
    print(sheet.cell_value(rowIndex, columnIndex))
    price_data.append(sheet.cell_value(rowIndex, columnIndex))
    price_date.append(sendtimeStr)
    print(price_data)
    print(price_date)
    attr = price_date[::-1]
    v1 = price_data[::-1]
    line = Line(product_name+'价格走势图')
    line.add(product_name,attr,v1,is_smooth = True,mark_point = ['max','min'],mark_line=["average"],yaxis_formatter="元")
    grid =Grid()
    grid.add(line,grid_top="10%")

    判断是否存在目标文件夹

    isExists=os.path.exists('D:/价格走势图')
    if not isExists:
    

    如果不存在则创建目录

        os.makedirs('D:/价格走势图') 
    
        print ('创建D:/价格走势图文件夹成功')
    #return True
    else:
        # 如果目录存在则不创建,并提示目录已存在
    

    print ('目录已存在')

        grid.render('D:/价格走势图/'+product_name+str(days)+'天价格走势图.html')
        print('已得到'+product_name+'价格走势图')
    

    es = EffectScatter()

    es.add('',attr,v1,effect_scale=8) #闪烁

    overlop = Overlap()

    overlop.add(line) #必须先添加line,在添加es

    overlop.add(es)

    overlop.render('./line-es01.html')

    return  
        
    #client= pymongo.MongoClient()
    #获取一个数据库   
    #db=client.priceSpider
    #创建 或获取一个集合,并在collection下新建books
    #account=db.prcie
    #data=xlrd.open_workbook("D:/"+sendtimeStr+".xlsx")
    #table=data.sheets()[0]
    #读取excel第一行数据作为存入mongodb的字段名
    #rowstag=table.row_values(0)
    #nrows=table.nrows
    #print('-------------nrows----------------'+str(nrows))
    #ncols=table.ncols #print rows
    #returnData={}
    #for i in range(1,nrows):
        #将字段名和excel数据存储为字典形式,并转换为json格式
        #returnData[i]=json.dumps(dict(zip(rowstag,table.row_values(i))))
        #通过编解码还原数据
        #returnData[i]=json.loads(returnData[i])
        #print returnData[i]
        #account.insert(returnData[i])
        #return daySpider
    

    if name=="main":

    spiderDaynumber = 60 
    for i in range(spiderDaynumber):    
        spiderDay = datetime.date.today()- datetime.timedelta(days=i+1)
        spiderDayStr =str(spiderDay)  #2018-07-11格式 
        sendtimeStr=re.sub("\-","",spiderDayStr) #20180711格式
        soup_table = getPriceSoup_table(spiderDay)
        if not soup_table == None:
            df = soup_table2DataFrame(soup_table)
            isExists=os.path.exists('D:/xlsx')
            if not isExists:
        #         如果不存在则创建目录
                os.makedirs('D:/xlsx') 
                print ('创建D:/xlsx文件夹成功')
            #return True
            else:
                outputfilePath="D:/xlsx/"+sendtimeStr+"水产品价格.xlsx"
                df.to_excel(outputfilePath)
        print('-------------------------------')
    getPriceLine('带鱼(大)',59)
    

    相关文章

      网友评论

          本文标题:2018-07-16-spider1

          本文链接:https://www.haomeiwen.com/subject/rlvmpftx.html