美文网首页
2018-07-11晚上改的

2018-07-11晚上改的

作者: 加勒比海带_4bbc | 来源:发表于2018-07-11 00:14 被阅读0次

    # coding=utf-8

    import requests

    from bs4 import  BeautifulSoup

    import re

    import pymongo

    import json

    import pandas as pd

    import numpy as np

    import xlrd

    import datetime

    def onedayPriceSpider(spiderDay):

        url = "http://www.hfzgncp.com.cn/index.php?m=content&c=index&a=lists&catid=59&sendtime="+str(spiderDay)+"&page=1"

        headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0)' +\

                  'Gecko/20100101 Firefox/61.0'}

        r = requests.get(url,headers=headers)

        html = r.text.encode(encoding='utf_8').decode()

        soup = BeautifulSoup(html,"lxml")

        table = soup.find('table', attrs={'class' :{'h_list_table r_list_table'}})

        #print(table)

        #查看表格数据行数

        height = len(table.findAll(lambda tag:tag.name=='tr' and

                                  len(tag.findAll('td'))>=1))

        print('height:'+str(height))

        #收集表头

        columns = [x.text for x in table.tr.findAll('th')]

        columns = [x.replace('\xa0','') for x in columns]

        #print(columns)

        #构造dataframe 准备存储表格

        width = len(columns)  #表头列数

        #print('width:'+str(width))

        df = pd.DataFrame(data = np.full((height,width),'',dtype = 'U'),columns = columns)

        rows = [row for row in table.findAll('tr') if row.find('td')!=None]

        #逐行分析表格

        for i in range(len(rows)):

            cells = rows[i].findAll('td')

            if len(cells) == width:

                df.iloc[i] = [cell.text.replace(' ','').replace('\n','') for cell in cells]  #去点空格和换行

            else:

                w=len(cells)

                df.iloc[i,width-w:] = [cell.text.replace(' ','').replace('\n','') for cell in cells]

        #print(df)

        sendtime = soup.find('input', attrs={'id' :{'sendtime'}})['value'].rstrip('/-') #获取数据时间

        sendtimeStr=re.sub("\-","",sendtime)

        #print(sendtimeStr)

        outputfilePath="D:/"+sendtimeStr+".xlsx"

        df.to_excel(outputfilePath)

        xlsx_data=xlrd.open_workbook(outputfilePath)

        xlsx_table=xlsx_data.sheet_by_name(u'Sheet1')

        columnIndex = None

        rowIndex = None 

        for j in range(xlsx_table.ncols):       

            for i in range(height):

                if(xlsx_table.cell_value(i, j) == '平均价'):

                    columnIndex = j

                    break       

                if(xlsx_table.cell_value(i, j) == '鲫鱼'):

                    rowIndex = i

                    break

        print(xlsx_table.cell_value(rowIndex, columnIndex))

        #price_data = []

        #data.append(sheet.row_values(i))

        #client= pymongo.MongoClient()

        #获取一个数据库

        #db=client.priceSpider

        #创建 或获取一个集合,并在collection下新建books

        #account=db.prcie

        #data=xlrd.open_workbook("D:/"+sendtimeStr+".xlsx")

        #table=data.sheets()[0]

        #读取excel第一行数据作为存入mongodb的字段名

        #rowstag=table.row_values(0)

        #nrows=table.nrows

        #print('-------------nrows----------------'+str(nrows))

        #ncols=table.ncols #print rows

        #returnData={}

        #for i in range(1,nrows):

            #将字段名和excel数据存储为字典形式,并转换为json格式

            #returnData[i]=json.dumps(dict(zip(rowstag,table.row_values(i))))

            #通过编解码还原数据

            #returnData[i]=json.loads(returnData[i])

            #print returnData[i]

            #account.insert(returnData[i])

            #return daySpider

    def getColumnIndex(table, columnName):

        columnIndex = None   

        for i in range(table.ncols):       

            if(table.cell_value(0, i) == columnName):

                columnIndex = i

            break

        return columnIndex

    spiderDaynumber =1

    for i in range(spiderDaynumber):

        spiderDay = datetime.date.today()- datetime.timedelta(days=i+1)   

        print(spiderDay)

        onedayPriceSpider(spiderDay)

    #xlsfile= 'F:/test_AutoTesting/TestCase/RunList.xlsx'

    #  table = readExcelDataByName(xlsfile, 'Sheet1')[0]

        #获取第一行的值   

      #  testcase_id = table.cell_value(1, getColumnIndex(table,'平均价'))

    相关文章

      网友评论

          本文标题:2018-07-11晚上改的

          本文链接:https://www.haomeiwen.com/subject/fircpftx.html