# coding=utf-8
import datetime
import json      # only needed by the commented-out MongoDB block below
import re

import numpy as np
import pandas as pd
import pymongo   # only needed by the optional MongoDB code below
import requests
import xlrd      # note: xlrd >= 2.0 dropped .xlsx support; pin xlrd < 2.0
from bs4 import BeautifulSoup
def onedayPriceSpider(spiderDay):
    url = ("http://www.hfzgncp.com.cn/index.php?m=content&c=index&a=lists"
           "&catid=59&sendtime=" + str(spiderDay) + "&page=1")
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) '
                             'Gecko/20100101 Firefox/61.0'}
    r = requests.get(url, headers=headers)
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, "lxml")
    table = soup.find('table', attrs={'class': 'h_list_table r_list_table'})
    # Count the data rows: every <tr> that contains at least one <td>.
    height = len(table.findAll(lambda tag: tag.name == 'tr' and
                               len(tag.findAll('td')) >= 1))
    print('height:' + str(height))
    # Collect the header row and strip non-breaking spaces.
    columns = [x.text.replace('\xa0', '') for x in table.tr.findAll('th')]
    width = len(columns)  # number of header columns
    # Build an empty DataFrame to hold the table.
    df = pd.DataFrame(data=np.full((height, width), '', dtype='U'), columns=columns)
    rows = [row for row in table.findAll('tr') if row.find('td') is not None]
    # Fill the DataFrame row by row, stripping spaces and newlines.
    for i in range(len(rows)):
        cells = rows[i].findAll('td')
        values = [cell.text.replace(' ', '').replace('\n', '') for cell in cells]
        if len(cells) == width:
            df.iloc[i] = values
        else:
            # Short rows (merged leading cells): right-align the values.
            df.iloc[i, width - len(cells):] = values
    # The hidden <input id="sendtime"> holds the date of the data.
    sendtime = soup.find('input', attrs={'id': 'sendtime'})['value'].rstrip('/-')
    sendtimeStr = re.sub('-', '', sendtime)
    outputfilePath = "D:/" + sendtimeStr + ".xlsx"
    df.to_excel(outputfilePath)
    # Re-open the exported sheet and look up the '平均价' (average price)
    # column of the '鲫鱼' (crucian carp) row.
    xlsx_data = xlrd.open_workbook(outputfilePath)
    xlsx_table = xlsx_data.sheet_by_name('Sheet1')
    columnIndex = None
    rowIndex = None
    for j in range(xlsx_table.ncols):
        for i in range(xlsx_table.nrows):
            if xlsx_table.cell_value(i, j) == '平均价':
                columnIndex = j
            elif xlsx_table.cell_value(i, j) == '鲫鱼':
                rowIndex = i
    if rowIndex is not None and columnIndex is not None:
        print(xlsx_table.cell_value(rowIndex, columnIndex))
    # --- Optional draft kept from the original: store the rows in MongoDB ---
    # client = pymongo.MongoClient()
    # db = client.priceSpider              # get (or create) the database
    # account = db.price                   # get (or create) the collection
    # data = xlrd.open_workbook("D:/" + sendtimeStr + ".xlsx")
    # table = data.sheets()[0]
    # rowstag = table.row_values(0)        # first Excel row -> field names
    # nrows = table.nrows
    # print('-------------nrows----------------' + str(nrows))
    # for i in range(1, nrows):
    #     # zip field names with each data row into a dict, then round-trip
    #     # through JSON to normalise the values
    #     returnData = json.loads(json.dumps(dict(zip(rowstag, table.row_values(i)))))
    #     account.insert(returnData)       # insert() is deprecated; see below
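# A hedged, modernised sketch of the MongoDB step commented out above, not
# part of the original flow. It assumes a local mongod instance; insert_many
# replaces the deprecated insert(), and df.to_dict('records') replaces the
# manual zip/JSON round trip.
def saveToMongo(df, dbName='priceSpider', collName='price'):
    client = pymongo.MongoClient()
    collection = client[dbName][collName]
    # One MongoDB document per scraped table row.
    collection.insert_many(df.to_dict('records'))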
def getColumnIndex(table, columnName):
    # Return the index of the column whose header (row 0) equals columnName,
    # or None if the header is not present.
    columnIndex = None
    for i in range(table.ncols):
        if table.cell_value(0, i) == columnName:
            columnIndex = i
            break
    return columnIndex
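# A hedged companion sketch (not in the original script): paired with a row
# lookup, getColumnIndex could replace the manual double loop inside
# onedayPriceSpider. getRowIndex and its searchCol parameter are assumptions;
# searchCol=1 presumes df.to_excel wrote the DataFrame index into column 0
# and the product name into column 1.
def getRowIndex(table, rowName, searchCol=1):
    for i in range(table.nrows):
        if table.cell_value(i, searchCol) == rowName:
            return i
    return None
# Example use:
# col = getColumnIndex(xlsx_table, '平均价')
# row = getRowIndex(xlsx_table, '鲫鱼')
# if col is not None and row is not None:
#     print(xlsx_table.cell_value(row, col))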
# Scrape the most recent spiderDaynumber days, starting from yesterday.
spiderDaynumber = 1
for i in range(spiderDaynumber):
    spiderDay = datetime.date.today() - datetime.timedelta(days=i + 1)
    print(spiderDay)
    onedayPriceSpider(spiderDay)
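# Hedged alternative for the read-back step: xlrd >= 2.0 no longer opens
# .xlsx files, so the exported sheet could instead be read with pandas
# (which uses openpyxl for .xlsx). The path below is illustrative, and the
# header name '品名' (product name) is an assumption about the site's table.
# df = pd.read_excel("D:/20180917.xlsx", index_col=0)
# print(df.loc[df['品名'] == '鲫鱼', '平均价'])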
# Draft kept from the original: read a test-case list with getColumnIndex.
# xlsfile = 'F:/test_AutoTesting/TestCase/RunList.xlsx'
# table = readExcelDataByName(xlsfile, 'Sheet1')[0]
# # read the value in the first data row of the '平均价' column
# testcase_id = table.cell_value(1, getColumnIndex(table, '平均价'))