1.解析csv文件
逐行阅读输入 datafile 文件,对于前 10 行(不包括标题),根据“,”拆分每行,然后为每行创建一个字典,键是字段的标题,值是该字段在该行中的值。函数 parse_file 应该返回一个字典列表,文件中的每个数据行是一个列表项。字段名称和值不应该包含多余的空白,例如空格或换行字符。可以使用 Python 字符串方法strip() 删除多余的空白。
对于这道练习,只需解析前 10 行数据,所以返回的列表应该有 10 项!
# Path to the Beatles discography CSV parsed by parse_file below.
datafile='beatles-diskography.csv'
def parse_file(datafile):
    """Parse the first 10 data rows of a CSV file by hand.

    The first line of the file is taken as the header. Each of the next
    (at most) 10 lines becomes a dict mapping stripped header names to
    stripped field values.

    Returns a list of those dicts, in file order.
    """
    data = []
    # Text mode ('r'), not 'rb': splitting on ',' requires str, not bytes,
    # on Python 3.
    with open(datafile, 'r') as f:
        header = [h.strip() for h in f.readline().split(',')]
        for rownum, line in enumerate(f):
            if rownum == 10:  # only the first 10 data rows are wanted
                break
            fields = [v.strip() for v in line.split(',')]
            data.append(dict(zip(header, fields)))
    return data
# Demo run: parse the first 10 data rows (return value not captured).
parse_file(datafile)
2.使用csv模块
import csv
# Same discography CSV, this time parsed with the csv module.
filename = 'beatles-diskography.csv'
def parse_file(filename):
    """Read a CSV file with csv.DictReader.

    Returns a list of dicts, one per data row, keyed by the header row.
    """
    # newline='' is the documented way to open files for the csv module on
    # Python 3 ('rb' only worked on Python 2).
    with open(filename, 'r', newline='') as f:
        reader = csv.DictReader(f)
        data = list(reader)
    return data
# Demo run of the csv.DictReader version (return value not captured).
parse_file(filename)
3.xlrd简介
import xlrd
datafile = "2013_ERCOT_Hourly_Load_Data.xls"  # the .xls workbook to load
def parse_file(datafile):
    """Demonstrate basic xlrd usage: read a sheet, inspect cells and dates.

    Prints several examples of cell access, then returns the entire first
    worksheet as a list of row lists.
    """
    workbook = xlrd.open_workbook(datafile)  # open the workbook
    sheet = workbook.sheet_by_index(0)       # first worksheet
    # Read every cell into a nested Python list (one inner list per row).
    data = [[sheet.cell_value(r, col)
             for col in range(sheet.ncols)]
            for r in range(sheet.nrows)]

    # Show the value at row 3, column 2 of the list we just built.
    print("\nList Comprehension")
    print("data[3][2]:", data[3][2])

    # Walk the sheet cell by cell; print every cell of row 50 on one line.
    print("\nCells in a nested loop:")
    for row in range(sheet.nrows):
        for col in range(sheet.ncols):
            if row == 50:
                print(sheet.cell_value(row, col), end=" ")

    ### other useful methods:
    print("\nROWS, COLUMNS, and CELLS:")
    print("Number of rows in the sheet:", sheet.nrows)
    # cell_type codes: 2 means float, 3 means date.
    print("Type of data in cell (row 3, col 2):", sheet.cell_type(3, 2))
    print("Value in cell (row 3, col 2):", sheet.cell_value(3, 2))
    print("Get a slice of values in column 3, from rows 1-3:")
    # start_rowx is inclusive, end_rowx is exclusive: rows 1, 2 and 3.
    print(sheet.col_values(3, start_rowx=1, end_rowx=4))

    print("\nDATES:")
    print("Type of data in cell (row 1, col 0):", sheet.cell_type(1, 0))
    # In .xls files dates are stored as floats; decode with xldate_as_tuple.
    exceltime = sheet.cell_value(1, 0)
    print("Time in Excel format:", exceltime)
    print("Convert time to a Python datetime tuple, from the Excel float:",
          xlrd.xldate_as_tuple(exceltime, 0))
    return data
# Read the whole sheet and keep it for further exploration.
data = parse_file(datafile)
4.练习:读取excel文件
import xlrd
import pprint
# Hourly ERCOT load workbook analysed below.
datafile = '2013_ERCOT_Hourly_Load_Data.xls'
def parse_file(datafile):
    """Find the max/min Coast load and their timestamps in the workbook.

    Returns a dict with keys 'maxtime'/'mintime' (date tuples from
    xldate_as_tuple), 'maxvalue'/'minvalue' (floats) and 'avgcoast'
    (average of column 1, the COAST load).
    """
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)

    # Column 1 holds the COAST load; skip the header row.
    # (The original also built a full list-of-lists copy of the sheet here,
    # but that result was never used, so the expensive pass is dropped.)
    cv = sheet.col_values(1, start_rowx=1, end_rowx=None)
    maxval = max(cv)
    minval = min(cv)
    # +1 maps a position inside cv back to a sheet row index, because cv
    # starts at sheet row 1.
    maxpos = cv.index(maxval) + 1
    minpos = cv.index(minval) + 1
    # Column 0 holds the timestamp as an Excel float; decode to a tuple.
    realmaxtime = xlrd.xldate_as_tuple(sheet.cell_value(maxpos, 0), 0)
    realmintime = xlrd.xldate_as_tuple(sheet.cell_value(minpos, 0), 0)

    return {
        'maxtime': realmaxtime,
        'maxvalue': maxval,
        'mintime': realmintime,
        'minvalue': minval,
        'avgcoast': sum(cv) / float(len(cv)),
    }
# Parse the workbook and pretty-print the summary dict.
data = parse_file(datafile)
pprint.pprint(data)
5.探索json
5.1主函数
import json
import requests
# MusicBrainz web-service endpoints used by the query helpers below.
BASE_URL = "http://musicbrainz.org/ws/2/"
ARTIST_URL = BASE_URL + "artist/"
# query parameters are given to the requests.get function as a dictionary; this
# variable contains some starter parameters.
query_type = { "simple": {},
               "atr": {"inc": "aliases+tags+ratings"},
               "aliases": {"inc": "aliases"},
               "releases": {"inc": "releases"}}
def query_site(url, params, uid="", fmt="json"):
    """Make a query to the MusicBrainz API and return the parsed JSON.

    Args:
        url: base endpoint, e.g. ARTIST_URL.
        params: dict of query parameters (mutated: 'fmt' is set here).
        uid: optional entity id appended to the url.
        fmt: response format requested from the API.

    Raises requests.HTTPError on a non-OK status code.
    """
    params["fmt"] = fmt
    r = requests.get(url + uid, params=params)
    print("requesting", r.url)
    if r.status_code == requests.codes.ok:
        return r.json()
    else:
        r.raise_for_status()
def query_by_name(url, params, name):
    """Attach an artist-name search term to the parameters, then query.

    Mutates *params* by setting its "query" key before delegating to
    query_site above.
    """
    search_term = "artist:" + name
    params["query"] = search_term
    return query_site(url, params)
def pretty_print(data, indent=4):
    """Print query output readably.

    Dicts are dumped as sorted, indented JSON; anything else is printed
    as-is.
    """
    # isinstance (not type ==) so dict subclasses are formatted too.
    if isinstance(data, dict):
        print(json.dumps(data, indent=indent, sort_keys=True))
    else:
        print(data)
def main():
    """
    Example investigation: query MusicBrainz for bands named Nirvana,
    inspect the 4th result, then list that band's releases.

    HINT: the output is a multi-level JSON document; step through it one
    level at a time (or copy it to a file) to learn its structure.
    """
    # Query for information in the database about bands named Nirvana
    results = query_by_name(ARTIST_URL, query_type["simple"], "Nirvana")
    pretty_print(results)

    # Isolate information from the 4th band returned (index 3)
    print("\nARTIST:")
    pretty_print(results["artists"][3])

    # Query for releases from that band using the artist_id
    artist_id = results["artists"][3]["id"]
    artist_data = query_site(ARTIST_URL, query_type["releases"], artist_id)
    releases = artist_data["releases"]

    # Print information about releases from the selected band
    print("\nONE RELEASE:")
    pretty_print(releases[0], indent=2)

    print("\nALL TITLES:")
    for title in [r["title"] for r in releases]:
        print(title)
# Run the demo queries only when executed as a script, not on import.
if __name__ == '__main__':
    main()
5.2练习
5.2.1 how many bands named 'FIRST AID KIT'?
# 5.2.1: count the artists named "First Aid Kit" in the search results.
results_1 = query_by_name(ARTIST_URL, query_type["simple"], "FIRST AID KIT")
pretty_print(results_1)
n = 0
for artist in results_1['artists']:
    if artist['name'].lower() == 'first aid kit':
        n += 1
print(n)
5.2.2 begin-area name for queen?
# 5.2.2: print the begin-area name for the band Queen (when present).
results_2 = query_by_name(ARTIST_URL, query_type["simple"], "QUEEN")
for artist in results_2['artists']:
    if artist['name'].lower() == 'queen' and 'begin-area' in artist:
        print(artist['begin-area']['name'])
5.2.3 spanish alias for beatles?
# 5.2.3: find the Spanish ('es' locale) alias for The Beatles.
results_3 = query_by_name(ARTIST_URL, query_type["aliases"], "BEATLES")
# List the returned artists so the right index (7) can be identified.
for i, artist in enumerate(results_3['artists']):
    print(i, artist['name'])
beatles_aliases = results_3['artists'][7]['aliases']
for alias in beatles_aliases:
    print(alias['locale'])
for alias in beatles_aliases:
    if alias['locale'] == 'es':
        print(alias['name'])
5.2.4 nirvana disambiguation?
# 5.2.4: print the disambiguation string for the US band named Nirvana.
results_4 = query_by_name(ARTIST_URL, query_type["simple"], "NIRVANA")
for artist in results_4['artists']:
    if artist['name'] == 'Nirvana' and artist.get('country') == 'US':
        print(artist['disambiguation'])
5.2.5 when was one direction formed ?
# 5.2.5: print the year One Direction was formed (life-span begin).
results_5 = query_by_name(ARTIST_URL, query_type["simple"], "one direction")
for artist in results_5['artists']:
    if artist['name'].lower() == 'one direction' and 'life-span' in artist:
        print(artist['life-span']['begin'])
6.习题集
6.1 使用csv模块
csv文件第一行如下:
745090,"MOUNTAIN VIEW MOFFETT FLD NAS",CA,-8.0,37.400,-122.050,12
第一行既不是数据项,也不是标题。它是用来描述数据来源的行。你应该从该行中提取气象站名称。
数据应该返回为包含列表的列表(而不是字典)。可以使用模块“reader”方法获取此类格式的数据。另一个有用的方法是next(),可以用来获取迭代器中的下一行。
import csv
# Weather-station CSV whose first row is metadata, second is the header.
datafile = "745090.csv"
def parse_file(datafile):
    """Parse a weather-station CSV whose first line is a metadata row.

    The first line describes the data source and its field 1 is the
    station name. The second line is the column header and is skipped.
    Every remaining line is kept as a list of string fields.

    Returns a (name, data) tuple: station name and a list of row lists.
    """
    name = ""
    data = []
    # newline='' is the recommended mode for the csv module on Python 3
    # ('rb' only worked on Python 2).
    with open(datafile, 'r', newline='') as f:
        csv_reader = csv.reader(f)
        # next() (builtin), not .next(): the method was removed in Python 3.
        name = next(csv_reader)[1]  # station name from the metadata row
        next(csv_reader)            # discard the header row
        for line in csv_reader:
            data.append(line)
    # Do not change the line below
    return (name, data)
# Returns (station_name, data_rows); result not captured here.
parse_file(datafile)
6.2 Excel 至 CSV
计算以下每个地区的最大加载时间和值:COAST、EAST、FAR_WEST、NORTH、NORTH_C、SOUTHERN、SOUTH_C、WEST,并将结果写到 csv 文件中,使用竖线字符“|”作为分隔符。
import xlrd
def parse_file(datafile):
    """Collect the peak load and its time for each regional column (1-8).

    Returns {station_name: {"maxval": float,
                            "maxtime": (year, month, day, hour, min, sec)}}.
    """
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)
    data = {}
    # Columns 1-8 are the regional load columns; column 0 is the timestamp.
    for n in range(1, 9):
        station = sheet.cell_value(0, n)
        cv = sheet.col_values(n, start_rowx=1, end_rowx=None)
        maxval = max(cv)
        # +1 maps the position in cv back to a sheet row (header skipped).
        maxpos = cv.index(maxval) + 1
        maxtime = sheet.cell_value(maxpos, 0)
        realtime = xlrd.xldate_as_tuple(maxtime, 0)
        data[station] = {"maxval": maxval,
                         "maxtime": realtime}
    print(data)
    return data
def save_file(data, filename):
    """Write each station's peak time and load to a '|'-delimited CSV.

    *data* is the dict produced by parse_file above:
    {station: {"maxval": ..., "maxtime": (y, m, d, h, mi, s)}}.
    """
    # newline='' keeps csv.writer from emitting extra blank lines on
    # Windows (Python 3 csv requirement).
    with open(filename, "w", newline='') as f:
        w = csv.writer(f, delimiter='|')
        w.writerow(["Station", "Year", "Month", "Day", "Hour", "Max Load"])
        for station, info in data.items():
            year, month, day, hour, _, _ = info["maxtime"]
            w.writerow([station, year, month, day, hour, info["maxval"]])
# Parse the workbook ONCE and write the per-region maxima out as a
# '|'-delimited CSV. (The original parsed the same .xls file twice and
# discarded the first result.)
data = parse_file('2013_ERCOT_Hourly_Load_Data.xls')
save_file(data, '2013_Max_Loads.csv')
网友评论