美文网首页
数据提取基础(csv&xls&json)

数据提取基础(csv&xls&json)

作者: esskeetit | 来源:发表于2018-01-23 18:29 被阅读0次

1.解析csv文件

逐行阅读输入 datafile 文件,对于前 10 行(不包括标题),根据“,”拆分每行,然后为每行创建一个字典,键是字段的标题,值是该字段在该行中的值。函数 parse_file 应该返回一个字典列表,文件中的每个数据行是一个列表项。字段名称和值不应该包含多余的空白,例如空格或换行字符。可以使用 Python 字符串方法strip() 删除多余的空白。
对于这道练习,只需解析前 10 行数据,所以返回的列表应该有 10 项!

datafile = 'beatles-diskography.csv'


def parse_file(datafile):
    """Parse the first 10 data rows of a CSV file by hand.

    Reads the header line, then builds one dict per data row mapping
    stripped header names to stripped field values. Splitting on ','
    is deliberate for this exercise (does not handle quoted fields).

    Returns a list of at most 10 dicts, one per data row.
    """
    data = []
    # Text mode ('r'), not 'rb': we split str lines with a str separator.
    with open(datafile, 'r') as f:
        header = f.readline().split(',')
        for counter, line in enumerate(f):
            if counter == 10:  # the exercise asks for the first 10 rows only
                break
            fields = line.split(',')
            # Pair each field with its header; strip() drops stray
            # whitespace and trailing newlines from both.
            entry = {header[i].strip(): value.strip()
                     for i, value in enumerate(fields)}
            data.append(entry)
    return data


if __name__ == '__main__':
    parse_file(datafile)

2.使用csv模块

import csv
filename = 'beatles-diskography.csv'


def parse_file(filename):
    """Read every data row of a CSV file using csv.DictReader.

    Returns a list of dicts, one per data row, keyed by the header row.
    """
    data = []
    # The csv docs require text mode with newline='' (binary 'rb' breaks
    # DictReader under Python 3, which expects str lines).
    with open(filename, 'r', newline='') as f:
        reader = csv.DictReader(f)
        for row in reader:
            data.append(row)
    return data


if __name__ == '__main__':
    parse_file(filename)

3.xlrd简介

import xlrd
# Excel workbook to load (ERCOT hourly load data).
datafile = "2013_ERCOT_Hourly_Load_Data.xls"


def parse_file(datafile):
    """Tour of basic xlrd operations on an .xls workbook.

    Reads the whole first sheet into a nested list, then demonstrates
    cell/row/column access and Excel date conversion, printing as it
    goes (Python 2 print statements). Returns the nested list.
    """
    workbook = xlrd.open_workbook(datafile)                      # open the workbook
    sheet = workbook.sheet_by_index(0)                           # select the worksheet to process

    data = [[sheet.cell_value(r, col)                            # walk every row and column, reading the whole sheet into a Python list
                for col in range(sheet.ncols)] 
                    for r in range(sheet.nrows)]

    print "\nList Comprehension"                                 # value at row 3, column 2 of the list just built
    print "data[3][2]:",
    print data[3][2]

    print "\nCells in a nested loop:"                             # iterate the sheet one row at a time, moving across the columns
    for row in range(sheet.nrows):
        for col in range(sheet.ncols):
            if row == 50:                                        # when we reach row 50, print every value in it, one column at a time
                print sheet.cell_value(row, col),


    ### other useful methods:
    print "\nROWS, COLUMNS, and CELLS:"
    print "Number of rows in the sheet:", 
    print sheet.nrows                                             # number of rows in the worksheet
    print "Type of data in cell (row 3, col 2):", 
    print sheet.cell_type(3, 2)                                   # type code of one cell's value
                                                                  # 2 = float, 3 = date
    print "Value in cell (row 3, col 2):", 
    print sheet.cell_value(3, 2)                                 # the cell's value as an appropriate Python value
    print "Get a slice of values in column 3, from rows 1-3:"
    print sheet.col_values(3, start_rowx=1, end_rowx=4)           # slice three values from column 3,
                                                                  # from row 1 up to (but not including) row 4
                                                                  # -> three values from that column

    print "\nDATES:"
    print "Type of data in cell (row 1, col 0):", 
    print sheet.cell_type(1, 0)                                                        
    exceltime = sheet.cell_value(1, 0)                                                 # the date cell's value; .xls stores dates as floats
    print "Time in Excel format:",
    print exceltime
    print "Convert time to a Python datetime tuple, from the Excel float:",
    print xlrd.xldate_as_tuple(exceltime, 0)                                           # turn the Excel float into a time we can
                                                                                       # work with in Python: a (y, m, d, h, min, s) tuple
    return data

data = parse_file(datafile)

4.练习:读取excel文件

import xlrd
import pprint 

datafile = '2013_ERCOT_Hourly_Load_Data.xls'


def parse_file(datafile):
    """Summarize the COAST column (column 1) of the ERCOT spreadsheet.

    Returns a dict with the max/min load values, the times at which
    they occurred as (year, month, day, hour, minute, second) tuples,
    and the column average.
    """
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)

    # Column 1 holds the COAST load series; start_rowx=1 skips the header.
    cv = sheet.col_values(1, start_rowx=1, end_rowx=None)
    maxval = max(cv)
    minval = min(cv)

    # +1 converts a position in cv (which omits the header row) back to
    # the corresponding sheet row index.
    maxpos = cv.index(maxval) + 1
    minpos = cv.index(minval) + 1

    # Column 0 holds timestamps as Excel serial floats; datemode 0 per
    # xlrd convention for 1900-based workbooks.
    realmaxtime = xlrd.xldate_as_tuple(sheet.cell_value(maxpos, 0), 0)
    realmintime = xlrd.xldate_as_tuple(sheet.cell_value(minpos, 0), 0)

    return {
        'maxtime': realmaxtime,
        'maxvalue': maxval,
        'mintime': realmintime,
        'minvalue': minval,
        'avgcoast': sum(cv) / float(len(cv)),
    }


if __name__ == '__main__':
    data = parse_file(datafile)
    pprint.pprint(data)

5.探索json

5.1主函数

import json
import requests

# MusicBrainz web-service endpoints.
BASE_URL = "http://musicbrainz.org/ws/2/"
ARTIST_URL = BASE_URL + "artist/"


# query parameters are given to the requests.get function as a dictionary; this
# variable contains some starter parameters.
# Each entry maps a shorthand name to the "inc" parameter it adds to a query.
query_type = {  "simple": {},
                "atr": {"inc": "aliases+tags+ratings"},
                "aliases": {"inc": "aliases"},
                "releases": {"inc": "releases"}}


def query_site(url, params, uid="", fmt="json"):
    """
    This is the main function for making queries to the musicbrainz API. The
    query should return a json document.

    url: endpoint base (e.g. ARTIST_URL); uid: optional id appended to it.
    params: query-parameter dict; NOTE it is mutated ("fmt" is added).
    Returns the decoded JSON body on HTTP 200; otherwise raises via
    raise_for_status().
    """
    params["fmt"] = fmt
    r = requests.get(url + uid, params=params)
    print "requesting", r.url  # Python 2 print statement

    if r.status_code == requests.codes.ok:
        return r.json()
    else:
        r.raise_for_status()


def query_by_name(url, params, name):
    """Search the given endpoint for an artist called *name*.

    Adds a Lucene-style "artist:<name>" query to *params* (mutating the
    caller's dict) and delegates the request to query_site above.
    """
    params.update({"query": "artist:" + name})
    return query_site(url, params)


def pretty_print(data, indent=4):
    """
    After we get our output, we can use this function to format it to be more
    readable.

    Dicts are dumped as sorted, indented JSON; anything else is printed as-is.
    """
    if type(data) == dict:
        print json.dumps(data, indent=indent, sort_keys=True)
    else:
        print data


def main():
    """
    Below is an example investigation to help you get started in your
    exploration. Modify the function calls and indexing below to answer the
    questions on the next quiz.

    HINT: Note how the output we get from the site is a multi-level JSON
    document, so try making print statements to step through the structure one
    level at a time or copy the output to a separate output file. Experimenting
    and iteration will be key to understand the structure of the data!
    """

    # Query for information in the database about bands named Nirvana
    results = query_by_name(ARTIST_URL, query_type["simple"], "Nirvana")
    pretty_print(results)

    # Isolate information from the 4th band returned (index 3)
    print "\nARTIST:"
    pretty_print(results["artists"][3])

    # Query for releases from that band using the artist_id
    artist_id = results["artists"][3]["id"]
    artist_data = query_site(ARTIST_URL, query_type["releases"], artist_id)
    releases = artist_data["releases"]

    # Print information about releases from the selected band
    print "\nONE RELEASE:"
    pretty_print(releases[0], indent=2)

    # Collect just the titles of every release and print them one per line.
    release_titles = [r["title"] for r in releases]
    print "\nALL TITLES:"
    for t in release_titles:
        print t

if __name__ == '__main__':
    main()

5.2练习

5.2.1 how many bands named 'FIRST AID KIT'?

# Count how many of the returned artists are named "First Aid Kit"
# (case-insensitive).
results_1 = query_by_name(ARTIST_URL, query_type["simple"], "FIRST AID KIT")
pretty_print(results_1)
n=0
for result_1 in results_1['artists']:
    if result_1['name'].lower()=='first aid kit':
        n+=1
print n  # Python 2 print statement

5.2.2 begin-area name for queen?

# Print the begin-area name for the band Queen, skipping artists that
# lack the 'begin-area' key.
results_2 = query_by_name(ARTIST_URL, query_type["simple"], "QUEEN")
for result_2 in results_2['artists']:
    if result_2['name'].lower()=='queen' and 'begin-area' in result_2.keys():
        print result_2['begin-area']['name']

5.2.3 spanish alias for beatles?

# List every artist returned for "BEATLES" with its index, to locate
# The Beatles entry (index 7 below).
results_3 = query_by_name(ARTIST_URL, query_type["aliases"], "BEATLES")
result_3 = results_3['artists']
for i,value in enumerate(result_3):
    print(i,value['name'])

# Inspect the locale of each alias of artist #7.
for aliase in results_3['artists'][7]['aliases']:
    print aliase['locale']

# Print the alias whose locale is Spanish ('es').
for aliase in results_3['artists'][7]['aliases']:
    if aliase['locale'] =='es':
        print aliase['name']

5.2.4 nirvana disambiguation?

# Print the disambiguation string for the band Nirvana from the US.
results_4 = query_by_name(ARTIST_URL, query_type["simple"], "NIRVANA")
for result_4 in results_4['artists']:
    if result_4['name'] =='Nirvana' and 'country' in result_4.keys() and result_4['country']=='US':
        print result_4['disambiguation']

5.2.5 when was one direction formed ?

# Print the formation date (life-span begin) of One Direction.
results_5 = query_by_name(ARTIST_URL, query_type["simple"], "one direction")
for result_5 in results_5['artists']:
    if result_5['name'].lower()=='one direction' and 'life-span' in result_5.keys():
        print result_5['life-span']['begin']

6.习题集

6.1 使用csv模块

csv文件第一行如下:
745090,"MOUNTAIN VIEW MOFFETT FLD NAS",CA,-8.0,37.400,-122.050,12
第一行既不是数据项,也不是标题。它是用来描述数据来源的行。你应该从该行中提取气象站名称。
数据应该返回为包含列表的列表(而不是字典)。可以使用 csv 模块的 reader() 方法获取此类格式的数据。另一个有用的函数是内置的 next(),可以用来获取迭代器中的下一行。

import csv

datafile = "745090.csv"


def parse_file(datafile):
    """Parse a weather-station CSV file.

    Line 1 describes the data source; the station name is its second
    field. Line 2 is a header row and is skipped. Every remaining line
    becomes a list of string fields.

    Returns (name, data) where data is a list of row lists.
    """
    data = []
    # Text mode with newline='' as the csv docs require ('rb' breaks
    # csv.reader under Python 3).
    with open(datafile, 'r', newline='') as f:
        csv_reader = csv.reader(f)
        # next() builtin, not the Python-2-only .next() method.
        name = next(csv_reader)[1]  # station name: 2nd field of line 1
        next(csv_reader)            # skip the header row
        for line in csv_reader:
            data.append(line)
    # Do not change the line below
    return (name, data)


if __name__ == '__main__':
    parse_file(datafile)

6.2 Excel 至 CSV

计算以下每个地区的最大加载时间和值:COAST、EAST、FAR_WEST、NORTH、NORTH_C、SOUTHERN、SOUTH_C、WEST,并将结果写到 csv 文件中,使用竖线字符“|”作为分隔符。

import xlrd

def parse_file(datafile):
    """Find the peak load time and value for each of the 8 ERCOT regions.

    Columns 1-8 of the first sheet hold the regional load series;
    column 0 holds Excel serial timestamps. Returns a dict mapping
    station name to {"maxval": float, "maxtime": (y, m, d, h, min, s)}.
    """
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)
    data = {}
    for n in range (1, 9):
        station = sheet.cell_value(0, n)                     # region name from the header row
        cv = sheet.col_values(n, start_rowx=1, end_rowx=None)

        maxval = max(cv)
        maxpos = cv.index(maxval) + 1                        # +1: cv omits the header row
        maxtime = sheet.cell_value(maxpos, 0)                # Excel float timestamp
        realtime = xlrd.xldate_as_tuple(maxtime, 0)
        data[station] = {"maxval": maxval,
                         "maxtime": realtime}

    print data  # Python 2 print statement
    return data

def save_file(data, filename):
    """Write the per-station peak-load summary to a pipe-delimited CSV.

    One row per station: name, year/month/day/hour of the peak, and the
    peak load value, preceded by a header row.
    """
    header = ["Station", "Year", "Month", "Day", "Hour", "Max Load"]
    with open(filename, "w") as out:
        writer = csv.writer(out, delimiter='|')
        writer.writerow(header)
        for station, summary in data.items():
            timestamp = summary["maxtime"]
            # First four tuple fields are year, month, day, hour.
            row = [station] + list(timestamp[:4]) + [summary["maxval"]]
            writer.writerow(row)
            
# NOTE(review): parse_file runs twice here — the first call's result is
# discarded; only the second call's return value is saved to disk.
parse_file('2013_ERCOT_Hourly_Load_Data.xls')
save_file(parse_file('2013_ERCOT_Hourly_Load_Data.xls'),'2013_Max_Loads.csv')

相关文章

  • 数据提取基础(csv&xls&json)

    1.解析csv文件 逐行阅读输入 datafile 文件,对于前 10 行(不包括标题),根据“,”拆分每行,然后...

  • Python爬虫(概念通用爬虫和聚焦爬虫)

    Python基础语法学习(基础知识)HTML页面的内容抓取(数据抓取)HTML页面的数据提取(数据清洗)Scrap...

  • 用MongoDB进行数据整理(一)

    用MongoDB进行数据整理(一)数据提取基础 1.MongoDB简介 数据预处理 Data Wrangling:...

  • python-爬虫

    初识爬虫 网页基础 BeautifuleSoup 解析数据 提取数据 Tag 对象 CSS选择器 爬虫进阶

  • 数据仓库组件:Hive环境搭建和基础用法

    一、Hive基础简介 1、基础描述 Hive是基于Hadoop的一个数据仓库工具,用来进行数据提取、转化、加载,是...

  • 给女朋友写的生统资料_Part4

    上一节我们讲到了一些向量提取的操作。这一部分我们会讲一些数据框提取的操作。在R里面,数据框提取的基础操作跟向量很相...

  • 【分析篇】:Pandas像sql操作python 进行数据分析

    前言:Python数据分析的基础在于对数据的提取,清洗,汇总,并思考整理发现数据规律的过程。以下模块是个人在数据分...

  • <官网学大数据>Hive

    Hive简介 Hive是建立在Hadoop HDFS上的数据仓库基础架构 Hive可以用来进行数据提取转化加载(E...

  • 数据提取之JSON

    数据提取 什么是数据提取? 简单的来说,数据提取就是从响应中获取我们想要的数据的过程 数据分类 非结构化的数据:h...

  • CAD数据提取方法

    CAD数据提取方法 1.工具栏选择“注释”模块——选择“表格”——点击“数据提取”,弹出“数据提取”操作框弹出“数...

网友评论

      本文标题:数据提取基础(csv&xls&json)

      本文链接:https://www.haomeiwen.com/subject/lwwyoxtx.html