1.1 利用Python进行数据分析

作者: 彭健平6点30 | 来源:发表于2017-03-24 21:43 被阅读625次
    %pwd#显示路径
    
    '/Users/zhongyaode/pythonbook'
    
    #读取文件中的第一行数据
    
    path = '/Users/zhongyaode/pythonbook/ch02/usagov_bitly_data2012-03-16-1331923249.txt'
    
    open(path).readline()
    
    '{ "a": "Mozilla\\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\\/535.11 (KHTML, like Gecko) Chrome\\/17.0.963.78 Safari\\/535.11", "c": "US", "nk": 1, "tz": "America\\/New_York", "gr": "MA", "g": "A6qOVH", "h": "wfLQtf", "l": "orofrog", "al": "en-US,en;q=0.8", "hh": "1.usa.gov", "r": "http:\\/\\/www.facebook.com\\/l\\/7AQEFzjSi\\/1.usa.gov\\/wfLQtf", "u": "http:\\/\\/www.ncbi.nlm.nih.gov\\/pubmed\\/22415991", "t": 1331923247, "hc": 1331822918, "cy": "Danvers", "ll": [ 42.576698, -70.954903 ] }\n'
    
    #将json字符串转换成python字典对象。用json的模块及其loads函数逐行加载已经下载好的数据文件
    
    import json
    path = '/Users/zhongyaode/pythonbook/ch02/usagov_bitly_data2012-03-16-1331923249.txt'
    # Convert each JSON line of the downloaded data file into a Python dict.
    # A list comprehension is a concise way to apply the same operation
    # (here json.loads) to every item of a collection; iterating over an open
    # file handle yields its lines one by one, so `records` ends up as a list
    # of dicts.
    # The `with` block closes the file as soon as loading is done; the file is
    # decoded as UTF-8 (the standard JSON text encoding) explicitly instead of
    # relying on the platform's default encoding.
    with open(path, encoding='utf-8') as f:
        records = [json.loads(line) for line in f]
    
    

    import json
    path = 'ch02/usagov_bitly_data2012-03-16-1331923249.txt'
    # Parse one JSON document per line into a list of dicts. The `with` block
    # guarantees the file handle is closed once loading is done, and the file
    # is decoded as UTF-8 explicitly rather than with the platform default.
    with open(path, encoding='utf-8') as f:
        records = [json.loads(line) for line in f]

    records[0]
    
    {'a': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11',
     'al': 'en-US,en;q=0.8',
     'c': 'US',
     'cy': 'Danvers',
     'g': 'A6qOVH',
     'gr': 'MA',
     'h': 'wfLQtf',
     'hc': 1331822918,
     'hh': '1.usa.gov',
     'l': 'orofrog',
     'll': [42.576698, -70.954903],
     'nk': 1,
     'r': 'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',
     't': 1331923247,
     'tz': 'America/New_York',
     'u': 'http://www.ncbi.nlm.nih.gov/pubmed/22415991'}
    
    #现在只要以字符串形式给出想要访问的键就可以得到当前记录中相应的值
    records[0]['u']
    
    'http://www.ncbi.nlm.nih.gov/pubmed/22415991'
    
    records[2]['u']
    
    'http://boxer.senate.gov/en/press/releases/031612.cfm'
    
    records[0]['tz']
    
    'America/New_York'
    
    print(records[0]['tz'])
    
    America/New_York
    

    要得到数据集中最常出现的是哪个时区(即tz字段),有很多方法,下面用列表推导式取出一组时区

    time_zones=[rec['tz']for rec in records]

    #不是所有记录都有时区字段,只需在列表推导式末尾加上一个 if 'tz' in rec判断即可
    time_zones=[rec['tz'] for rec in records if 'tz' in rec]
    
    time_zones = [rec['tz'] for rec in records if 'tz' in rec]
    time_zones[:10]#只看前十个时区
    
    ['America/New_York',
     'America/Denver',
     'America/New_York',
     'America/Sao_Paulo',
     'America/New_York',
     'America/New_York',
     'Europe/Warsaw',
     '',
     '',
     '']
    
    #对时区进行计数,计数办法之一是在遍历时区的过程中将计数值保存在字典中
    def get_counts(sequence)
       counts={}
        for x in sequence:
            if x in counts:
                counts[x] +=1
            else:
                counts[x] =1
        return counts
    
      File "<ipython-input-43-62431215ac18>", line 2
        def get_counts(sequence)
                                ^
    SyntaxError: invalid syntax
    
    def get_counts(sequence):
        """Tally how many times each item occurs in ``sequence``.

        Returns a plain dict mapping every distinct item to its number of
        occurrences; keys appear in the order the items were first seen.
        """
        tally = {}
        for item in sequence:
            # dict.get supplies 0 for an item that has not been seen yet.
            tally[item] = tally.get(item, 0) + 1
        return tally
    
    from collections import defaultdict
    
    def get_counts2(sequence):
        """Tally the items of ``sequence`` using a ``defaultdict``.

        Returns a ``defaultdict(int)`` mapping each distinct item to its number
        of occurrences; looking up an item that never occurred yields 0.
        """
        tally = defaultdict(int)  # unseen keys start at int() == 0
        for item in sequence:
            tally[item] = tally[item] + 1
        return tally
    
    #对时区进行处理,只需将time_zones传入即可:
    
    counts = get_counts(time_zones)
    counts['America/New_York']
    
    1251
    
    len(time_zones)
    
    3440
    
    #想获得前10位的时区及其计数值,需要用到一些有关字典的处理技巧:
    
    def top_counts(count_dict, n=10):
        """Return the ``n`` largest entries of ``count_dict`` as (count, key) pairs.

        Parameters:
            count_dict: mapping of key -> count.
            n: maximum number of pairs to return (default 10).

        Returns:
            A list of ``(count, key)`` tuples sorted in ascending order, so the
            most frequent entry is last. Ties on count are ordered by key.
            Fewer than ``n`` pairs are returned when the mapping is smaller,
            and an empty list when ``n`` is zero or negative.
        """
        if n <= 0:
            # pairs[-0:] is the whole list (and a negative n would slice from
            # the front), so a non-positive n is answered explicitly.
            return []
        value_key_pairs = sorted((count, key) for key, count in count_dict.items())
        return value_key_pairs[-n:]
    
    top_counts(counts)
    
    [(33, 'America/Sao_Paulo'),
     (35, 'Europe/Madrid'),
     (36, 'Pacific/Honolulu'),
     (37, 'Asia/Tokyo'),
     (74, 'Europe/London'),
     (191, 'America/Denver'),
     (382, 'America/Los_Angeles'),
     (400, 'America/Chicago'),
     (521, ''),
     (1251, 'America/New_York')]
    
    from collections import Counter
    counts=Counter(time_zones)
    counts.most_common(10)
    
    [('America/New_York', 1251),
     ('', 521),
     ('America/Chicago', 400),
     ('America/Los_Angeles', 382),
     ('America/Denver', 191),
     ('Europe/London', 74),
     ('Asia/Tokyo', 37),
     ('Pacific/Honolulu', 36),
     ('Europe/Madrid', 35),
     ('America/Sao_Paulo', 33)]
    
    #用pandas对时区进行计数;DataFrame是pandas中最重要的数据结构,它用于将数据表示为一个表格
    
    from pandas import DataFrame,Series
    
    

    import pandas as pd;import numpy as np
    frame=DataFrame(records)
    frame

    frame['tz'][:10]
    
    0     America/New_York
    1       America/Denver
    2     America/New_York
    3    America/Sao_Paulo
    4     America/New_York
    5     America/New_York
    6        Europe/Warsaw
    7                     
    8                     
    9                     
    Name: tz, dtype: object
    
    #frame['tz']所返回的Series对象有一个value_counts方法,该方法可以让我们得到所需的信息
    

    tz_counts=frame['tz'].value_counts()
    tz_counts[:10]

    clean_tz=frame['tz'].fillna('Missing')
    
    clean_tz[clean_tz=='']='Unknown'
    
    tz_counts=clean_tz.value_counts()
    
    tz_counts[:10]
    
    America/New_York       1251
    Unknown                 521
    America/Chicago         400
    America/Los_Angeles     382
    America/Denver          191
    Missing                 120
    Europe/London            74
    Asia/Tokyo               37
    Pacific/Honolulu         36
    Europe/Madrid            35
    Name: tz, dtype: int64
    
    #利用counts对象的plot方法可得到一张水平条形图
    
    tz_counts[:10].plot(kind='barh', rot=0)
    
    <matplotlib.axes._subplots.AxesSubplot at 0x10b5b67f0>
    
    frame['a'][1]
    
    'GoogleMaps/RochesterNY'
    
    frame['a'][50]
    
    'Mozilla/5.0 (Windows NT 5.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2'
    
    frame['a'][51]
    
    'Mozilla/5.0 (Linux; U; Android 2.2.2; en-us; LG-P925/V10e Build/FRG83G) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'
    

    results = Series([x.split()[0] for x in frame.a.dropna()])
    results[:5]

    results.value_counts()[:8]

    cframe=frame[frame.a.notnull()]
    
    operating_system=np.where(cframe['a'].str.contains('Windows'),
                             'Windows','Not Windows')
    
    operating_system[:5]
    
    array(['Windows', 'Not Windows', 'Windows', 'Not Windows', 'Windows'], 
          dtype='<U11')
    
    by_tz_os=cframe.groupby(['tz',operating_system])
    
    agg_counts=by_tz_os.size().unstack().fillna(0)
    

    agg_counts[:10]

    #选出最常出现的时区,为了达到这个目的,根据agg_counts中的行数构造了一个间接索引数组
    
    indexer=agg_counts.sum(1).argsort(0)
    

    indexer[:10]

    #通过take按照这个顺序截取了最后10行
    

    count_subset=agg_counts.take(indexer)[-10:]
    count_subset

    count_subset.plot(kind='barh', stacked=True)
    
    <matplotlib.axes._subplots.AxesSubplot at 0x10de0aac8>
    
    normed_subset = count_subset.div(count_subset.sum(1), axis=0)
    normed_subset.plot(kind='barh', stacked=True)
    
    <matplotlib.axes._subplots.AxesSubplot at 0x10dee09e8>
    
    
    

    相关文章

      网友评论

        本文标题:1.1 利用Python进行数据分析

        本文链接:https://www.haomeiwen.com/subject/nbkqottx.html