```
import json
from collections import Counter
from collections import defaultdict

import numpy as np
import pandas as pd
from pandas import DataFrame, Series
path = 'example.txt'

# Read the file line by line, parsing each non-blank line as one JSON record.
# The context manager closes the handle, and JSON text is UTF-8 by
# specification, so the encoding is stated instead of left to the locale.
with open(path, encoding='utf-8') as source:
    records = [json.loads(line) for line in source if line.strip()]

# Collect the 'tz' value of every record that actually has that field.
time_zones = [rec['tz'] for rec in records if 'tz' in rec]
def get_counts(sequence):
    """Count how many times each item occurs in ``sequence``.

    Args:
        sequence: Iterable of hashable items (here, time-zone strings).

    Returns:
        A plain ``dict`` mapping each distinct item to its number of
        occurrences. An empty input yields an empty dict.
    """
    counts = {}
    for x in sequence:
        # dict.get supplies 0 for items that have not been seen yet.
        counts[x] = counts.get(x, 0) + 1
    return counts
#print(get_counts(time_zones))
#print(len(time_zones))
def get_counts2(sequence):
    """Count how many times each item occurs, using a ``defaultdict``.

    Args:
        sequence: Iterable of hashable items.

    Returns:
        A ``defaultdict(int)`` mapping each distinct item to its number of
        occurrences. Looking up an unseen key yields 0 and inserts it.
    """
    counts = defaultdict(int)  # every missing key starts at 0
    for x in sequence:
        counts[x] += 1
    return counts
def top_counts(count_dict, n=10):
    """Return the ``n`` highest-count entries of ``count_dict``.

    Args:
        count_dict: Mapping of key -> count.
        n: Maximum number of entries to return (default 10).

    Returns:
        A list of ``(count, key)`` tuples sorted in ascending order, so the
        largest count is last. Ties on count are ordered by key. Returns an
        empty list when ``n`` is zero or negative.
    """
    # Diagnostic output of the raw items, kept from the original script.
    print(count_dict.items())
    if n <= 0:
        # Without this guard, [-0:] is [0:] and would return every entry.
        return []
    # Put the count first so the tuples sort by count, then by key.
    value_key_pairs = [(count, tz) for tz, count in count_dict.items()]
    value_key_pairs.sort()
    return value_key_pairs[-n:]
# Top ten time zones using the hand-written counting helpers.
print(top_counts(get_counts(time_zones)))

# The same tally using collections.Counter.
counts = Counter(time_zones)
print(counts.most_common(10))

# The same tally using pandas.
frame = DataFrame(records)
tz_counts = frame['tz'].value_counts()
print(tz_counts.head(10))

# Label absent values 'Missing' and empty strings 'Unknown', then recount.
clean_tz = frame['tz'].fillna('Missing')
clean_tz.loc[clean_tz == ''] = 'Unknown'
tz_counts = clean_tz.value_counts()
print(tz_counts.head(10))

# Horizontal bar chart of the ten most frequent time zones.
tz_counts.head(10).plot(kind='barh', rot=0)
```
网友评论