美文网首页
python基础

python基础

作者: maocy | 来源:发表于2017-06-06 13:58 被阅读0次

从py文件导入模块,保证目录正确(spyder set console working directory)

from mdlp import MDLP
var=MDLP()

一份非常好的Matplotlib 教程
21天pandas入门手册
python中pandas库中DataFrame对行和列的操作使用方法

dataframe 基本

http://blog.csdn.net/claroja/article/details/71598572

sai

http://nbviewer.jupyter.org/github/Sai628/jupyter-notebook/blob/master/Python/000_pandas-tutorial.ipynb

#变量类型
train.info()
data.dtypes
data['date']=data['date'].astype(np.int64)
data['date'] = data['date'].str.replace('%','').astype(np.float64)
#特定类型统一操作
def ob_float(data):
    """Convert percent-string columns of *data* to float64, in place.

    Every object-dtype column except the first one has its '%' signs
    stripped and is cast to float64. The (modified) frame is returned.
    """
    object_cols = data.dtypes[data.dtypes == 'object'].index
    # the first object column is skipped (presumably an ID column — kept as-is)
    for name in object_cols[1:]:
        stripped = data[name].str.replace('%', '')
        data[name] = stripped.astype(np.float64)
    return data
#查看唯一值
np.unique(data_new.num)
#百分比
data20wsum.describe(percentiles=[.10, .20, .30,0.40])

data_last32.mean()
#众数
data_last32.mode().iloc[0]
import pandas as pd
import numpy as np
df.head()
data_last3_mindata.shape[1]
data_last3_mindata.shape[0]

#pyecharts作图
pddt.values
pddt.index

#注意数据格式
datatest是2*5的数据框
v3=datatest.values[1]   #float 长度5 
v2 = [[0.94, 0.79, 0.92, 0.93, 0.69]] #list长度1

#一个list 一个float 但是长度一致 √
v4=[5, 20, 36, 10, 75, 90]
v5=pddt.values[1]


#列:
df['A']                                                                      
df.loc[:,['A','B']]    
data.loc[(data["Gender"]=="Female") & (data["Education"]=="Not Graduate") & (data["Loan_Status"]=="Y"),
         ["Gender","Education","Loan_Status"]]

data['date']=data['num'].apply(lambda x: x[3:6])
#单列用作index  
ValueError: Buffer has wrong number of dimensions (expected 1, got 2)
错误 train=data.iloc[train_index,:]  #即使只有一列,也不能直接把df做为索引
正确 train=data.iloc[train_index.Resample1,:] 
             
#行:
df[0:3]     
df['20130102':'20130104']                                        
 df.iloc[3]          
#一列
data_last['C7'].iloc[1]

#####iloc 与loc
#选取特定index
data['num'].loc[select_index]
data522a523.loc[3192576430792]
#注意,iloc是将序列当作数组来访问,下标又会从0开始:
 data['num'].iloc[[2,3,4]]


删除pandas DataFrame的某一/几列:
方法一:直接del DF['column-name']
方法二:采用drop方法,有下面三种等价的表达式:
1. DF= DF.drop('column_name', 1);
2. DF.drop('column_name',axis=1, inplace=True)
3. DF.drop(DF.columns[[0, 1, 3]], axis=1,inplace=True)   # Note: zero indexed
注意:凡是会对原数组作出修改并返回一个新数组的,往往都有一个 inplace可选参数。如果手动设定为True(默认为False),那么原数组直接就被替换。也就是说,采用inplace=True之后,原数组名(如2和3情况所示)对应的内存值直接改变;而采用inplace=False之后,原数组名对应的内存值并不改变,需要将新的结果赋给一个新的数组或者覆盖原数组的内存位置(如1情况所示)。

df.iloc[3:5,0:2]
#满足条件:
df[df.A>0]   
df[df>0]

#针对series
data_model523=data_copy['sum']
data_copy_unsatisf_mode523=data_model523[data_model523<65 ]

data_last3_min21['U26'][(data_last3_min21['U26']>test[4]) & (data_last3_min21['U26']<=test[5])]=-6

[pandas多条件行数据筛选](http://www.cnblogs.com/ppqchina/p/6269008.html)
print df[(df['PCTL']<0.95) & (df['PCTL']>0.05)]
如果是要对element-wise作判断,考虑用df.map
如果是行列运算考虑用df.apply + lambda

data20wsum=data20wsum.sort_values()

obj4 = obj3.reset_index(drop=True) 
#错误方式
df_train=df_train.reindex()
columns = data_last_res.columns.values.tolist()
columns.index('C7')

dataframe常见操作

###len(x)
data.groupby('date').count()['num']
data.groupby('date').count()['num'].plot(kind='bar')
data_20W_maxdata=data_20W_max.groupby('num').min(skipna=True)
data_last=data_new.groupby('num').mean() #mean不要有(skipna=True)

##group min不足 一列,多列排序后保留唯一值
data = data_k.sort_values(['num', 'sum'], ascending=[0, 0])
data_3= data.drop_duplicates(['num'])


#分组条件统计 可以先条件后统计 特别注意cci_all.drop(cci_all[cci_all['sum']<30].index,0)的index都是唯一值(合并dataframe组成的要重置)
cci_all_date=cci_all.groupby('date').count()['num']

cci_all_s= cci_all.drop(cci_all[cci_all['sum']<30].index,0)
cci_all_s_num=cci_all_s.groupby('date').count()['num']
cci_all_res = pd.DataFrame({'all':cci_all_date,'comp':cci_all_s_num})
cci_all_res['satis']=cci_all_res['all'] - cci_all_res['comp']
cci_all_res['accuracy']=cci_all_res['satis']/cci_all_res['all']


data= data.drop(data[data['date']<500].index,0)
data=data.reset_index(drop=True)
注意:
train2=df2.drop(['Pclass','Sex','Cabin','Embarked','Name','Ticket','PassengerId'], axis=1, inplace=True) 
删除列前面不可以写=
df2.drop(['Pclass','Sex','Cabin','Embarked','Name','Ticket','PassengerId'], axis=1, inplace=True) 

但是filter可以
train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
比drop的好处就是 df可以复用

data_complain=df_train.merge(data, on='num',how='left')
#series合并为df
c = pd.DataFrame([a,b])
c = pd.DataFrame({'a':a,'b':b})
#以某一列为索引
data_20W=data_20W.set_index('num')
#获取索引  list方便显示
testindex=list(data_last_res.index)
testindex=data_last_res.index  #set 方便交集等运算

#抽样
data_last_res=data_last_res.sample(frac=1,random_state=123).reset_index(drop=True)  

 import copy
 origin = [1, 2, [3, 4]]
#origin 里边有三个元素:1, 2,[3, 4]
 cop1 = copy.copy(origin)
 cop2 = copy.deepcopy(origin)
#透视表
http://python.jobbole.com/81212/
  #Determine pivot table
impute_grps = data.pivot_table(values=["LoanAmount"], index=["Gender","Married","Self_Employed"], aggfunc=np.mean)
print impute_grps
pd.crosstab(data["Credit_History"],data["Loan_Status"],margins=True)
def percConvert(ser):
    """Divide every entry of *ser* by its last entry (the 'All' margin).

    Intended for pd.crosstab(..., margins=True).apply(percConvert, axis=1)
    to turn the margin counts into row-wise proportions.
    """
    # ser[-1] relied on positional-fallback indexing, which modern pandas
    # removed for labelled indexes; .iloc[-1] is the explicit positional form.
    return ser / float(ser.iloc[-1])
pd.crosstab(data["Credit_History"],data["Loan_Status"],margins=True).apply(percConvert, axis=1)

不断添加dataframe

data_k=pd.DataFrame()
data_k= pd.concat([data_k,data_complain_531],ignore_index=True)
data_k= pd.concat([data_k,data_complain_531],axis=1)  #横着连
[【pandas】[3] DataFrame 数据合并,连接(merge,join,concat)](http://blog.csdn.net/you_xian/article/details/51498026)
DataFrame.mean(axis=0,skipna=True) 
data_num1['C12'].min(skipna=True)
data_score['sum']=data_score0.sum(1,skipna=True)

读存数据

长数字作为字符读入
data0=pd.read_csv('%s\%s'%(root,fn),usecols=[0,27],dtype={'num':str})
encoding='gbk'
read_csv只是把数据读作df,和读入文件格式无关,
target_url = ("https://archive.ics.uci.edu/ml/machine-learning-"
"databases/undocumented/connectionist-bench/sonar/sonar.all-data")
rocksVMines = pd.read_csv(target_url,header=None, prefix="V")
wine = pd.read_csv(target_url,header=0, sep=";")

data = urllib2.urlopen(target_url)
abalone = pd.read_csv(target_url,header=None, prefix="V")
列名
abalone.columns = ['Sex', 'Length', 'Diameter', 'Height', 'Whole weight',
'Shucked weight', 'Viscera weight', 'Shell weight',
'Rings']

pandas.read_csv参数详解

#读取指定列,使用这个参数可以加快加载速度并降低内存消耗。
data = pd.read_csv('data.csv',usecols=[0,1,2,3])
data=pd.read_csv('C:/Users/maojh/Documents/jm/data_score_rate_yidong2.csv',usecols=range(6)) 
#index=False 首列1234
data_last.to_csv('C:/Users/maojh/Documents/jm/model_data/data522_09042.csv',index=False) 
data_last.to_csv('C:/Users/maojh/Documents/jm/pd.csv') 

# python xxx.py inputfilename.csv 
input_path = sys.argv[1]
output_path = input_path + 'result.csv'
data0=pd.read_csv(input_path)
data['sum'].to_csv(output_path, index=False)

##重复数据
#R语言
rep(c(1,3),each=4)
#python
 lis = ["A", "B"]
 times = (2, 3)
 sum(([x]*y for x,y in zip(lis, times)),[])
['A', 'A', 'B', 'B', 'B']

list(chain.from_iterable([[num]*3 for num in data]))

from itertools import chain, izip, starmap
from operator import mul
list(chain.from_iterable(starmap(mul, izip(lis, times))))

####每列唯一值及其个数


#批量
import os
source = "C:\\Users\\maojh\\Documents\\jm\\csv"
lst0=[]
per_test0=[]
for root, dirs, files in os.walk(source):
    for fn in files :
       data0=pd.read_csv('%s\\%s'%(root,fn),header=None,skiprows=1,names=name)
       data0['date_string']=fn[0:3]
       data0=ob_float(data0)
       lst0.append(data0)
       data0_1=data0.iloc[:,1:]
       data0_percentest=data0_1.max()   #检查百分数还是小数 
       per_test0.append(data0_percentest)

np.save("C:\\Users\\maojh\\Documents\\jm\\jm725_5_data_new.npy",data_new)
numpy.load("filename")

读取json文件

f=open('C:/Users/xx/Documents/heat.json','r')
histdataM1=json.loads(f.read())
f.close()
heat=histdataM1['data']
import pandas
#list转dataframe
test2=pandas.DataFrame(heat, columns=['lon', 'lat', 'number'])
test2['number'].quantile(0.2)

python/pandas/Numpy数据分析-统计描述,唯一值,值计数

排序

df.sort_values(['high', '日序'], ascending=[True, False])

百分数等数字object转换

https://stackoverflow.com/questions/28277137/how-to-convert-datatypeobject-to-float64-in-python

data_complain_514['C5'] = data_complain_514['C5'].str.replace('%','').astype(np.float64)

spyder绘图单独显示

tools-preference-ipython/python console-graphics-backend-automatic

editor background

tools-preference-syntax

python/pandas数据挖掘(十四)-groupby,聚合,分组级运算

df.groupby('key1').mean()
def min_max(data):
    """Min-max scale a Series into [0, 1], leaving NaN entries as NaN.

    Fix over the original: the caller's Series is no longer mutated in
    place — a new Series is returned, which is what .apply() expects.
    A constant column (max == min) maps every non-null value to 1.
    """
    max_data = data.max(skipna=True)
    min_data = data.min(skipna=True)
    if max_data == min_data:
        # constant column: every non-null value becomes 1, NaN stays NaN
        result = data.copy()
        result[result.notnull()] = 1
        return result
    # vectorised arithmetic propagates NaN, so missing values stay missing
    return (data - min_data) / (max_data - min_data)

#test=min_max(data_num1['C2'])
data_num1_minmax=data_num1.iloc[:,2:60].apply(min_max)

#重复项
x, idx = np.unique(data.num, return_index=True)
idx_data=range(7137)

get_double=[]
for i in idx_data:
    if i not in idx:
        get_double.append(i)

#缺失值统计
#Create a new function:
def num_missing(x):
    """Return the number of missing (null/NaN) values in *x*."""
    # vectorised count at C speed instead of a Python-level sum() over
    # the boolean Series; int() keeps the return a plain Python int
    return int(x.isnull().sum())

#Applying per column:
print "Missing values per column:"
print data.apply(num_missing, axis=0) #axis=0 defines that function is to be applied on each column

#Applying per row:
print "\nMissing values per row:"
print data.apply(num_missing, axis=1).head() #axis=1 defines that function is to be applied on each row

#缺失统计

def sum_na(data):
    """Count the missing (null/NaN) values in *data*."""
    null_mask = data.isna()
    return null_mask.sum()

data_new['column_na']= data_new.apply(sum_na,axis=1)
data_new['count_na']=1
miss_data_row=pd.pivot_table(data_new,index=["column_na"],values=["count_na"],aggfunc=np.sum)

miss_data_row.sort_index(ascending=False)
miss_data_row.sort_values(by='count_na',ascending=False).head(20)


total= df_train.isnull().sum().sort_values(ascending=False)
percent = (df_train.isnull().sum()/df_train.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total','Percent'])
missing_data.head(20)

df_train= df_train.drop((missing_data[missing_data['Total'] > 50000]).index,1)  

sum_na_col=data_last.apply(lambda x: sum(x.isnull()),axis=0) 
per_na_col=sum_na_col/data_last.shape[0]
sum_na_row=data_last.apply(lambda x: sum(x.isnull()),axis=1) 
#每列唯一值
data_k.apply(lambda x: len(x.unique()),axis=0) 

plt.hist(sum_na_row)
sum_na_row=sum_na_row.tolist()

count_na_row = {k:sum_na_row.count(k) for k in set(sum_na_row)}
#缺失值众数填充
data['Gender'].fillna(mode(data['Gender'])[0][0], inplace=True)
#保存模型
import statsmodels.api as sm
# 指定作为训练变量的列,不含目标列`admit`
train_cols = data.columns[1:]
logit = sm.Logit(data['aim'], data[train_cols])
# 拟合模型
result = logit.fit()
# 查看数据的要点
print result.summary()

from sklearn.externals import joblib
joblib.dump(result, 'lr.model')
result = joblib.load('lr.model')

法二:
https://www.zhihu.com/question/31604690
#区间替代
data['d']=data['c'].map(lambda x: 0 if x<5 else (1 if x<10 else 2))
#交集等操作  x需针对set
index522=data_copy_unsatisf_mode522.index
index523=data_copy_unsatisf_mode523.index

index522and523=index522 & index523
index522and523=list(index522and523)
#差集
index522only=list(index522.difference(index523))

1. 获取两个list 的交集
#方法一:
a=[2,3,4,5]
b=[2,5,8]
tmp = [val for val in a if val in b]
print tmp
#[2, 5]
#方法二
list(set(a).intersection(set(b)))
2. 获取两个list 的并集
list(set(a).union(set(b)))
3.获取两个 list 的差集
list(set(b).difference(set(a))) # b中有而a中没有的

#变量类型
for i, row in colTypes.iterrows(): #i: dataframe index; row: each row in series format
  if row['feature']=="category":
    data[row['feature']]=data[row['feature']].astype(object)
  elif row['feature']=="cont":
    data[row['feature']]=data[row['feature']].astype(np.float64)
print data.dtypes

python数据分析入门学习笔记

返回value_counts

source = "E:\CCI_minmax"

data_k=pd.DataFrame()
for root, dirs, files in os.walk(source):
    for fn in files:
        data0=pd.read_csv('%s\%s'%(root,fn),usecols=range(1,27))
        column_names=data0.columns.tolist()
        data_date=pd.DataFrame()
        for col_name in column_names:
            val_count=data0[col_name].value_counts()
            data_date=pd.concat([data_date,val_count.sort_index()])
        data_date.columns=[fn[0:6]]
        data_k=pd.concat([data_k,data_date],axis=1)

相关文章

网友评论

      本文标题:python基础

      本文链接:https://www.haomeiwen.com/subject/dhigfxtx.html