import pandas as pd
from pandas import DataFrame
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import mode
from sklearn.cross_validation import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_recall_curve
import os
import pylab as pl
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
import statsmodels.formula.api as smf
import scipy.stats as stats
from scipy.stats import chisquare
#import seaborn as sns
# Data analyst discussion group (数据分析师交流群): 283296032
## Load the data
# The working directory is switched first so that the relative file name
# below resolves inside the project folder.
os.chdir('D:\\model\\sal')

# The file has no header row (header=None) and uses the \001 control
# character as field separator, so column names are assigned afterwards.
column_names = [
    'cv_id',
    'gender',
    'school',
    'degree',
    'major',
    'location',
    'age',
    'work_year',
    'industry',
    'salary',
]
sa = pd.read_csv('data.csv', sep='\001', encoding='utf-8', header=None)
sa.columns = column_names
## Data exploration
# Print the frequency table of every string-typed (object) column except the
# record identifier 'cv_id'.
# print(...) with a single argument behaves the same on Python 2 and 3.
for col in sa.columns:
    if sa[col].dtypes == 'object' and col != 'cv_id':
        print(sa[col].value_counts())
## Replace missing-value markers
# The literal two-character text \N is treated as "missing" in these columns
# (presumably the NULL marker of the system that exported the file -- TODO
# confirm). It is replaced by NaN so the columns can be cast to float.
# The marker is spelled '\\N' because a bare '\N' is an invalid escape
# sequence (SyntaxError) on Python 3; both spellings are the same string on
# Python 2.
for col in ('work_year', 'age', 'salary'):
    sa[col] = sa[col].replace('\\N', np.nan).astype('float')
# Remove outliers in age / work_year / gender.
# Each step narrows the result of the previous one:
#   s1, s2 -> age within [18, 40]
#   s3, s4 -> work_year within [0, 22]
#   sa1    -> gender code greater than 0
# Rows whose age or work_year is NaN fail the comparison and are dropped too.
s1 = sa.loc[sa['age'] >= 18]
s2 = s1.loc[s1['age'] <= 40]
s3 = s2.loc[s2['work_year'] >= 0]
s4 = s3.loc[s3['work_year'] <= 22]
sa1 = s4.loc[s4['gender'] > 0]
# Re-check the frequency tables of the string-typed columns after filtering
# (the identifier column 'cv_id' is skipped).
for col in sa1.columns:
    if sa1[col].dtypes == 'object' and col != 'cv_id':
        print(sa1[col].value_counts())

# sa1 was produced by boolean filtering; take an explicit copy so the column
# assignments below write to an independent frame instead of triggering
# pandas' SettingWithCopyWarning.
sa1 = sa1.copy()

# Cast every modelling column to float. 'age' was already cast before
# filtering and 'cv_id' is left untouched.
for col in ('gender', 'degree', 'school', 'location',
            'work_year', 'salary', 'major', 'industry'):
    sa1[col] = sa1[col].astype('float')
# Overall distribution of the columns.
# NOTE: the result of a bare expression is only displayed in an interactive
# session; when run as a script this line produces no output.
sa1.describe()

# Standard deviation of each column.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(sa1.std())

# Correlation analysis: d1 is the correlation-coefficient matrix, i.e. the
# pairwise correlation between any two variables.
d1 = sa1.corr()
# Draw one histogram per variable, each in its own figure window, in the
# same order as before. pyplot is used throughout; pylab re-exports the same
# hist/show functions.
for hist_col in ('gender', 'age', 'degree', 'school',
                 'location', 'work_year', 'major', 'salary'):
    plt.hist(sa1[hist_col])
    plt.show()
# --- Dummy (indicator) variables -------------------------------------------
# One 0/1 column per gender code.
dummy_ranks1 = pd.get_dummies(sa1['gender'], prefix='gender')
print(dummy_ranks1.head())

cols_to_keep = ['salary', 'work_year']
# .loc replaces the removed .ix indexer; the label slice is unchanged.
# NOTE(review): this keeps every dummy column from 'gender_0.0' onwards; if
# that is all of them, they are collinear with the constant added below --
# confirm whether one level should be dropped.
x1 = sa1[cols_to_keep].join(dummy_ranks1.loc[:, 'gender_0.0':])

# Design matrix: the predictors plus an intercept column. The target
# 'salary' is removed here; leaving it among the regressors would make the
# model predict salary from salary. (The original code also referenced x6
# before it was ever defined.)
x6 = sm.add_constant(x1.drop('salary', axis=1))
y = sa1['salary']

# Split predictors AND target together so the rows stay aligned:
# 70% for training, 30% for testing, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x6, y, test_size=0.3, random_state=1)
d1 = (x_train, y_train)

# --- Ordinary least squares -------------------------------------------------
est = sm.OLS(y_train, x_train).fit()
# NOTE: only displayed in an interactive session; no output in a script.
est.summary()

y_pred = est.predict(x_test)
print("%s %s" % (type(y_pred), type(x_test)))
print("%s %s" % (len(y_pred), len(x_test)))
print("%s %s" % (y_pred.shape, x_test.shape))

# Kept for code that may follow this section; not used below.
from sklearn import metrics

# --- RMSE by hand -----------------------------------------------------------
# Compare the predictions with the held-out TARGET values (not with the
# predictor rows) and divide by the real number of test rows instead of a
# hard-coded count. Both sides are converted to plain arrays so the
# subtraction is positional whatever type predict() returns.
residuals = np.asarray(y_pred, dtype=float) - np.asarray(y_test, dtype=float)
sum_mean = np.sum(residuals ** 2)
sum_erro = np.sqrt(sum_mean / len(residuals))
print("RMSE by hand: %s" % sum_erro)
# 网友评论 ("reader comments" -- web-page residue from the original post)