美文网首页数据分析
Python回归分析——以爬取数据预测收入

Python回归分析——以爬取数据预测收入

作者: 发觉原来我只是250 | 来源:发表于2017-05-26 17:00 被阅读0次

import pandas as pd

from pandas import DataFrame

import numpy as np

import matplotlib.pyplot as plt

from scipy.stats import mode

from sklearn.cross_validation import train_test_split

from sklearn.svm import SVC

from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import precision_recall_curve

import os

import pylab as pl

from sklearn.linear_model import LinearRegression

from sklearn.linear_model import LogisticRegression

import statsmodels.api as sm

import statsmodels.formula.api as smf

import scipy.stats as stats

from scipy.stats import chisquare

#import seaborn as sns

# 数据分析师交流群:283296032

# Load the data: switch the working directory, read the headerless file
# whose fields are separated by the \001 control character, then label
# its ten columns.
os.chdir('D:\\model\\sal')

column_names = [
    'cv_id', 'gender', 'school', 'degree', 'major',
    'location', 'age', 'work_year', 'industry', 'salary',
]

sa = pd.read_csv('data.csv', sep='\001', encoding='utf-8', header=None)
sa.columns = column_names

# Data exploration: print the value frequencies of every object-dtype
# column, skipping the cv_id column.
for col in sa.columns:
    if sa[col].dtypes == 'object' and col != 'cv_id':
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(sa[col].value_counts())

# Clean the numeric fields: the two-character token \N (backslash + N) is
# replaced with NaN so that the columns can be converted to float.
# The literal is spelled '\\N' because a bare '\N' starts a named-unicode
# escape in a Python 3 string literal and does not compile; the runtime
# value is the same backslash + N that the Python 2 literal produced.
missing_token = '\\N'

for col in ('work_year', 'age', 'salary'):
    sa[col] = sa[col].replace(missing_token, np.nan).astype('float')

# Remove outliers: keep rows with age in [18, 40], work_year in [0, 22]
# and a gender value above 0. Comparisons against NaN evaluate to False,
# so rows with a missing age or work_year are dropped here as well.
age_ok = (sa['age'] >= 18) & (sa['age'] <= 40)
work_year_ok = (sa['work_year'] >= 0) & (sa['work_year'] <= 22)
gender_ok = sa['gender'] > 0

# .copy() makes sa1 an independent frame, so the column assignments below
# write to sa1 itself and do not raise pandas' SettingWithCopyWarning.
sa1 = sa[age_ok & work_year_ok & gender_ok].copy()

# Show what is left in the object-dtype columns after filtering.
for col in sa1.columns:
    if sa1[col].dtypes == 'object' and col != 'cv_id':
        print(sa1[col].value_counts())

# Convert the columns used by the analysis to float.
float_columns = [
    'gender', 'degree', 'school', 'location',
    'work_year', 'salary', 'major', 'industry',
]
for col in float_columns:
    sa1[col] = sa1[col].astype('float')

# Overall distribution. The result is printed because a bare expression
# shows nothing when the file runs as a script.
print(sa1.describe())

# Standard deviation and correlations are computed on the numeric columns
# only; current pandas raises on non-numeric columns instead of silently
# skipping them.
numeric_sa1 = sa1.select_dtypes(include=[np.number])

# Standard deviation of every numeric column.
print(numeric_sa1.std())

# Correlation analysis: matrix of pairwise correlation coefficients.
d1 = numeric_sa1.corr()
print(d1)

# One histogram per variable. NaN values are dropped first (salary can
# contain NaN after the missing-value replacement), and each figure is
# titled so the plots can be told apart.
hist_columns = [
    'gender', 'age', 'degree', 'school',
    'location', 'work_year', 'major', 'salary',
]
for col in hist_columns:
    plt.hist(sa1[col].dropna())
    plt.title(col)
    plt.show()

# Build dummy variables, fit an ordinary-least-squares model of salary on
# work_year and gender, and report the RMSE on a held-out split.
from sklearn import metrics  # kept from the original script; unused below
import numpy as np
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only ship the cross_validation module.
    from sklearn.cross_validation import train_test_split

# salary may hold NaN (the missing-value marker was mapped to NaN), and
# neither the fit nor the RMSE can use such rows.
model_data = sa1.dropna(subset=['salary', 'work_year', 'gender'])

# Dummy variables for gender, cast to float because current pandas
# returns bool columns, which statsmodels cannot turn into a numeric
# design matrix.
dummy_ranks1 = pd.get_dummies(model_data['gender'], prefix='gender').astype('float')
print(dummy_ranks1.head())

# Predictors: work_year plus every gender dummy except the first one.
# Dropping one level avoids perfect collinearity with the constant.
# salary is the target and must not appear among the predictors.
cols_to_keep = ['work_year']
x1 = model_data[cols_to_keep].join(dummy_ranks1.iloc[:, 1:])
x6 = sm.add_constant(x1)

y = model_data['salary']

# Split predictors and target together so the rows stay aligned.
x_train, x_test, y_train, y_test = train_test_split(
    x6, y, test_size=0.3, random_state=1)

# Ordinary least squares
est = sm.OLS(y_train, x_train).fit()
print(est.summary())

y_pred = est.predict(x_test)

print("%s %s" % (type(y_pred), type(x_test)))
print("%s %s" % (len(y_pred), len(x_test)))
print("%s %s" % (y_pred.shape, x_test.shape))

# Calculate RMSE by hand: compare predictions with the actual salaries.
# Converting to plain arrays gives positional alignment regardless of the
# index labels, and the divisor is the real number of test rows.
residuals = np.asarray(y_pred) - np.asarray(y_test)
sum_mean = np.sum(residuals ** 2)
sum_erro = np.sqrt(sum_mean / len(residuals))

print("RMSE by hand: %s" % sum_erro)

相关文章

网友评论

    本文标题:Python回归分析——以爬取数据预测收入

    本文链接:https://www.haomeiwen.com/subject/fcyxfxtx.html