Scikit-learn is a library that covers machine learning, data preprocessing, cross-validation, and visualization.
In Python, however, it is imported under the name sklearn.
e.g.:
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
It accepts data in the form of NumPy arrays, matrices, and pandas DataFrames.
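A quick sketch of the accepted input types (the values below are made-up toy data):
import numpy as np
import pandas as pd
X = np.array([[1.0, 2.0],
              [3.0, 4.0],
              [5.0, 6.0],
              [7.0, 8.0]])                     # feature matrix: rows = samples, columns = features
y = np.array([0, 0, 1, 1])                     # target vector, one label per sample
X_df = pd.DataFrame(X, columns=["f1", "f2"])   # an equivalent DataFrame also works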
1 Training and test data split
from sklearn.model_selection import train_test_split  # the old sklearn.cross_validation module has been removed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)
2 Data preprocessing
2.1 StandardScaler
from sklearn.preprocessing import StandardScaler
stand = StandardScaler()
Xtrain_stand = stand.fit_transform(X_train)   # fit the scaler on the training set and transform it
Xtest_stand = stand.transform(X_test)         # reuse the training-set statistics on the test set
2.2 Binarization
Turn a continuous variable y into a binary (0/1) variable.
from sklearn.preprocessing import Binarizer
binarizer = Binarizer(threshold=0.0).fit(y.reshape(-1, 1))  # a threshold must be chosen; 0 is used here
binary_y = binarizer.transform(y.reshape(-1, 1))            # transformers expect 2D input, hence the reshape
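A tiny worked example (with made-up values) showing the effect of the 0.0 threshold:
import numpy as np
from sklearn.preprocessing import Binarizer
y_demo = np.array([-1.5, 0.0, 0.7, 2.3]).reshape(-1, 1)   # toy values
Binarizer(threshold=0.0).fit_transform(y_demo)
# -> array([[0.], [0.], [1.], [1.]])  values strictly greater than 0 become 1, the rest 0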
2.3 Re-encoding categorical variables
Some variables are strings, but many machine learning algorithms only accept numeric (float) inputs, so these variables have to be re-encoded.
from sklearn.preprocessing import LabelEncoder
eco = LabelEncoder()
X["var"] = eco.fit_transform(X["var"])
Or use the equivalent pandas function:
weather2['Location'], _ = pd.factorize(weather2.Location)
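A short sketch with a made-up Location column, showing what the encoding produces:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
df = pd.DataFrame({"Location": ["Sydney", "Perth", "Sydney", "Cairns"]})   # toy data
eco = LabelEncoder()
codes = eco.fit_transform(df["Location"])
# codes -> array([2, 1, 2, 0]); classes are sorted: eco.classes_ -> ['Cairns', 'Perth', 'Sydney']
# eco.inverse_transform(codes) recovers the original strings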
2.4 Filling missing values
Missing values of a variable can be filled with its median, mean, mode, and so on; for time series, neighbouring time points can be used to interpolate or predict the missing values.
from sklearn.impute import SimpleImputer      # the old sklearn.preprocessing.Imputer has been removed
imp = SimpleImputer(missing_values=0, strategy='mean')  # treat 0 as missing and fill with the column mean
imp.fit_transform(X_train)
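A minimal sketch with made-up data, using NaN as the missing-value marker (the more common case than 0):
import numpy as np
from sklearn.impute import SimpleImputer
X_demo = np.array([[1.0, 2.0],
                   [np.nan, 3.0],
                   [7.0, np.nan]])            # toy data with two missing entries
imp_nan = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_nan.fit_transform(X_demo)
# -> [[1., 2.], [4., 3.], [7., 2.5]]  each NaN is replaced by its column mean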
2.5 Generating polynomial features
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(3)   # polynomial combinations up to degree 3
poly.fit_transform(X)
For example, if an input sample is two-dimensional and of the form [a, b], the degree-2 polynomial features are [1, a, b, a^2, ab, b^2].
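A quick check of that statement with a made-up two-feature sample:
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
X_demo = np.array([[2, 3]])                 # [a, b] with a=2, b=3
PolynomialFeatures(degree=2).fit_transform(X_demo)
# -> [[1., 2., 3., 4., 6., 9.]]  i.e. [1, a, b, a^2, ab, b^2]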
3 Building models
3.1 Supervised learning
# Linear regression
from sklearn.linear_model import LinearRegression
lr = LinearRegression()   # the normalize argument was removed in recent versions; standardise features with StandardScaler instead
# Support Vector Machines (SVM)
from sklearn.svm import SVC
svc = SVC(kernel='linear')
# Naive Bayes and KNN are built the same way; a short sketch follows
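For completeness, a sketch of those two classifiers (the knn object is also what the later sections assume; the hyperparameter value is just an example):
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
gnb = GaussianNB()
knn = KNeighborsClassifier(n_neighbors=5)   # example value; see section 7 for tuning n_neighbors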
3.2 Unsupervised learning
# PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=0.95)   # keep enough components to explain 95% of the variance
# K-means
from sklearn.cluster import KMeans
k_means = KMeans(n_clusters=3, random_state=123)
4 Model fitting
4.1 Supervised learning
lr.fit(X,y)
svc.fit(X_train, y_train)
# other estimators are fitted the same way
4.2 Unsupervised learning
pca_model = pca.fit_transform(X_train)
k_means.fit(X_train)
# other estimators are similar
5 Using the fitted model
5.1 Supervised learning
y_pred = lr.predict(X_test)         # predict y directly (regression: y is continuous)
y_pred = knn.predict(X_test)        # predict class labels (classification)
y_prob = knn.predict_proba(X_test)  # predict class probabilities (classification)
# other estimators are similar
5.2 Unsupervised learning
y_pred = k_means.predict(X_test)
6 Model evaluation
6.1 Classification
# Accuracy score
knn.score(X_test, y_test)  # the estimator's own score method (mean accuracy)
from sklearn.metrics import accuracy_score  # scoring function from the metrics module
accuracy_score(y_test, y_pred)
# classification report
from sklearn.metrics import classification_report  # reports precision, recall, f1-score, and support
print(classification_report(y_test, y_pred))
# confusion matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))
6.2 Regression
# mean absolute error
from sklearn.metrics import mean_absolute_error
y_true = [3, 5, 6, -1, -6]   # ground-truth values for the corresponding predictions
mean_absolute_error(y_true, y_pred)
# mean squared error
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, y_pred)
# R2 score
from sklearn.metrics import r2_score
r2_score(y_true, y_pred)
6.3 Clustering metrics
# Adjusted Rand Index
from sklearn.metrics import adjusted_rand_score
adjusted_rand_score(y_true, y_pred)
# Homogeneity
from sklearn.metrics import homogeneity_score
homogeneity_score(y_true, y_pred)
# V-measure
from sklearn.metrics import v_measure_score
v_measure_score(y_true, y_pred)
6.4 Cross-validation
from sklearn.model_selection import cross_val_score  # the old cross_validation module has been removed
print(cross_val_score(knn, X_train, y_train, cv=4))
print(cross_val_score(lr, X, y, cv=2))
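cross_val_score returns one score per fold; a common follow-up (sketch) is to summarise them:
scores = cross_val_score(knn, X_train, y_train, cv=4, scoring='accuracy')
print(scores.mean(), scores.std())   # average score and its spread across the folds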
7 Tuning the model
7.1 Grid Search
import numpy as np
from sklearn.model_selection import GridSearchCV  # grid_search was merged into model_selection
params = {"n_neighbors": np.arange(1, 3),
          "metric": ["euclidean", "cityblock"]}
grid = GridSearchCV(estimator=knn, param_grid=params)
grid.fit(X_train, y_train)
print(grid.best_score_)
print(grid.best_estimator_.n_neighbors)
7.2 Randomized Parameter Optimization
from sklearn.model_selection import RandomizedSearchCV
params = {"n_neighbors": np.arange(1, 5),
          "weights": ["uniform", "distance"]}
rsearch = RandomizedSearchCV(estimator=knn, param_distributions=params,
                             cv=4, n_iter=8, random_state=5)
rsearch.fit(X_train, y_train)
print(rsearch.best_score_)
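Putting the steps above together, a minimal end-to-end sketch (the built-in iris dataset is used here purely as stand-in data):
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
X, y = load_iris(return_X_y=True)                       # toy dataset for illustration
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)
stand = StandardScaler()
X_train_s = stand.fit_transform(X_train)                # fit the scaler on the training set only
X_test_s = stand.transform(X_test)                      # reuse the same statistics on the test set
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_s, y_train)
y_pred = knn.predict(X_test_s)
print(accuracy_score(y_test, y_pred))                   # test-set accuracy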
Reference: adapted from DataCamp.
For more details, see the official documentation and examples:
https://scikit-learn.org/stable/user_guide.html
https://scikit-learn.org/stable/auto_examples/index.html