# Data preprocessing template.
# -*- coding: utf-8 -*-
"""
Spyder Editor
This is a temporary script file.
"""
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
#load the dataset and separate the feature columns from the target column
dataset = pd.read_csv('Data.csv')
# Every column except the last one is treated as a feature.
n_features = dataset.shape[1] - 1
X = dataset.iloc[:, :n_features].values
# Slicing (rather than indexing) the last column keeps y two-dimensional.
y = dataset.iloc[:, n_features:].values
#handle missing data
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; SimpleImputer
# is its replacement. It always imputes column by column, so the old axis=0
# argument has no counterpart, and the missing-value marker must be np.nan
# rather than the string "NaN".
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
# Learn the per-column means from columns 1 and 2 only, then fill the gaps in place.
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
#handle categorical variables
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
# Integer-code column 0 first; labelEncoder_x is kept so the codes can be
# mapped back to the original labels with inverse_transform.
labelEncoder_x = LabelEncoder()
X[:,0] = labelEncoder_x.fit_transform(X[:,0])
# OneHotEncoder(categorical_features=[0]) was removed in scikit-learn 0.22.
# ColumnTransformer now selects the column to encode and passes the remaining
# columns through unchanged, appended after the dummy columns.
# NOTE: ColumnTransformer fits a clone, so the fitted encoder lives in
# columnTransformer.named_transformers_["onehot"], not in oneHotEncoder.
oneHotEncoder = OneHotEncoder()
columnTransformer = ColumnTransformer(
    [("onehot", oneHotEncoder, [0])],
    remainder="passthrough",
    sparse_threshold=0,  # always return a dense array (replaces .toarray())
)
# Cast to float so later steps receive a numeric array instead of an object array.
X = columnTransformer.fit_transform(X).astype(float)
labelEncoder_y = LabelEncoder()
# LabelEncoder expects a 1-D array; ravel() flattens y if it is a column vector.
y = labelEncoder_y.fit_transform(y.ravel())
#Split the dataset into a training set and a test set
from sklearn.model_selection import train_test_split
# 20% of the rows are held out for testing; random_state pins the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
#Feature scaling (standardisation or normalisation)
#Standardisation -> x = (x - mean(x)) / standardDeviation(x)
#  i.e. the deviation from the mean divided by the standard deviation;
#  the standard deviation is the square root of the variance, and the
#  variance is the mean of the squared deviations from the mean.
#Normalisation -> x = (x - min(x)) / (max(x) - min(x))
from sklearn.preprocessing import StandardScaler
# Fit on the training split only, then apply the same scaler to both splits.
sc_X = StandardScaler().fit(x_train)
x_train = sc_X.transform(x_train)
x_test = sc_X.transform(x_test)
# Reader comments