美文网首页
离散制造过程中典型工件的质量符合率分类(2019-08-25)

离散制造过程中典型工件的质量符合率分类(2019-08-25)

作者: 南海金雕 | 来源:发表于2019-08-25 09:15 被阅读0次

    最近,在DataFountain平台由中国计算机学会 & 西门子举办了一个“离散制造过程中典型工件的质量符合率预测”的比赛,就尝试了一下。

    1.数据清洗

    发现数据集的中每个特征最大最小值相差非常大,而且很大的数字不在少数。尝试用正态分布异常点检测法初步对异常值进行判断,并用均值进行替换。然而,替换完后,还是存在不少的异常点。

    清洗前的数据 清洗后的数据

    2.构建模型

    其实,还可以利用加减乘除构建人工特征……并用多个模型融合以提高准确率,这里就是做了一个Baseline,在没有调参数的情况下,分别用SVM、MLP、CNN、LightGBM、XGBoost跑了一遍,发现最后一个准确率在50%左右,其他的都是在41%-45%之间。

    2.1 SVM (这个使用MATLAB跑的)

    Data=csvread('Train_AfterQinXi.csv');

    BiLi=0.1;  %注意点 1.最后一列要按顺序排列,并且最后一列一定是类型,需要设定测试集的比例

    [m,n]=size(Data);%最后一列是分类的类型,Excel要排序

    Testnum=zeros(1,max(Data(:,n))+1);

    Speicesnum=Testnum;

    kkk=1;

    sum0=0;

    BJS=Data(1,n);

    for j=1:m

        if Data(j,n)==BJS

          sum0=sum0+1;

        else

          Speicesnum(kkk)=sum0;

          Testnum(kkk)=floor(BiLi*sum0);kkk=kkk+1;

          sum0=1;BJS=Data(j,n);

        end

    end

    Testnum(1,end)=floor(BiLi*sum0);

    Speicesnum(1,end)=sum0;

    for j=1:length(Testnum)

        if Testnum(j)==0

          Testnum(j)=1;

        end

    end

    %求出每类的个数

    Train_Feature=[];

    Train_Label=[];

    Test_Feature=[];

    Test_Label=[];

    for j=1:max(Data(:,n))+1

        if j==1

          Kaishi=1;

        else

          Kaishi=sum(Speicesnum(1,1:j-1))+1;

        end

        JieSu1=sum(Speicesnum(1,1:j))-Testnum(j);

        JieSu2=sum(Speicesnum(1,1:j));

        Train_Feature=[Train_Feature;Data(Kaishi:JieSu1,1:n-1)];

        Train_Label=[Train_Label;Data(Kaishi:JieSu1,n)];

        Test_Feature=[Test_Feature;Data(JieSu1+1:JieSu2,1:n-1)];

        Test_Label=[Test_Label;Data(JieSu1+1:JieSu2,n)];

    end

    %数据预处理,将训练集和测试集归一化到[0,1]区间

    [mtrain,ntrain] = size(Train_Feature);

    [mtest,ntest] = size(Test_Feature);

    dataset = [Train_Feature;Test_Feature];

    [dataset_scale,ps] = mapminmax(dataset',0,1);

    dataset_scale = dataset_scale';

    Train_Feature = dataset_scale(1:mtrain,:);

    Test_Feature = dataset_scale( (mtrain+1):(mtrain+mtest),: );

    %SVM网络训练和预测

    model = fitcecoc(Train_Feature,Train_Label);

    [predict_label] =predict(model,Test_Feature);

    accuracy=0;

    for j=1:length(Test_Label)

        if Test_Label(j)==predict_label(j)

          accuracy=accuracy+1;

        end

    end

    accuracy=accuracy/length(Test_Label)

    2.2 LightGBM

    import lightgbm as lgb

    import numpy as np

    from pandas import read_csv

    from sklearn import datasets

    from xgboost import plot_importance

    from matplotlib import pyplot as plt

    from sklearn.model_selection import train_test_split

    dataset = read_csv('ZeroOne_Train.csv')

    XXX = read_csv('ZeroOne_Test.csv')

    values = dataset.values

    XY= values

    Y = XY[:,10]

    n_train_hours1 =5398

    x_train=XY[:n_train_hours1,0:10]

    trainY =Y[:n_train_hours1]

    x_test =XY[n_train_hours1:, 0:10]

    testY =Y[n_train_hours1:]

    X_train=np.array(x_train,dtype=np.float)

    X_test=np.array(x_test,dtype=np.float)

    y_train=np.array(trainY,dtype=np.int)

    y_test=np.array(testY,dtype=np.int)

    XXX=np.array(XXX,dtype=np.float)

    params = {

    'boosting_type': 'gbdt',

    'objective': 'multiclassova',

    'num_class': 4, 

    'metric': 'multi_error',

    'num_leaves': 63,

    'learning_rate': 0.01,

    'feature_fraction': 0.9,

    'bagging_fraction': 0.9,

    'bagging_seed':0,

    'bagging_freq': 1,

    'verbose': -1,

    'reg_alpha':1,

    'reg_lambda':2,

    'lambda_l1': 0,

    'lambda_l2': 1,

    'num_threads': 8,

    }

    train_data=lgb.Dataset(X_train,label=y_train)

    validation_data=lgb.Dataset(X_test,label=y_test)

    clf=lgb.train(params,train_data,valid_sets=[validation_data],num_boost_round = 1300,verbose_eval = 100)

    y_pred=clf.predict(XXX, num_iteration=1300)

    2.3 XGBoost

    import xgboost as xgb

    import numpy as np

    from pandas import read_csv

    from xgboost import plot_importance

    from matplotlib import pyplot as plt

    from sklearn.model_selection import train_test_split

    dataset = read_csv('ZeroOne_Train.csv')

    XXX = read_csv('ZeroOne_Test.csv')

    values = dataset.values

    XY= values

    Y = XY[:,10]

    n_train_hours1 =5398

    x_train=XY[:n_train_hours1,0:10]

    trainY =Y[:n_train_hours1]

    x_test =XY[n_train_hours1:, 0:10]

    testY =Y[n_train_hours1:]

    X_train=np.array(x_train,dtype=np.float)

    X_test=np.array(x_test,dtype=np.float)

    y_train=np.array(trainY,dtype=np.int)

    y_test=np.array(testY,dtype=np.int)

    XXX=np.array(XXX,dtype=np.float)

    params = {

        'booster': 'gbtree',

        'objective': 'multi:softmax',

        'num_class': 4,

        'gamma': 0.1,

        'max_depth': 6,

        'lambda': 2,

        'subsample': 0.7,

        'colsample_bytree': 0.7,

        'min_child_weight': 3,

        'silent': 1,

        'eta': 0.1,

        'seed': 1000,

        'nthread': 4,

    }

    plst = params.items()

    dtrain = xgb.DMatrix(X_train, y_train)

    num_rounds = 500

    model = xgb.train(plst, dtrain, num_rounds)

    # 对测试集进行预测

    dtest = xgb.DMatrix(XXX)

    ans = model.predict(dtest)

    2.4 MLP

    from __future__ import print_function

    import keras

    from keras.models import Sequential

    from keras.layers import Dense, Dropout

    from pandas import read_csv

    batch_size = 100

    num_classes = 4

    epochs = 200

    dataset = read_csv('ZeroOne_Train.csv')

    XXX = read_csv('ZeroOne_Test.csv')

    values = dataset.values

    XY= values

    Y = XY[:,10]

    n_train_hours1 =5398

    x_train=XY[:n_train_hours1,0:10]

    trainY =Y[:n_train_hours1]

    x_test =XY[n_train_hours1:, 0:10]

    testY =Y[n_train_hours1:]

    y_train = keras.utils.to_categorical(trainY, num_classes)

    y_test = keras.utils.to_categorical(testY, num_classes)

    model = Sequential()

    model.add(Dense(128,input_dim=10,kernel_initializer='normal',activation='relu'))

    model.add(Dense(128,kernel_initializer='normal',activation='relu'))

    model.add(Dense(128,kernel_initializer='normal',activation='relu'))

    model.add(Dropout(0.25))

    model.add(Dense(num_classes, activation='softmax'))

    model.summary()

    model.compile(loss=keras.losses.categorical_crossentropy,

                  optimizer=keras.optimizers.Adadelta(),

                  metrics=['accuracy'])

    history=model.fit(x_train, y_train,

                            batch_size=batch_size,

                            epochs=epochs,

                            verbose=2,

                            validation_data=(x_test, y_test))

    prediction=model.predict_classes(XXX)

    2.5 CNN

    from __future__ import print_function

    import keras

    from keras.datasets import mnist

    from keras.models import Sequential

    from keras.layers import Dense, Dropout, Flatten

    from keras.layers import Conv2D, MaxPooling2D

    from keras import backend as K

    from pandas import read_csv

    batch_size = 32

    num_classes = 4

    epochs = 200

    # input image dimensions

    # 输入图像维度

    img_rows, img_cols = 4, 4

    input_shape = (img_rows, img_cols, 1)

    # the data, shuffled and split between train and test sets

    # 用于训练和测试的数据集,经过了筛选(清洗、数据样本顺序打乱)和分割(分割为训练和测试集)

    dataset = read_csv('ZeroOne_Train_CNN.csv')

    values = dataset.values

    XY= values

    Featurenumber=img_rows*img_cols

    Y = XY[:,Featurenumber]

    n_train_hours1 =5398

    x_train=XY[:n_train_hours1,0:Featurenumber]

    trainY =Y[:n_train_hours1]

    x_test =XY[n_train_hours1:, 0:Featurenumber]

    testY =Y[n_train_hours1:]

    x_train = x_train.reshape(-1,4,4,1)

    x_test = x_test.reshape(-1,4,4,1)

    y_train = keras.utils.to_categorical(trainY, num_classes)

    y_test = keras.utils.to_categorical(testY, num_classes)

    model = Sequential()

    model.add(Conv2D(16, kernel_size=(3, 3),

                    activation='relu',

                    padding='same',

                    input_shape=input_shape))

    model.add(MaxPooling2D(pool_size=(2, 2)))

    model.add(Conv2D(32, kernel_size=(3, 3),

                    activation='relu',

                    padding='same'))

    model.add(MaxPooling2D(pool_size=(2, 2)))

    model.add(Dropout(0.25))

    model.add(Flatten())

    model.add(Dense(16, activation='relu'))

    model.add(Dropout(0.5))

    model.add(Dense(num_classes, activation='softmax'))

    model.summary()

    model.compile(loss=keras.losses.categorical_crossentropy,

                  optimizer=keras.optimizers.Adadelta(),

                  metrics=['accuracy'])

    history=model.fit(x_train, y_train,

                            batch_size=batch_size,

                            epochs=epochs,

                            verbose=2,

                            validation_data=(x_test, y_test))

    a=history.history['acc']

    b=history.history['val_acc']

    相关文章

      网友评论

          本文标题:离散制造过程中典型工件的质量符合率分类(2019-08-25)

          本文链接:https://www.haomeiwen.com/subject/sjfhectx.html