Poly(A) signal site prediction: ruimtehol (StarSpace), MLP, and CNN

Author: 纵春水东流 | Published 2020-03-07 10:03

    I. Results

    1. ruimtehol

    Ten-fold cross-validation averages per signal. TP/TN/FP/FN are mean counts per fold (hence the decimals), and the AUC column is computed as (Sn+Sp)/2, i.e. balanced accuracy rather than a true ROC AUC.

              TP    TN   FP   FN        Sn        Sp  Accuracy       MCC       AUC    Fscore
    AAAAAG  55.5  52.3  6.0  9.2 0.8611364 0.9007420 0.8764394 0.7573759 0.8809392 0.8794544
    AAGAAA  52.4  49.2 10.1 13.3 0.7981628 0.8362825 0.8127305 0.6300846 0.8172227 0.8159164
    AATAAA 202.5 199.5 57.0 60.0 0.7722173 0.7786982 0.7745630 0.5500423 0.7754578 0.7756715
    AATACA  34.7  37.7  9.3  6.3 0.8464419 0.8045571 0.8227273 0.6482072 0.8254995 0.8154168
    AATAGA  16.9  16.2  1.6  2.3 0.8891735 0.9153497 0.8947724 0.7965120 0.9022616 0.8975811
    AATATA  15.7  18.3  4.8  2.2 0.8865425 0.8079497 0.8290767 0.6757853 0.8472461 0.8109587
    ACTAAA  25.1  27.2  9.4  7.3 0.7785348 0.7476896 0.7578669 0.5210172 0.7631122 0.7485714
    AGTAAA  23.0  27.2 10.5  6.3 0.7882839 0.7266734 0.7492424 0.5067008 0.7574787 0.7296239
    ATTAAA  90.8  90.5 29.2 29.5 0.7652125 0.7583934 0.7554167 0.5171202 0.7618029 0.7557712
    CATAAA  15.2  15.9  5.3  4.6 0.7685621 0.7680430 0.7591754 0.5275986 0.7683026 0.7469583
    GATAAA  18.2  19.0  4.8  4.0 0.8224295 0.8046294 0.8086957 0.6221677 0.8135294 0.8034097
    TATAAA  30.1  30.8  8.9  8.2 0.7905583 0.7828915 0.7807692 0.5674452 0.7867249 0.7769580
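
    Since every column derives from the confusion counts, a row's metrics can be re-derived from its mean TP/TN/FP/FN, as in the Python sketch below. The values come out close to, but not identical with, the table, because the table averages per-fold metrics rather than pooling counts across folds.

    import math

    # mean per-fold confusion counts from the AATAAA row above
    TP, TN, FP, FN = 202.5, 199.5, 57.0, 60.0
    Sn = TP / (TP + FN)                         # sensitivity, ~0.771
    Sp = TN / (TN + FP)                         # specificity, ~0.778
    accuracy = (TP + TN) / (TP + TN + FP + FN)  # ~0.775
    MCC = (TP*TN - FP*FN) / math.sqrt((TP+FP)*(TN+FN)*(TN+FP)*(TP+FN))
    AUC = (Sn + Sp) / 2                         # balanced accuracy, as in the R code below
    Fscore = 2*TP / (2*TP + FP + FN)
    print(Sn, Sp, accuracy, MCC, AUC, Fscore)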
    

    2. Multilayer perceptron (binary classification)

    [loss, accuracy] on the held-out test set:
    AATAAA:[0.8702749238281177, 0.6274131271829937]
    

    3. CNN

    AATAAA: 79.77% (+/- 3.01%) accuracy, 10-fold CV mean ± s.d.
    

    4. RNN

    5. VGG

    6. CNN-RNN

    II. Code

    Data: the working directory contains one file of positive sequences per signal hexamer, plus a matching neg*.txt file of negatives:

    AAAAAG.txt  AATAGA.txt  ATTAAA.txt     negAAGAAA.txt  negAATATA.txt  negCATAAA.txt
    AAGAAA.txt  AATATA.txt  CATAAA.txt     negAATAAA.txt  negACTAAA.txt  negGATAAA.txt
    AATAAA.txt  ACTAAA.txt  GATAAA.txt     negAATACA.txt  negAGTAAA.txt  negTATAAA.txt
    AATACA.txt  AGTAAA.txt  negAAAAAG.txt  negAATAGA.txt  negATTAAA.txt  TATAAA.txt
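
    Judging from how both the R and Python code below slice these files, each line holds a 206-nt sequence with the signal hexamer at positions 101-106 (1-based). A minimal, untested sketch to verify that assumption (file name illustrative):

    with open('AATAAA.txt') as f:
        for line in f:
            seq = line.strip()
            assert len(seq) == 206, len(seq)
            assert seq[100:106] == 'AATAAA', seq[100:106]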
    

    1. ruimtehol

    # load libraries
    library(ruimtehol)   # StarSpace embeddings
    require(magrittr)    # pipe operator
    require(caret)       # createFolds() for cross-validation
    ######################
    set.seed(177)
    
    # Turn each 206-nt sequence into a space-separated string of overlapping
    # k-mers, after splicing out the signal hexamer at positions 101-106.
    kmer <- function(x,k=5){
      b=nchar(x[1])
      for(i in 1:length(x)){
        delta1=paste(substr(x[i],1,100),substr(x[i],107,206),sep = '')  # 200 nt remain
        delta2=NULL
        for(j in 1:(b-6-k+1)){  # all (200-k+1) overlapping k-mers
          delta2 <- c(delta2,substr(delta1,j,j+k-1))
        }
        x[i] <- paste(delta2,sep = '',collapse = ' ')
      }
      return(x)
    }
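    # e.g. a sequence beginning "ACGTT..." becomes "ACGTT CGTTA GTTAC ...",
    # i.e. one 'document' of k-mer 'words' per sequence, which is the text
    # format that embed_tagspace() consumes.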
    
    # the 12 positive files (alphabetical positions 1-11, plus TATAAA.txt
    # at position 24, after the neg* files)
    fileNames <- dir('.')[c(1:11,24)]
    #fileNames='AATAAA.txt'
    total_access <- NULL
    for(filename in fileNames){
      pData <- read.csv(filename,header = F,stringsAsFactors = F)[,1]  %>% as.vector() %>% kmer()
      negData <- read.csv(paste('neg',filename,sep = ''),header = F,stringsAsFactors = F)[,1] %>% as.vector() %>% kmer()
      data = data.frame(rbind(cbind(pData,label=1), cbind(negData,label=0 )),stringsAsFactors = F)
      
      require(caret)
      folds<-createFolds(y=data$label,k=10) # split into 10 folds, stratified by label
      
      access <- NULL
      for(i in 1:10){
        train_x <- data[-folds[[i]],1]
        train_y <- data[-folds[[i]],2]
      
        test_x  <- data[folds[[ i]],1]
        test_y  <- data[folds[[ i]],2]
    # fit a StarSpace tagspace model (supervised text classifier)
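    # dim = embedding dimension; negSearchLimit = number of negatives sampled
    # per example; ngrams = maximum n-gram length; minCount = drop k-mers seen
    # fewer than 5 times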
        model <- embed_tagspace(x=train_x,y=train_y,
                                dim = 30,epoch = 1, loss = "hinge", adagrad = T, 
                                similarity = "dot", negSearchLimit = 10,ngrams = 10,
                                minCount = 5)
        
    # evaluate on the held-out fold
        result <- predict(model,test_x)
        TN=TP=FN=FP=0
        for(j in 1:length(test_x)){
      if(test_y[j]==1 & result[[j]]$prediction[1,1]==1){TP=TP+1 }
      if(test_y[j]==1 & result[[j]]$prediction[1,1]==0){FN=FN+1 }  # missed positive
      if(test_y[j]==0 & result[[j]]$prediction[1,1]==0){TN=TN+1 }
      if(test_y[j]==0 & result[[j]]$prediction[1,1]==1){FP=FP+1 }  # false alarm
        }
        Sn = TP/(TP+FN)
        Sp=TN/(TN+FP)
        Accuracy = (TP+TN)/(TN+FP+TP+FN)
        MCC=(TP*TN-FP*FN)/sqrt((TP+FP)*(TN+FN)*(TN+FP)*(TP+FN))
    AUC=(Sn+Sp)/2  # balanced accuracy, used here as an AUC proxy
        Fscore=(2*TP)/(2*TP+FP+FN)
        access=rbind(access,data.frame(TP,TN,FP,FN,Sn,Sp,Accuracy,MCC,AUC,Fscore))
      }
      total_access=rbind(total_access,apply(access,2,mean))
    }
    rownames(total_access) <- sapply(fileNames,function(x) substr(x,1,6),USE.NAMES = F)
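    # one row of 10-fold mean metrics per signal hexamer: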
    total_access
    
    
    
    

    2. Multilayer perceptron (binary classification)

    # load packages
    import os
    import numpy as np
    import pandas as pd
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences
    from keras.models import Sequential
    from keras.layers import Dense, Dropout

    # set the working directory
    os.chdir('/home/uu/Desktop/polyA_predict/data/polyadata/')
    
    # read the data: one sequence per line; positives labelled 1, negatives 0
    df1 = pd.read_csv('AATAAA.txt',header=None,names=['sequence'])
    df1.loc[:,'labels']=1

    df2 = pd.read_csv('negAATAAA.txt',header=None,names=['sequence'])
    df2.loc[:,'labels']=0
    df = pd.concat([df1,df2])
    df['sequence']=df['sequence'].map(lambda x: list(x))  # string -> list of chars
    # vectorize: map A/C/G/T to integer indices; num_words must be 5, not 4,
    # since Tokenizer only keeps word indices strictly below num_words
    vocabulary_size=5
    tokenizer = Tokenizer(num_words= vocabulary_size)
    tokenizer.fit_on_texts(df['sequence'])

    sequences = tokenizer.texts_to_sequences(df['sequence'])
    # sequences are 206 long; maxlen=200 drops the first 6 positions
    # (pad_sequences truncates from the front by default)
    data = pad_sequences(sequences, maxlen=200)
    labels = np.array(df['labels'])
    
    # train/test split: hold out the first 260 and last 259 rows as the test set
    train=list(range(260,4931))
    test=list(range(0,260))+list(range(4931,5190))
    x_train = data[train]
    y_train = labels[train]
    x_test = data[test]
    y_test = labels[test]
    # build and compile the network
    model = Sequential()
    model.add(Dense(128, input_dim=200, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    
    model.fit(x_train, y_train,
              epochs=20,batch_size=128)
    score = model.evaluate(x_test, y_test,batch_size=128)
    
    print(score)  # [test loss, test accuracy]
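
    To compare with the R table in section I, the same Sn/Sp/MCC-style metrics can be computed from the MLP's test-set predictions. A short sketch using sklearn (not part of the original script):

    from sklearn.metrics import confusion_matrix, matthews_corrcoef

    y_pred = (model.predict(x_test) > 0.5).astype(int).ravel()
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    print('Sn', tp / (tp + fn), 'Sp', tn / (tn + fp),
          'MCC', matthews_corrcoef(y_test, y_pred))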
    

    3. CNN

    from keras.preprocessing.text import Tokenizer
    from keras.models import Sequential
    from keras.layers import Dense, Dropout, Activation
    from keras.layers import Conv2D, MaxPool2D, Flatten
    from keras.optimizers import Adam
    from keras.utils import np_utils   # needed for to_categorical below
    from sklearn.model_selection import StratifiedKFold
    import numpy as np
    import pandas as pd
    
    # set the working directory
    import os
    os.chdir('/home/uu/Desktop/polyA_predict/data/polyadata/')
    np.random.seed(777)
    # read the data
    df1 = pd.read_csv('AATAAA.txt',header=None,names=['sequence'])
    df1.loc[:,'labels']=1
    df2 = pd.read_csv('negAATAAA.txt',header=None,names=['sequence'])
    df2.loc[:,'labels']=0
    df = pd.concat([df1,df2])                            # sequences, labels
    df['sequence']=df['sequence'].map(lambda x: list(x)) # list of chars, labels
    
    # vectorize: map A/C/G/T to integer indices
    vocabulary_size=200
    tokenizer = Tokenizer(num_words= vocabulary_size)
    tokenizer.fit_on_texts(df['sequence'])
    sequences = tokenizer.texts_to_sequences(df['sequence'])  # list of lists
    sequences=[list(np.asarray(x)-1) for x in sequences]      # shift indices down to 0-3
    # sequences[1]
    # [1 0 0 0 2 1 2 2 1 1 2 2 1 1 0 3 0 2 1 1 2 1 2 1 0 2 1 0 2 1 1 0 0 2 2 0 3
    #  1 0 1 2 2 3 0 0 2 2 0 2 1 3 1 3 0 2 2 3 3 0 0 2 2 0 1 2 1 3 0 1 3 2 0 0 2
    #  0 2 1 0 1 1 1 3 0 0 0 1 2 3 0 0 2 0 0 0 1 2 0 0 2 0 0 0 1 0 0 0 1 3 1 1 1
    #  2 2 3 1 3 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 2 1
    #  3 0 1 3 0 2 1 1 3 1 3 0 0 3 3 1 2 2 2 0 3 0 3 0 3 3 1 3 0 2 0 0 0 3 0 3 3
    #  1 2 1 0 1 0 1 0 1 3 0 0 2 2 1 3 3 3 0 3 3]
    
    # delete the 6 middle elements (the signal hexamer), leaving 200 positions
    for i in range(len(sequences)):
        del sequences[i][100:106]
    
    # one-hot encode to an array of shape (5190, 1, 200, 4); num_classes=4
    # guards against sequences that happen to lack one of the four bases
    data=[np_utils.to_categorical(x,num_classes=4) for x in sequences]  # list of (200,4) arrays
    data=np.concatenate((data),axis=0).reshape(-1,1,200,4)  # (5190, 1, 200, 4)
    labels = np.asarray(df.iloc[:,1])
    # train/test split handled below by stratified 10-fold CV
    
    seed=777
    np.random.seed(777)
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
    cvscores = []
    X=data
    Y=labels
    for train, test in kfold.split(X, Y):
        # create model
        model = Sequential()
        model.add(Conv2D(filters=16, kernel_size=(3,4), padding='same',
                         input_shape=(1,200,4)  ))  # input: height 1, width 200, 4 channels
    
        model.add(Conv2D(filters=64,kernel_size=(6,4),padding='same'))
        model.add(Activation('relu'))
    
        model.add(MaxPool2D(pool_size=(3,4),padding='same'))
        model.add(Dropout(0.5))
    
        model.add(Flatten())
        model.add(Dense(64))
    
        model.add(Dense(1, activation='sigmoid'))  # sigmoid output for binary_crossentropy
    
        #compile
        adam=Adam(lr=1e-4)
        model.compile(loss='binary_crossentropy',
                      optimizer=adam,
                      metrics=['accuracy'])
        
        # Fit the model
        model.fit(X[train], Y[train], epochs=30, batch_size=64, verbose=0)
        # evaluate the model
        scores = model.evaluate(X[test], Y[test], verbose=0)
        print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
        cvscores.append(scores[1] * 100)
    print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))
    
