寻找差异的feature

作者: 生信编程日常 | 来源:发表于2020-01-10 23:52 被阅读0次

    在生物学上,经常会遇到找control和treat的差异基因或者任意两个或者两个以上处理条件下,最差异的变化,比如我有这样一个数据,几千个细胞分为处理过的和没处理过的,然后通过拍照记录了他们的形态大小等几十个特征,我想知道哪个特征产生了最大的变化。

    head(actin_vim_TC)
    
    image.png
    library(limma)
    #分组
    group<-as.factor(unlist(lapply(rownames(actin_vim_TC),function(x){strsplit(x,"\\_")[[1]][2]})))
    levels(group)[levels(group) %in% c(2:12)] <-"C"
    levels(group)[levels(group) %in% c(3:23)] <-"T"
    design <- model.matrix(~0+factor(group))
    colnames(design)=levels(factor(group))
    rownames(design)=rownames(actin_vim_TC)
    design
    contrast.matrix<-makeContrasts(paste0(unique(group),collapse = "-"),levels = design)
    ##step1
    fit <- lmFit(t(actin_vim_TC),design)
    ##step2
    fit2 <- contrasts.fit(fit, contrast.matrix) 
    fit2 <- eBayes(fit2)  
    ##step3
    DEfeature = topTable(fit2, coef=1, n=Inf)
    DEfeature<-DEfeature[order(abs(DEfeature$logFC),decreasing = T),]
    head(DEfeature)
    

    前几个差异最大的feature如下


    image.png

    将所有细胞做PCA

    #PCA
    actin_vim.pca <- prcomp(as.matrix(actin_vim_TC)[,rownames(DEfeature)[1:20]], center = F,scale = F)
    summary(actin_vim_TC)
    str(actin_vim_TC)
    summary(actin_vim.pca)
    type_old<-as.factor(unlist(lapply(rownames(actin_vim.pca$x),function(x){strsplit(x,"\\_")[[1]][2]})))
    levels(type_old)[levels(type_old) %in% c(2:12) ]<- "C"
    levels(type_old)[levels(type_old) %in% c(13:23) ]<- "T"
    #plot
    gg<-cbind(as.data.frame(type_old),as.data.frame(actin_vim.pca$x))
    gg$type_old<-as.factor(gg$type_old)
    ggplot(gg,aes(x = PC1,y = PC2,color = gg$type_old))+geom_point(alpha = 0.5)
    
    image.png

    可以明显看到两群细胞分为不同的分布方向,所以查看较大特征值和特征向量

    #show the feature
    library(factoextra)
    # Visualize variable with cos2 >= 0.85
    f<-fviz_pca_var(actin_vim.pca, select.var = list(cos2 = 0.85), repel=T, col.var = "cos2", geom.var = c("arrow", "text") )
    f
    
    image.png
    #监督聚类
    library(caret)
    library(kernlab)
    Data<-as.matrix(actin_vim_TC)[,rownames(DEfeature)[1:20]]
    Data<-cbind(Data,as.data.frame(type_old))
    
    intrain<-createDataPartition(Data$type_old,p = 0.75,list = F)
    head(intrain)
    training<-Data[intrain,]
    testing<-Data[-intrain,]
    modelFit<-train(type_old~.,data=training,method = "glm")
    modelFit$finalModel
    predictions <- predict(modelFit,newdata=testing)
    predictions 
    confusionMatrix(predictions,testing$type)
    
    image.png

    正确率也有85%,看看ROC曲线

    library(pROC)
    Roc=roc(testing$type_old,factor(predictions,ordered = T))
    Roc
    plot(Roc, print.auc = TRUE, auc.polygon = TRUE, legacy.axes = TRUE, 
         grid = c(0.1, 0.2), grid.col = c("green", "red"), max.auc.polygon = TRUE,  
         auc.polygon.col = "skyblue", print.thres = TRUE, xlab = "specificity", ylab = "sensitivity",
         main = "")  
    
    image.png

    查看机器学习分群的feature重要性

    importance <- varImp(modelFit, scale=FALSE)
    # summarize importance
    print(importance)
    # plot importance
    plot(importance)
    
    image.png

    我们可以看到三种方式的结果几乎是差不多的,说明差异最显著的feature是在不同的方法计算方式都是稳定的。

    相关文章

      网友评论

        本文标题:寻找差异的feature

        本文链接:https://www.haomeiwen.com/subject/bufvactx.html