美文网首页
从GO数据库找基因集合做AUCell

从GO数据库找基因集合做AUCell

作者: 小洁忘了怎么分身 | 来源:发表于2023-06-28 07:33 被阅读0次

    AUCell

    1.提取指定GOterm的基因

    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7606976/

    在这篇文章里提到,Reactive oxygen species (ROS) 活性氧相关的17个GO条目,可以把这些基因提取出来。使用msigdbr包搞定。Data_Sheet_2.XLSX是文章的附件

    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7606976/bin/Data_Sheet_2.XLSX

    rm(list = ls())
    library(tidyverse)
    dat = rio::import("Data_Sheet_2.XLSX")
    g = dat[-1,1]
    g
    
    ##  [1] "GO_REACTIVE_OXYGEN_SPECIES_BIOSYNTHETIC_PROCESS"                       
    ##  [2] "GO_REACTIVE_OXYGEN_SPECIES_METABOLIC_PROCESS"                          
    ##  [3] "GO_POSITIVE_REGULATION_OF_REACTIVE_OXYGEN_SPECIES_BIOSYNTHETIC_PROCESS"
    ##  [4] "GO_NEGATIVE_REGULATION_OF_REACTIVE_OXYGEN_SPECIES_BIOSYNTHETIC_PROCESS"
    ##  [5] "GO_NEGATIVE_REGULATION_OF_REACTIVE_OXYGEN_SPECIES_METABOLIC_PROCESS"   
    ##  [6] "GO_POSITIVE_REGULATION_OF_REACTIVE_OXYGEN_SPECIES_METABOLIC_PROCESS"   
    ##  [7] "GO_MITOCHONDRIAL_ELECTRON_TRANSPORT_CYTOCHROME_C_TO_OXYGEN"            
    ##  [8] "GO_SUPEROXIDE_GENERATING_NADPH_OXIDASE_ACTIVITY"                       
    ##  [9] "GO_HYDROGEN_PEROXIDE_METABOLIC_PROCESS"                                
    ## [10] "GO_HYDROGEN_PEROXIDE_CATABOLIC_PROCESS"                                
    ## [11] "GO_PENTOSE_METABOLIC_PROCESS"                                          
    ## [12] "GO_RESPONSE_TO_REACTIVE_OXYGEN_SPECIES"                                
    ## [13] "GO_REGULATION_OF_RESPONSE_TO_REACTIVE_OXYGEN_SPECIES"                  
    ## [14] "GO_CELLULAR_RESPONSE_TO_REACTIVE_OXYGEN_SPECIES"                       
    ## [15] "GO_REGULATION_OF_REACTIVE_OXYGEN_SPECIES_BIOSYNTHETIC_PROCESS"         
    ## [16] "GO_REGULATION_OF_REACTIVE_OXYGEN_SPECIES_METABOLIC_PROCESS"            
    ## [17] "GO_NEGATIVE_REGULATION_OF_RESPONSE_TO_REACTIVE_OXYGEN_SPECIES"
    

    17条term👆

    2.从msigdbr查找这些term

    library(msigdbr)
    GO_df = msigdbr(species = "Homo sapiens",category = "C5") %>% 
      dplyr::select(gene_symbol,gs_name,gs_subcat)
    
    dim(GO_df)
    
    ## [1] 1424471       3
    
    table(GO_df$gs_subcat)
    
    ## 
    ##  GO:BP  GO:CC  GO:MF    HPO 
    ## 721379 115769 122674 464649
    
    GO_df = GO_df[GO_df$gs_subcat!="HPO",]
    
    table(g %in% GO_df$gs_name)
    
    ## 
    ## FALSE 
    ##    17
    

    全部匹配失败啦?

    其实是因为前缀不同

    head(GO_df$gs_name,3)
    
    ## [1] "GOBP_10_FORMYLTETRAHYDROFOLATE_METABOLIC_PROCESS"
    ## [2] "GOBP_10_FORMYLTETRAHYDROFOLATE_METABOLIC_PROCESS"
    ## [3] "GOBP_10_FORMYLTETRAHYDROFOLATE_METABOLIC_PROCESS"
    
    head(g,3)
    
    ## [1] "GO_REACTIVE_OXYGEN_SPECIES_BIOSYNTHETIC_PROCESS"                       
    ## [2] "GO_REACTIVE_OXYGEN_SPECIES_METABOLIC_PROCESS"                          
    ## [3] "GO_POSITIVE_REGULATION_OF_REACTIVE_OXYGEN_SPECIES_BIOSYNTHETIC_PROCESS"
    

    把前缀去掉来匹配,顺便把大写变成小写,变得易读一点。

    sn = GO_df$gs_name %>%
      str_to_lower()%>%
      str_replace("_","///") %>%
      str_split("///",simplify = T)%>%
      .[,2] %>%
      str_replace_all("_"," ")
    head(sn)
    
    ## [1] "10 formyltetrahydrofolate metabolic process"
    ## [2] "10 formyltetrahydrofolate metabolic process"
    ## [3] "10 formyltetrahydrofolate metabolic process"
    ## [4] "10 formyltetrahydrofolate metabolic process"
    ## [5] "10 formyltetrahydrofolate metabolic process"
    ## [6] "10 formyltetrahydrofolate metabolic process"
    
    GO_df = mutate(GO_df,short_name = sn)
    
    g = str_to_lower(g)%>%
      str_replace("_","///") %>%
      str_split("///",simplify = T)%>%
      .[,2] %>%
      str_replace_all("_"," ")
    head(g)
    
    ## [1] "reactive oxygen species biosynthetic process"                       
    ## [2] "reactive oxygen species metabolic process"                          
    ## [3] "positive regulation of reactive oxygen species biosynthetic process"
    ## [4] "negative regulation of reactive oxygen species biosynthetic process"
    ## [5] "negative regulation of reactive oxygen species metabolic process"   
    ## [6] "positive regulation of reactive oxygen species metabolic process"
    
    table(g %in% GO_df$short_name)
    
    ## 
    ## FALSE  TRUE 
    ##     1    16
    

    经过数据框搜索,发现匹配不上的那个其实是因为少了俩空格。。。

    所以直接改一下就好。

    “superoxide generating nadph oxidase activity”

    “superoxide generating nad p h oxidase activity”

    g[!(g %in% GO_df$short_name)] = "superoxide generating nad p h oxidase activity"
    table(g %in% GO_df$short_name)
    
    ## 
    ## TRUE 
    ##   17
    
    rosgene = GO_df %>%
      filter(short_name %in% g) %>%
      pull(gene_symbol) %>%
      unique()
    length(rosgene)
    
    ## [1] 402
    

    2.AUCell

    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9897923/
    用这个文章里的方法,将单细胞亚群的marker基因与ros相关基因取交集,用作AUCell的基因集

    The intersection of marker genes was selected based on strong population specificity (adj_p < 0.05 & |avg_log2FoldChange| > 1.5 & pct.1 > 0.5 & pct.2 < 0.5) from each cell subgroup and factors related to OS responses.

    sce.all是seurat+singleR得出的对象,markers是findallmarkers得到的数据框。

    load("sce.all.Rdata")
    load("makers.Rdata")
    k = markers$p_val_adj < 0.05 & 
      abs(markers$avg_log2FC) > 1.5 & 
      markers$pct.1 > 0.5 & 
      markers$pct.2 < 0.5
    table(k)
    
    ## k
    ## FALSE  TRUE 
    ## 13100  1483
    
    gi = intersect(markers[k,"gene"],rosgene) 
    gi = factor(gi,levels = gi)
    library(Seurat)
    DotPlot(sce.all,features = gi,cols = "RdYlBu",
            group.by = "RNA_snn_res.0.5")+RotatedAxis()
    

    AUCell用于计算每个细胞中特定基因集的活性程度。用上面的gi作为基因集来计算。

    AUCell的三个步骤:

    Build the rankings:矩阵中的每个细胞里,给基因进行排序。 Calculate the Area Under the Curve (AUC):计算每个细胞的AUC值 Set the assignment thresholds:计算活性区分的阈值

    library(GSEABase)
    geneSets <- GeneSet(as.character(gi), setName="ROS")
    geneSets
    
    ## setName: ROS 
    ## geneIds: ACP5, SOD3, ..., PMAIP1 (total: 32)
    ## geneIdType: Null
    ## collectionType: Null 
    ## details: use 'details(object)'
    
    library(AUCell)
    exprMatrix = sce.all@assays$RNA@data
    cells_rankings <- AUCell_buildRankings(exprMatrix)
    
    ##  min   1%   5%  10%  50% 100% 
    ##  474  716  794  859 1409 5996
    
    cells_AUC <- AUCell_calcAUC(geneSets, cells_rankings)
    set.seed(333)
    cells_assignment <- AUCell_exploreThresholds(cells_AUC, plotHist=TRUE, assign=TRUE) 
    
    auc_thr = cells_assignment$ROS$aucThr$selected
    auc_thr
    
    ##       L_k2 
    ## 0.07917315
    

    3.FeaturePlot可视化

    标准版:

    library(Seurat)
    a = as.numeric(getAUC(cells_AUC))
    sce.all$auc_score = a
    FeaturePlot(sce.all,features = "auc_score") 
    

    定制版:

    dat<- data.frame(sce.all@meta.data, 
                    sce.all@reductions$umap@cell.embeddings,
                    sce.all@active.ident)
    colnames(dat)[ncol(dat)] = "seurat_annotation"
    class_avg <- dat %>%
      group_by(seurat_annotation) %>%
      summarise(
        UMAP_1 = median(UMAP_1),
        UMAP_2 = median(UMAP_2)
      )
    
    library(ggpubr)
    ggplot(dat, aes(UMAP_1, UMAP_2))  +
      geom_point(aes(colour  = auc_score)) + viridis::scale_color_viridis(option="A") +
      ggrepel::geom_label_repel(aes(label = seurat_annotation),
                                data = class_avg,
                                label.size = 0,
                                segment.color = NA)+
      theme(legend.position = "none") + theme_bw()
    
    dat$auc_group = ifelse(dat$auc_score>auc_thr,"active","non_active")
    

    4.活跃和不活跃组的细胞比例条形图

    ggplot(dat,aes(auc_group,fill = seurat_annotation))+
      geom_bar(position = "fill", alpha = 0.9)+
      scale_fill_brewer(palette = "Set1")+
      theme_classic()
    

    https://www.jianshu.com/p/2fb20f44da67

    相关文章

      网友评论

          本文标题:从GO数据库找基因集合做AUCell

          本文链接:https://www.haomeiwen.com/subject/oyagydtx.html