【r<-方案】数据框ID拆分

作者: 王诗翔 | 来源:发表于2019-05-12 12:42 被阅读16次

    一个样例数据:

    > df
                 gene_id gene_symbol gene_class
    1  ENSG00000000003.9      TSPAN6     coding
    2  ENSG00000000003.9      TSPAN6     coding
    41 ENSG00000000005.5        TNMD     coding
    79 ENSG00000000457.8       SCYL3     coding
                                                                                        microrna       seed_pos seed_type
    1                                                                         miR-132/212/212-3p  chrX:99884666     8-mer
    2                                                                                 miR-133abc  chrX:99884907  7-mer-A1
    41 miR-93/93a/105/106a/291a-3p/294/295/302abcde/372/373/428/519a/520be/520acd-3p/1378/1420ac  chrX:99840338  7-mer-m8
    79                                                                        let-7/98/4458/4500 chr1:169822790  7-mer-A1
       repeat. total_cons_. primates_cons_. mammals_cons_. vertebrates_cons_. tr_region
    1        0           13              67              0                  0    3pUTR,
    2        0           11              33              9                  0    3pUTR,
    41       0           42              89             39                 15      CDS,
    79       0           76             100             91                 31    3pUTR,
    

    目标是将microrna的id进行拆分,得到下面的:

     df_res$MicroRNA
      [1] "miR-132-3p"               "miR-212-3p"               "miR-133abc"               "miR-93-3p-3p"            
      [5] "miR-93-3p-1378"           "miR-93-3p-1420ac"         "miR-93-294-3p"            "miR-93-294-1378"         
      [9] "miR-93-294-1420ac"        "miR-93-295-3p"            "miR-93-295-1378"          "miR-93-295-1420ac"       
     [13] "miR-93-302abcde-3p"       "miR-93-302abcde-1378"     "miR-93-302abcde-1420ac"   "miR-93-372-3p"           
     [17] "miR-93-372-1378"          "miR-93-372-1420ac"        "miR-93-373-3p"            "miR-93-373-1378"         
     [21] "miR-93-373-1420ac"        "miR-93-428-3p"            "miR-93-428-1378"          "miR-93-428-1420ac"       
     [25] "miR-93-519a-3p"           "miR-93-519a-1378"         "miR-93-519a-1420ac"       "miR-93-520be-3p"         
     [29] "miR-93-520be-1378"        "miR-93-520be-1420ac"      "miR-93-520acd-3p"         "miR-93-520acd-1378"      
     [33] "miR-93-520acd-1420ac"     "miR-93a-3p-3p"            "miR-93a-3p-1378"          "miR-93a-3p-1420ac"       
     [37] "miR-93a-294-3p"           "miR-93a-294-1378"         "miR-93a-294-1420ac"       "miR-93a-295-3p"          
     [41] "miR-93a-295-1378"         "miR-93a-295-1420ac"       "miR-93a-302abcde-3p"      "miR-93a-302abcde-1378"   
     [45] "miR-93a-302abcde-1420ac"  "miR-93a-372-3p"           "miR-93a-372-1378"         "miR-93a-372-1420ac"      
     [49] "miR-93a-373-3p"           "miR-93a-373-1378"         "miR-93a-373-1420ac"       "miR-93a-428-3p"          
     [53] "miR-93a-428-1378"         "miR-93a-428-1420ac"       "miR-93a-519a-3p"          "miR-93a-519a-1378"       
     [57] "miR-93a-519a-1420ac"      "miR-93a-520be-3p"         "miR-93a-520be-1378"       "miR-93a-520be-1420ac"    
     [61] "miR-93a-520acd-3p"        "miR-93a-520acd-1378"      "miR-93a-520acd-1420ac"    "miR-105-3p-3p"           
     [65] "miR-105-3p-1378"          "miR-105-3p-1420ac"        "miR-105-294-3p"           "miR-105-294-1378"        
     [69] "miR-105-294-1420ac"       "miR-105-295-3p"           "miR-105-295-1378"         "miR-105-295-1420ac"      
     [73] "miR-105-302abcde-3p"      "miR-105-302abcde-1378"    "miR-105-302abcde-1420ac"  "miR-105-372-3p"          
     [77] "miR-105-372-1378"         "miR-105-372-1420ac"       "miR-105-373-3p"           "miR-105-373-1378"        
     [81] "miR-105-373-1420ac"       "miR-105-428-3p"           "miR-105-428-1378"         "miR-105-428-1420ac"      
     [85] "miR-105-519a-3p"          "miR-105-519a-1378"        "miR-105-519a-1420ac"      "miR-105-520be-3p"        
     [89] "miR-105-520be-1378"       "miR-105-520be-1420ac"     "miR-105-520acd-3p"        "miR-105-520acd-1378"     
     [93] "miR-105-520acd-1420ac"    "miR-106a-3p-3p"           "miR-106a-3p-1378"         "miR-106a-3p-1420ac"      
     [97] "miR-106a-294-3p"          "miR-106a-294-1378"        "miR-106a-294-1420ac"      "miR-106a-295-3p"         
    [101] "miR-106a-295-1378"        "miR-106a-295-1420ac"      "miR-106a-302abcde-3p"     "miR-106a-302abcde-1378"  
    [105] "miR-106a-302abcde-1420ac" "miR-106a-372-3p"          "miR-106a-372-1378"        "miR-106a-372-1420ac"     
    [109] "miR-106a-373-3p"          "miR-106a-373-1378"        "miR-106a-373-1420ac"      "miR-106a-428-3p"         
    [113] "miR-106a-428-1378"        "miR-106a-428-1420ac"      "miR-106a-519a-3p"         "miR-106a-519a-1378"      
    [117] "miR-106a-519a-1420ac"     "miR-106a-520be-3p"        "miR-106a-520be-1378"      "miR-106a-520be-1420ac"   
    [121] "miR-106a-520acd-3p"       "miR-106a-520acd-1378"     "miR-106a-520acd-1420ac"   "miR-291a-3p-3p"          
    [125] "miR-291a-3p-1378"         "miR-291a-3p-1420ac"       "miR-291a-294-3p"          "miR-291a-294-1378"       
    [129] "miR-291a-294-1420ac"      "miR-291a-295-3p"          "miR-291a-295-1378"        "miR-291a-295-1420ac"     
    [133] "miR-291a-302abcde-3p"     "miR-291a-302abcde-1378"   "miR-291a-302abcde-1420ac" "miR-291a-372-3p"         
    [137] "miR-291a-372-1378"        "miR-291a-372-1420ac"      "miR-291a-373-3p"          "miR-291a-373-1378"       
    [141] "miR-291a-373-1420ac"      "miR-291a-428-3p"          "miR-291a-428-1378"        "miR-291a-428-1420ac"     
    [145] "miR-291a-519a-3p"         "miR-291a-519a-1378"       "miR-291a-519a-1420ac"     "miR-291a-520be-3p"       
    [149] "miR-291a-520be-1378"      "miR-291a-520be-1420ac"    "miR-291a-520acd-3p"       "miR-291a-520acd-1378"    
    [153] "miR-291a-520acd-1420ac"   "let-7"                    "let-98"                   "let-4458"                
    [157] "let-4500"  
    

    方案:

    • 先按 "-" 将 microrna 拆分为多列(V1、V2、…)
    • 再对含有 "/" 的列逐列按 "/" 拆分为多行(多个列都含 "/" 时会得到所有组合)
    • 然后把各列(去掉空值)用 "-" 连接成新的 ID,并去除重复行

    代码:

    library(tidyverse)
    # Split every microRNA family ID on "-" into a character matrix with one
    # column per piece; with simplify = TRUE shorter IDs are padded with "".
    microrna_parts <- str_split(df$microrna, pattern = "-", simplify = TRUE)
    # Name the piece columns V1, V2, ... explicitly. The later steps select
    # these columns by their "V" prefix, and as_tibble() on an unnamed matrix
    # only produces such names through a deprecated compatibility path that
    # emits a warning.
    colnames(microrna_parts) <- paste0("V", seq_len(ncol(microrna_parts)))
    # Put the piece columns in front of the original columns.
    df <- microrna_parts %>%
        as_tibble() %>%
        bind_cols(df)
    
    
    # Expand delimiter-packed cells into one row per value.
    #
    # For every column of `df` whose name matches `pattern`, cells that hold
    # several values joined by `sep` are split so that each value gets its own
    # row, with the other columns repeated. Columns are processed one after
    # another, so a row with several packed columns expands to every
    # combination of their values.
    #
    # Arguments:
    #   df      - data frame / tibble to expand.
    #   pattern - regular expression selecting, by name, the columns to inspect.
    #   sep     - regular expression for the delimiter inside a cell.
    #
    # Returns `df` with the matching columns expanded; `df` is returned
    # unchanged when no selected column contains `sep`.
    iter_split <- function(df, pattern = "^V", sep = "/") {
        iter_cols <- grep(pattern, colnames(df), value = TRUE)

        for (col in iter_cols) {
            # na.rm = TRUE keeps an NA cell from turning the condition into NA,
            # which `if` would reject with an error.
            if (any(str_detect(df[[col]], pattern = sep), na.rm = TRUE)) {
                # Forward `sep` so rows are split on the requested delimiter
                # rather than on tidyr's default (any non-alphanumeric run),
                # and use separate_rows() instead of the deprecated
                # separate_rows_().
                df <- separate_rows(df, all_of(col), sep = sep)
            }
        }

        df
    }
    
    # Show the input, expand it to one row per microRNA member, show the result.
    df
    df_res <- iter_split(df)
    df_res

    # Rebuild each ID from its pieces: take the "V"-prefixed piece columns row
    # by row, drop the empty padding cells and join what is left with "-".
    df_res$MicroRNA <- apply(
        df_res[grepl("^V", colnames(df_res))],
        MARGIN = 1,
        FUN = function(pieces) paste(pieces[nzchar(pieces)], collapse = "-")
    )

    # Keep only the first occurrence of each fully identical row.
    df_res <- df_res[!duplicated(df_res), , drop = FALSE]

    # Alternative kept for reference. NOTE(review): unite() pastes the empty
    # padding cells as well, so shorter IDs would presumably end up with stray
    # "-" separators -- confirm before switching to it.
    # df_res %>% 
    #     unite(MicroRNA, starts_with("V"), sep = "-") %>% 
    #     pull(MicroRNA)
    

    相关文章

      网友评论

        本文标题:【r<-方案】数据框ID拆分

        本文链接:https://www.haomeiwen.com/subject/bmhcaqtx.html