美文网首页每日练笔
GPL16686芯片平台分析

GPL16686芯片平台分析

作者: 天涯清水 | 来源:发表于2022-06-20 22:33 被阅读0次

    #芯片分析中经常会遇到Affymetrix Human Transcriptome Array 2.0芯片,由于目前还没有现成的R包可以用,因此分析方法也不统一。见生信技能树Jimmy老师HTA2.0芯片比较麻烦,其实这类常见的有3个平台,3种类型:

    GPL17586 [HTA-2_0] Affymetrix Human Transcriptome Array 2.0 [transcript (gene) version]
    GPL19251 [HuGene-2_0-st] Affymetrix Human Gene 2.0 ST Array [probe set (exon) version]
    GPL16686 [HuGene-2_0-st] Affymetrix Human Gene 2.0 ST Array [transcript (gene) version]
    对于这三种平台可以去Affymetrix的官网去查看其区别,也可以去NCBI去查看:
    GPL17586
    GPL19251
    GPL16686

    我安装芯片分析的一般流程进行分析,以GPL16686平台,GSE77532

    1、读入soft文件,手动下载GSE77532对应的soft文件,实际应该下载GPL16686对应的GPL16686.soft文件,网速原因,只能退而求其次。

    rm(list = ls())
    options(stringsAsFactors = F)
    
    #读入soft文件
    library(GEOquery)
    gse77532 <- getGEO(filename = "GSE77532_family.soft.gz",destdir = ".")
    dim(gse77532)
    
    y <- gse77532@gpls$GPL16686@dataTable@table
    dim(y)
    
    head(y)
    y[1:4,1:8]
    
    

    2、 id转换

    
    #### id conversion
    
    library(clusterProfiler)
    ENTREZID<- bitr(y[,6], fromType = "ACCNUM", 
                    toType=c("SYMBOL","ENSEMBL","ENTREZID"),
                    OrgDb = org.Hs.eg.db)
    ls(package:clusterProfiler)
    
    dim(ENTREZID)
    ENTREZID[1:5,1:4]
    
    save(y,ENTREZID,file = "ids.Rdata")
    
    #ids过滤探针
    table(y$GB_ACC %in% ENTREZID$ACCNUM)
    y1 <- y[y$GB_ACC %in% ENTREZID$ACCNUM,]
    y1[1:5,1:8]
    y2 <- y1[,c(1,6)]
    names(y2) <- c("probe_id","ACCNUM")
    
    #合并y2与ENTREZID
    ids <- merge(y2,ENTREZID,by ="ACCNUM",all=F)
    ids[1:5,1:5]
    dim(ids)
    ##载入表达矩阵
    load("GSE77532_exprSet.Rdata")
    
    exprSet <- exprSet2
    exprSet[1:5,1:6]
    
    #过滤表达矩阵
    
    exprSet <- exprSet[rownames(exprSet) %in% ids$probe_id,]
    dim(exprSet)
    exprSet[1:5,1:5]
    
    #ids过滤探针
    ids <- ids[match(rownames(exprSet),ids$probe_id),]
    dim(ids)
    ids[1:2,1:5]
    ids <- ids[,c(2,3)]
    dim(ids)
    ids[1:2,1:2]
    #合并表达矩阵和ids
    
    idcombine <- function(exprSet, ids){
      tmp <- by(exprSet,
                ids$SYMBOL,
                function(x) rownames(x)[which.max(rowMeans(x))])
      probes <- as.character(tmp)
      print(dim(exprSet))
      exprSet <- exprSet[rownames(exprSet) %in% probes,]
      
      print(dim(exprSet))
      rownames(exprSet) <- ids[match(rownames(exprSet), ids$probe_id),2]
      return(exprSet)
    }
    
    new_exprSet <- idcombine(exprSet,ids)
    new_exprSet[1:4,1:6]
    

    id 转换用biomaRt包,更方便一些,知识网速支持不下来。

    GPL17586平台芯片

    #
    rm(list = ls())
    options(stringsAsFactors = F)
    
    #加载R包
    
    library(GEOquery)
    
    #读入soft文件
    
    GSE110359 <- getGEO(filename = "GSE110359_family.soft.gz",destdir = ".")
    dim(GSE110359)
    
    y <- GSE110359@gpls$GPL17586@dataTable@table
     
    
    dim(y)
    
    head(y)
    y[1:4,1:15]
    View(head(y))## you need to check this , which column do you need
    
    probe2gene <- y[,c(2,8)] 
    
    library(stringr)  
    probe2gene$symbol=trimws(str_split(probe2gene$gene_assignment,'//',simplify = T)[,2])
    plot(table(table(probe2gene$symbol)),xlim=c(1,50))
    head(probe2gene)
    
    
    dim(probe2gene)
    View(head(probe2gene))
    ids2 <- probe2gene[,c(1,3)]
    View(head(ids))
    ids2[1:20,1:2]#含有缺失值
    save(ids2,probe2gene,file='GSE110359-probe2gene.Rdata')
    
    load("GSE110359-probe2gene.Rdata")
    #####
    View(head(probe2gene))
    dim(probe2gene)
    

    ID 转换

    library(biomaRt)
    
    x <- probe2gene$probeset_id
    value <- x
    attr <- c("affy_hta_2_0","hgnc_symbol")
    
    ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl") 
    
    
    
    ids <- getBM(attributes = attr,
                 filters = "affy_hta_2_0",
                 values = value,
                 mart = ensembl,
                 useCache = F)
    
    dim(ids)#[1] 1041    2
    View(head(ids))
    
    save(ids,file = "GPL17586_ids.Rdata")
    
    
    #去重之后
    table(unique(ids$hgnc_symbol))#28262
    
    attributes <- listAttributes(ensembl)
    View(attributes) # 查看转换格式
    
    save(ids,ensembl,y,file = "ensembl.Rdata")
    

    相关文章

      网友评论

        本文标题:GPL16686芯片平台分析

        本文链接:https://www.haomeiwen.com/subject/xojzxktx.html