GEO数据分析之数据下载

作者: 天涯清水 | 来源:发表于2020-01-03 22:13 被阅读0次

step1-Load Data and Quality Control

一、数据下载和读入

数据下载和读入，有与下载很慢，直接导入下载好的数据。

##数据下载和读入

#GEO Accession viewer https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE63067
suppressPackageStartupMessages(library(GEOquery))
eSet <- getGEO('GSE63067', 
                destdir = ".",
                getGPL= F)
save(eSet,file = 'GSE6306.Rdata')
load('GSE63067_exprSet.Rdata')

(1)提取表达矩阵

#(1)提取表达矩阵
#eSet #查看eSet
exprSet <- exprSet
# class(eSet)#eSet的的类型
# 
# #str(eSet) #查看结构
# 
# length(eSet)#查看list有多少个元素；eSet这个list只有一个元素

#exprSet <- exprs(eSet[[1]])#exprs函数提取表达矩阵

exprSet[1:5,1:5]#查看表达矩阵

##           GSM1539877 GSM1539878 GSM1539879 GSM1539880 GSM1539881
## 1007_s_at   6.317799   6.220083   5.801907   6.322376   6.090536
## 1053_at     5.411170   5.445441   4.764031   5.170063   5.771946
## 117_at      6.139097   7.166847   5.842836   6.899896   6.499958
## 121_at      6.548681   6.317436   6.224910   5.986812   6.181588
## 1255_g_at   4.000176   4.130992   3.986064   3.904768   4.006547

dim(exprSet)

## [1] 54675    18

(2)提取临床信息

#(2)提取临床信息
# pdata <- pData(eSet[[1]])#样本分组信息
# head(pdata)
# 
# samples <- sampleNames(eSet[[1]])#有多少GSM
# head(samples)

#构建样本信息
group_list=c(rep('Steatosis',2),rep('Non-alcoholic steatohepatitis',9),rep('Healthy',7))
group_list=factor(group_list)
head(group_list)

## [1] Steatosis                     Steatosis                    
## [3] Non-alcoholic steatohepatitis Non-alcoholic steatohepatitis
## [5] Non-alcoholic steatohepatitis Non-alcoholic steatohepatitis
## Levels: Healthy Non-alcoholic steatohepatitis Steatosis

(3)质控

一定要用boxplot看一下各个样本的表达量是不是在同一条水平线上，如果不在同一条平行线上，需要进行质控。

# 质控，一定要看boxplot
#有几个样本的表达量与其他样本不在同一条水平线上，质控
boxplot(exprSet,las = 2,  cex = 0.3)

image.png

#质控

library(limma)
exprSet <- normalizeBetweenArrays(exprSet)
#View(exprSet)
par(cex = 0.7)
n.sample=ncol(exprSet)
if(n.sample>40) par(cex = 0.5)
cols <- rainbow(n.sample*1.2)
boxplot(exprSet, col = cols,main="expression value",las=2)

image.png

二、ID转换和过滤

通常需要转换ID和过滤某些探针

if(! require("hgu133plus2.db")) install("hgu133plus2.db")
library(hgu133plus2.db)

ls('package:hgu133plus2.db')#查看hugene10sttranscriptcluster.db有多少个对象

##  [1] "hgu133plus2"              "hgu133plus2.db"          
##  [3] "hgu133plus2_dbconn"       "hgu133plus2_dbfile"      
##  [5] "hgu133plus2_dbInfo"       "hgu133plus2_dbschema"    
##  [7] "hgu133plus2ACCNUM"        "hgu133plus2ALIAS2PROBE"  
##  [9] "hgu133plus2CHR"           "hgu133plus2CHRLENGTHS"   
## [11] "hgu133plus2CHRLOC"        "hgu133plus2CHRLOCEND"    
## [13] "hgu133plus2ENSEMBL"       "hgu133plus2ENSEMBL2PROBE"
## [15] "hgu133plus2ENTREZID"      "hgu133plus2ENZYME"       
## [17] "hgu133plus2ENZYME2PROBE"  "hgu133plus2GENENAME"     
## [19] "hgu133plus2GO"            "hgu133plus2GO2ALLPROBES" 
## [21] "hgu133plus2GO2PROBE"      "hgu133plus2MAP"          
## [23] "hgu133plus2MAPCOUNTS"     "hgu133plus2OMIM"         
## [25] "hgu133plus2ORGANISM"      "hgu133plus2ORGPKG"       
## [27] "hgu133plus2PATH"          "hgu133plus2PATH2PROBE"   
## [29] "hgu133plus2PFAM"          "hgu133plus2PMID"         
## [31] "hgu133plus2PMID2PROBE"    "hgu133plus2PROSITE"      
## [33] "hgu133plus2REFSEQ"        "hgu133plus2SYMBOL"       
## [35] "hgu133plus2UNIGENE"       "hgu133plus2UNIPROT"

#找到hugene10sttranscriptclusterSYMBOL，找到对应的个呢名字

ids <- toTable(hgu133plus2SYMBOL)#totable获得对应关系

length(unique(ids$symbol))#长度,unique除去重复后元素个数

## [1] 20183

tail(sort(table(ids$symbol)))#查看重复的基因

## 
##  DNAH1   TCF3  CFLAR   NRP2    HFE YME1L1 
##     13     13     14     14     15     22

table(sort(table(ids$symbol)))#table统计重复的基因

## 
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
## 9426 5151 2841 1383  741  339  168   67   29   17    7    5    5    2    1 
##   22 
##    1

plot(table(sort(table(ids$symbol))))#

image.png

探针对应到基因和过滤表达矩阵

#探针对应到基因

#过滤表达矩阵
head(ids)

##    probe_id symbol
## 1   1053_at   RFC2
## 2    117_at  HSPA6
## 3    121_at   PAX8
## 4 1255_g_at GUCA1A
## 5   1316_at   THRA
## 6   1320_at PTPN21

head(exprSet)

##           GSM1539877 GSM1539878 GSM1539879 GSM1539880 GSM1539881
## 1007_s_at   6.282738   6.212412   5.821629   6.298573   6.055701
## 1053_at     5.408377   5.423271   4.734848   5.192682   5.747881
## 117_at      6.113114   7.211946   5.868391   6.845572   6.434376
## 121_at      6.506860   6.313513   6.273058   5.970227   6.143037
## 1255_g_at   3.998576   4.130220   3.963418   3.921568   4.036102
## 1294_at     6.531638   6.630746   6.092748   6.262294   6.333928
##           GSM1539882 GSM1539883 GSM1539884 GSM1539885 GSM1539886
## 1007_s_at   6.060390   6.210868   6.039242   6.277970   5.765510
## 1053_at     5.387612   5.180355   5.577747   5.348066   5.105622
## 117_at      5.613340   5.658942   5.364068   5.627686   5.619727
## 121_at      6.281549   5.997775   6.572306   6.549663   6.590396
## 1255_g_at   4.039059   3.979454   4.108688   4.153167   4.005298
## 1294_at     6.303366   6.226402   6.603085   6.208452   6.153673
##           GSM1539887 GSM1539888 GSM1539889 GSM1539890 GSM1539891
## 1007_s_at   6.286630   6.156261   6.082300   5.664447   6.876097
## 1053_at     5.856089   5.255968   4.968610   4.889945   5.165363
## 117_at      6.269895   5.546778   5.456274   6.442117   5.650169
## 121_at      6.573122   6.351897   6.526092   6.375800   6.609897
## 1255_g_at   4.073529   4.098591   3.995851   4.140052   3.994319
## 1294_at     6.393489   6.159599   6.438031   6.139996   5.674530
##           GSM1539892 GSM1539893 GSM1539894
## 1007_s_at   6.255433   6.143778   6.619000
## 1053_at     5.297843   5.042678   4.825426
## 117_at      5.546913   5.666350   5.498495
## 121_at      6.432975   6.175153   6.291851
## 1255_g_at   4.228946   4.043580   4.022117
## 1294_at     6.113279   6.579910   6.183669

#rownames(exprSet) #rownames(exprSet)为表达矩阵exprSet的探针;
length(rownames(exprSet))

## [1] 54675

table(rownames(exprSet) %in% ids$probe_id)#x %in% y 的意思是“对x里的每个元素进行判断，判断它是否在y中存在，存在就返回TRUE，不存在就返回FALSE”。

## 
## FALSE  TRUE 
## 12743 41932

#table(rownames(exprSet) %in% ids$probe_id)
exprSet <- exprSet[rownames(exprSet) %in% ids$probe_id,]

dim(exprSet)

## [1] 41932    18

#ids过滤探针
ids <- ids[match(rownames(exprSet),ids$probe_id),]
head(ids)

##    probe_id symbol
## 1   1053_at   RFC2
## 2    117_at  HSPA6
## 3    121_at   PAX8
## 4 1255_g_at GUCA1A
## 5   1316_at   THRA
## 6   1320_at PTPN21

exprSet[1:5,1:5]

##           GSM1539877 GSM1539878 GSM1539879 GSM1539880 GSM1539881
## 1053_at     5.408377   5.423271   4.734848   5.192682   5.747881
## 117_at      6.113114   7.211946   5.868391   6.845572   6.434376
## 121_at      6.506860   6.313513   6.273058   5.970227   6.143037
## 1255_g_at   3.998576   4.130220   3.963418   3.921568   4.036102
## 1316_at     4.834299   4.879104   4.829465   4.881184   4.858719

#合并表达矩阵和ids

merge2 <- function(exprSet, ids){
  tmp <- by(exprSet,
            ids$symbol,
            function(x) rownames(x)[which.max( rowMeans(x))])
  probes <- as.character(tmp)
  print(dim(exprSet))
  exprSet <- exprSet[rownames(exprSet) %in% probes,]

  print(dim(exprSet))
  rownames(exprSet) <- ids[match(rownames(exprSet), ids$probe_id),2]
  return(exprSet)
}

new_exprSet <- merge2(exprSet,ids)

## [1] 41932    18
## [1] 20183    18

#View(new_exprSet)    

#查看某个基因的表达水平
new_exprSet['GAPDH',]

## GSM1539877 GSM1539878 GSM1539879 GSM1539880 GSM1539881 GSM1539882 
##   12.05410   12.21721   11.88043   11.81438   11.80473   12.05410 
## GSM1539883 GSM1539884 GSM1539885 GSM1539886 GSM1539887 GSM1539888 
##   11.88043   12.14874   12.00448   11.59026   12.27825   12.05883 
## GSM1539889 GSM1539890 GSM1539891 GSM1539892 GSM1539893 GSM1539894 
##   12.12765   11.55415   11.68287   12.04985   12.19551   11.80945

boxplot(new_exprSet, las = 2)
#new_exprSet['MALAT1',]
new_exprSet['ACTB',]

## GSM1539877 GSM1539878 GSM1539879 GSM1539880 GSM1539881 GSM1539882 
##   12.53922   12.27825   11.94215   12.19997   11.91843   12.08690 
## GSM1539883 GSM1539884 GSM1539885 GSM1539886 GSM1539887 GSM1539888 
##   12.07694   11.88981   11.67853   11.95695   11.86387   12.29221 
## GSM1539889 GSM1539890 GSM1539891 GSM1539892 GSM1539893 GSM1539894 
##   13.01597   11.65957   12.06532   11.78780   12.33845   11.61137

boxplot(new_exprSet, las = 2) #las=2样本名称为竖排列

image.png

#中间三个存在存在差异
# 从上面的图中可以看到有一个样本的中位数和其他样本有些不在一条水平显示，这个normalizeBetweenArrays函数，
# 可以把他拉回正常水平，normalizeBetweenArrays只能是在同一个数据集里面使用。这一步可以省略。

library(limma)
exprSet <- normalizeBetweenArrays(new_exprSet)
par(cex = 0.7)
n.sample=ncol(exprSet)
if(n.sample>40) par(cex = 0.5)
cols <- rainbow(n.sample*1.2)
boxplot(exprSet, col = cols,main="expression value",las=2)

image.png

三、绘图

转置矩阵，生成s三列列矩阵

#绘图,转置矩阵，生成s三列列矩阵
library("reshape2")

exprSet_L <- melt(exprSet)

colnames(exprSet_L) <- c('probe','sample','value')
head(exprSet_L)

##    probe     sample     value
## 1   PAX8 GSM1539877  6.459490
## 2 CYP2A6 GSM1539877 11.972124
## 3 SCARB1 GSM1539877  9.084149
## 4 TTLL12 GSM1539877  6.098230
## 5  CYTOR GSM1539877  4.608951
## 6 ADAM32 GSM1539877  3.766005

#样本分组信息
head(group_list)

## [1] Steatosis                     Steatosis                    
## [3] Non-alcoholic steatohepatitis Non-alcoholic steatohepatitis
## [5] Non-alcoholic steatohepatitis Non-alcoholic steatohepatitis
## Levels: Healthy Non-alcoholic steatohepatitis Steatosis

exprSet_L$group <- rep(group_list, each=nrow(exprSet))
head(exprSet_L)

##    probe     sample     value     group
## 1   PAX8 GSM1539877  6.459490 Steatosis
## 2 CYP2A6 GSM1539877 11.972124 Steatosis
## 3 SCARB1 GSM1539877  9.084149 Steatosis
## 4 TTLL12 GSM1539877  6.098230 Steatosis
## 5  CYTOR GSM1539877  4.608951 Steatosis
## 6 ADAM32 GSM1539877  3.766005 Steatosis

ggplot2

 library(ggplot2)

  p=ggplot(exprSet_L,aes(x=sample,y=value,fill=group))+
    geom_boxplot()
  print(p)

image.png

  p=ggplot(exprSet_L,aes(x=sample,y=value,fill=group))+
    geom_violin()
  print(p)

image.png

 p=ggplot(exprSet_L,aes(value,fill=group))+
    geom_histogram(bins = 200)+
    facet_wrap(~sample, nrow = 4)
  print(p)

image.png

 p=ggplot(exprSet_L,aes(value,col=group))+geom_density()+
    facet_wrap(~sample, nrow = 4)
  print(p)

image.png

p=ggplot(exprSet_L,aes(value,col=group))+geom_density()
  print(p)

image.png

p=ggplot(exprSet_L,aes(x=sample,y=value,fill=group))+geom_boxplot()
  print(p)

image.png

p=p+stat_summary(fun.y="mean",geom="point",shape=23,size=3,fill="red")
  print(p)

image.png

  p=p+theme_set(theme_set(theme_bw(base_size=20)))
  print(p)

image.png

  p=p+theme(text=element_text(face='bold'),axis.text.x=element_text(angle=30,hjust=1),axis.title=element_blank())
  print(p)

image.png

hclust

  ## hclust
  colnames(exprSet)=paste(group_list,1:ncol(exprSet),sep='_')
  # Define nodePar
  nodePar <- list(lab.cex = 0.4, pch = c(NA, 19),
                  cex = 0.7, col = "blue")
  hc=hclust(dist(t(exprSet)))
  par(mar=c(5,5,5,10))
  png('hclust.tif',res=120)

  plot(as.dendrogram(hc), nodePar = nodePar, horiz = TRUE)
  dev.off()

hclust.jpg

## png 
##   2

PCA

 ## PCA

  library(ggfortify)
  df=as.data.frame(t(exprSet))
  df$group=group_list
  png('pca.png',res=120)

  pp <- autoplot(prcomp( df[,1:(ncol(df)-1)] ), 
           data=df,
           colour = 'group')+
    theme_bw()
  pp
  dev.off()

pca.png

参考：生信技能树Jimmy大神的解读GEO数据存放规律及下载，一文就够

GEO数据分析之数据下载

step1-Load Data and Quality Control

一、数据下载和读入

数据下载和读入，有与下载很慢，直接导入下载好的数据。

(1)提取表达矩阵

(2)提取临床信息

(3)质控

一定要用boxplot看一下各个样本的表达量是不是在同一条水平线上，如果不在同一条平行线上，需要进行质控。

二、ID转换和过滤

通常需要转换ID和过滤某些探针

探针对应到基因和过滤表达矩阵

三、绘图

转置矩阵，生成s三列列矩阵

ggplot2

hclust

PCA

相关文章

网友评论

延伸阅读

深度阅读

栏目导航

热点阅读

转录组数据分析

R作图

RNA-seq分析