数据来源:https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE54236
这个数据有161个样本,使用GPL6480平台。
rm(list = ls())
options(stringsAsFactors = F)
#安装GEOquery包:install.packages("GEOquery")
library(GEOquery)
#安装stringr包:install.packages("stringr")
library(stringr)
Sys.setenv("VROOM_CONNECTION_SIZE"=1131072) #调高VROOM_CONNECTION_SIZE,不然下面步骤会报错
gset <- getGEO("GSE54236", destdir=".",AnnotGPL = F,getGPL = F)
a=gset[[1]]
dat=exprs(a)
dim(dat)
#[1] 41000 161
# 检查数据,判断需不需要取log,如果没有几百上千的数值,数值之间相差不大,就不需要取log值了。
dat[1:4,1:4]
# GSM1310570 GSM1310571 GSM1310572 GSM1310573
#A_23_P100001 8.232 8.248 7.576 8.708
#A_23_P100011 5.998 6.079 5.695 6.653
#A_23_P100022 6.107 6.630 5.686 7.886
#A_23_P100056 6.718 7.630 7.410 5.762
pd=pData(a) #取出临床信息
head(pd)[,1:3] # 发现title里面的内容可以用来分组
# title geo_accession status
#GSM1310570 Tumor T_A_001 GSM1310570 Public on Jan 22 2014
#GSM1310571 Tumor T_B_003 GSM1310571 Public on Jan 22 2014
#GSM1310572 Tumor T_C_005 GSM1310572 Public on Jan 22 2014
#GSM1310573 Tumor T_D_007 GSM1310573 Public on Jan 22 2014
#GSM1310574 Tumor T_E_009 GSM1310574 Public on Jan 22 2014
#GSM1310575 Tumor T_F_011 GSM1310575 Public on Jan 22 2014
group_list=ifelse(grepl('Non',pd$title),'Normal','Tumor')
table(group_list)
#group_list
#Normal Tumor
# 80 81
接下来下载GPL6480
gpl = getGEO('GPL6480',destdir = ".")
id_pre = gpl@dataTable@table
colnames(id_pre)
#[1] "ID" "SPOT_ID"
# [3] "CONTROL_TYPE" "REFSEQ"
# [5] "GB_ACC" "GENE"
# [7] "GENE_SYMBOL" "GENE_NAME"
# [9] "UNIGENE_ID" "ENSEMBL_ID"
#[11] "TIGR_ID" "ACCESSION_STRING"
#[13] "CHROMOSOMAL_LOCATION" "CYTOBAND"
#[15] "DESCRIPTION" "GO_ID"
#[17] "SEQUENCE"
ids2 = id_pre[,c("ID","GENE_SYMBOL")] #只需要"ID","GENE_SYMBOL"这两列
colnames(ids2) = c("probe_id","symbol") #给这两列重新取名
head(ids2)
# probe_id symbol
#1 A_23_P100001 FAM174B
#2 A_23_P100011 AP3S2
#3 A_23_P100022 SV2B
#4 A_23_P100056 RBPMS2
#5 A_23_P100074 AVEN
#6 A_23_P100092 ZSCAN29
#接下来,使探针与基因symbol一一对应
ids=as.data.frame(ids2)
table(rownames(dat) %in% ids$probe_id)
# TRUE
#41000
ids=ids[match(rownames(dat),ids$probe_id),] #将dat和ids两个数据框按probe_id对应
table(duplicated(ids$symbol)) #查看有多少个基因名有重复
#FALSE TRUE
#19596 21404
ids=ids[!duplicated(ids$symbol),]
dat=dat[ids$probe_id,]
rownames(dat)=ids$symbol
dat[1:4,1:4]
# GSM1310570 GSM1310571 GSM1310572 GSM1310573
#FAM174B 8.232 8.248 7.576 8.708
#AP3S2 5.998 6.079 5.695 6.653
#SV2B 6.107 6.630 5.686 7.886
#RBPMS2 6.718 7.630 7.410 5.762
save(dat,group_list,file = "GSE54236_download.Rdata") #保存
表达矩阵准备好了,下面就可以开始做后面的分析了。
GEO数据挖掘
GEO数据挖掘(一)数据下载及基因ID转换
网友评论