作业 1
根据R包org.Hs.eg.db找到下面ensembl 基因ID 对应的基因名(symbol)
ENSG00000000003.13
ENSG00000000005.5
ENSG00000000419.11
ENSG00000000457.12
ENSG00000000460.15
ENSG00000000938.11
# copy ensembl基因ID 保存为 e1.txt
# rm(list = ls())
# options(stringsAsFactors = F) # 最好先run这两行
> content.e1 <- read.table('e1.txt')
> library(org.Hs.eg.db)
> ?org.Hs.eg.db
> ls("package:org.Hs.eg.db")
[1] "org.Hs.eg" "org.Hs.eg.db" "org.Hs.eg_dbconn" "org.Hs.eg_dbfile"
[5] "org.Hs.eg_dbInfo" "org.Hs.eg_dbschema" "org.Hs.egACCNUM" "org.Hs.egACCNUM2EG"
[9] "org.Hs.egALIAS2EG" "org.Hs.egCHR" "org.Hs.egCHRLENGTHS" "org.Hs.egCHRLOC"
[13] "org.Hs.egCHRLOCEND" "org.Hs.egENSEMBL" "org.Hs.egENSEMBL2EG" "org.Hs.egENSEMBLPROT"
[17] "org.Hs.egENSEMBLPROT2EG" "org.Hs.egENSEMBLTRANS" "org.Hs.egENSEMBLTRANS2EG" "org.Hs.egENZYME"
[21] "org.Hs.egENZYME2EG" "org.Hs.egGENENAME" "org.Hs.egGO" "org.Hs.egGO2ALLEGS"
[25] "org.Hs.egGO2EG" "org.Hs.egMAP" "org.Hs.egMAP2EG" "org.Hs.egMAPCOUNTS"
[29] "org.Hs.egOMIM" "org.Hs.egOMIM2EG" "org.Hs.egORGANISM" "org.Hs.egPATH"
[33] "org.Hs.egPATH2EG" "org.Hs.egPFAM" "org.Hs.egPMID" "org.Hs.egPMID2EG"
[37] "org.Hs.egPROSITE" "org.Hs.egREFSEQ" "org.Hs.egREFSEQ2EG" "org.Hs.egSYMBOL"
[41] "org.Hs.egSYMBOL2EG" "org.Hs.egUCSCKG" "org.Hs.egUNIGENE" "org.Hs.egUNIGENE2EG"
[45] "org.Hs.egUNIPROT"
> g2s <- toTable(org.Hs.egSYMBOL)# id转symbol
> head(g2s)
gene_id symbol
1 1 A1BG
2 2 A2M
3 3 A2MP1
4 9 NAT1
5 10 NAT2
6 11 NATP
> print(content.e1)# 看一眼e1.txt的内容,确认是ensemble id
V1
1 ENSG00000000003.13
2 ENSG00000000005.5
3 ENSG00000000419.11
4 ENSG00000000457.12
5 ENSG00000000460.15
6 ENSG00000000938.11
> g2e <- toTable(org.Hs.egENSEMBL)# id转ensemble
> head(g2e)
gene_id ensembl_id
1 1 ENSG00000121410
2 2 ENSG00000175899
3 3 ENSG00000256069
4 9 ENSG00000171428
5 10 ENSG00000156006
6 12 ENSG00000196136
> fetch <- function(x) #设函数,以.为分割符将x这个list的第1元素下面的第1元素分割
+ {
+ strsplit(as.character(x), '[.]')[[1]][1]
+ }
> lapply(content.e1$V1, fetch )# fetch对content.e1$V1里的每一个元素做同样的事
[[1]]
[1] "ENSG00000000003"
[[2]]
[1] "ENSG00000000005"
[[3]]
[1] "ENSG00000000419"
[[4]]
[1] "ENSG00000000457"
[[5]]
[1] "ENSG00000000460"
[[6]]
[1] "ENSG00000000938"
> trans <- unlist(lapply(content.e1$V1, fetch ))
> print(trans)
[1] "ENSG00000000003" "ENSG00000000005" "ENSG00000000419" "ENSG00000000457" "ENSG00000000460" "ENSG00000000938"
> content.e1$ensembl_id <- trans # conten.e1增加ensembl_id字段
> print(content.e1)
V1 ensembl_id
1 ENSG00000000003.13 ENSG00000000003
2 ENSG00000000005.5 ENSG00000000005
3 ENSG00000000419.11 ENSG00000000419
4 ENSG00000000457.12 ENSG00000000457
5 ENSG00000000460.15 ENSG00000000460
6 ENSG00000000938.11 ENSG00000000938
> head(g2e) # 再看一眼g2e
gene_id ensembl_id
1 1 ENSG00000121410
2 2 ENSG00000175899
3 3 ENSG00000256069
4 9 ENSG00000171428
5 10 ENSG00000156006
6 12 ENSG00000196136
> tmp <- merge(content.e1, g2e, by = 'ensembl_id') # 不用by也可以,此时是取交集
> print(tmp)
ensembl_id V1 gene_id
1 ENSG00000000003 ENSG00000000003.13 7105
2 ENSG00000000005 ENSG00000000005.5 64102
3 ENSG00000000419 ENSG00000000419.11 8813
4 ENSG00000000457 ENSG00000000457.12 57147
5 ENSG00000000460 ENSG00000000460.15 55732
6 ENSG00000000938 ENSG00000000938.11 2268
> print(head(g2s)) # 再看一眼g2s
gene_id symbol
1 1 A1BG
2 2 A2M
3 3 A2MP1
4 9 NAT1
5 10 NAT2
6 11 NATP
> tmp.end <- merge(tmp, g2s) # 注意,没加by
> print(tmp.end)
gene_id ensembl_id V1 symbol
1 2268 ENSG00000000938 ENSG00000000938.11 FGR
2 55732 ENSG00000000460 ENSG00000000460.15 C1orf112
3 57147 ENSG00000000457 ENSG00000000457.12 SCYL3
4 64102 ENSG00000000005 ENSG00000000005.5 TNMD
5 7105 ENSG00000000003 ENSG00000000003.13 TSPAN6
6 8813 ENSG00000000419 ENSG00000000419.11 DPM1
######原代码如下,太浓缩,不好理解#####
a=read.table('e1.txt')
head(a)
library(org.Hs.eg.db)
ls("package:org.Hs.eg.db")
g2s=toTable(org.Hs.egSYMBOL);head(g2s)
g2e=toTable(org.Hs.egENSEMBL);head(g2e)
head(g2e)
library(stringr)
a$ensembl_id=unlist(lapply(a$V1,function(x){
strsplit(as.character(x),'[.]')[[1]][1]
})
)
tmp=merge(a,g2e,by='ensembl_id')
tmp=merge(tmp,g2s,by='gene_id')
######修改后,我觉得好理解一些#############
content.e1 <- read.table('e1.txt')
library(org.Hs.eg.db)
?org.Hs.eg.db
ls("package:org.Hs.eg.db")
g2s <- toTable(org.Hs.egSYMBOL)# id转symbol
head(g2s)
print(content.e1)# 看一眼e1.txt的内容,确认是ensemble id
g2e <- toTable(org.Hs.egENSEMBL)# id转ensemble
head(g2e)
fetch <- function(x) #设函数,以.为分割符将x这个list的第1元素下面的第1元素分割
{
strsplit(as.character(x), '[.]')[[1]][1]
}
lapply(content.e1$V1, fetch )# fetch对content.e1$V1里的每一个元素做同样的事
trans <- unlist(lapply(content.e1$V1, fetch ))
print(trans)
content.e1$ensembl_id <- trans # conten.e1增加ensembl_id字段
print(content.e1)
head(g2e) # 再看一眼g2e
tmp <- merge(content.e1, g2e, by = 'ensembl_id') # 不用by也可以,此时是取交集
print(tmp)
print(head(g2s)) # 再看一眼g2s
tmp.end <- merge(tmp, g2s) # 注意,没加by
print(tmp.end)
作业 2
#根据R包hgu133a.db找到下面探针对应的基因名(symbol)
1053_at
117_at
121_at
1255_g_at
1316_at
1320_at
1405_i_at
1431_at
1438_at
1487_at
1494_f_at
1598_g_at
160020_at
1729_at
177_at
#########
> rm(list = ls())
> options(stringsAsFactors = F)
> content.e2 <- read.table('e2.txt')
> print(content.e2)
V1
1 1053_at
2 117_at
3 121_at
4 1255_g_at
5 1316_at
6 1320_at
7 1405_i_at
8 1431_at
9 1438_at
10 1487_at
11 1494_f_at
12 1598_g_at
13 160020_at
14 1729_at
15 177_at
>
> library(hgu133a.db)
> ids <- toTable(hgu133aSYMBOL)
> head(ids)
probe_id symbol
1 1053_at RFC2
2 117_at HSPA6
3 121_at PAX8
4 1255_g_at GUCA1A
5 1316_at THRA
6 1320_at PTPN21
>
> print(content.e2)
V1
1 1053_at
2 117_at
3 121_at
4 1255_g_at
5 1316_at
6 1320_at
7 1405_i_at
8 1431_at
9 1438_at
10 1487_at
11 1494_f_at
12 1598_g_at
13 160020_at
14 1729_at
15 177_at
>
> colnames(content.e2) <- "probe_id"
> print(content.e2)
probe_id
1 1053_at
2 117_at
3 121_at
4 1255_g_at
5 1316_at
6 1320_at
7 1405_i_at
8 1431_at
9 1438_at
10 1487_at
11 1494_f_at
12 1598_g_at
13 160020_at
14 1729_at
15 177_at
>
> tmp <- merge(ids, content.e2)
> print(tmp)
probe_id symbol
1 1053_at RFC2
2 117_at HSPA6
3 121_at PAX8
4 1255_g_at GUCA1A
5 1316_at THRA
6 1320_at PTPN21
7 1405_i_at CCL5
8 1431_at CYP2E1
9 1438_at EPHB3
10 1487_at ESRRA
11 1494_f_at CYP2A6
12 1598_g_at GAS6
13 160020_at MMP14
14 1729_at TRADD
15 177_at PLD1
作业 3
###找到R包CLL内置的数据集的表达矩阵里面的TP53基因的表达量,并且绘制在 progres.-stable分组的boxplot图
> rm(list = ls())
> options(stringsAsFactors = F)
> suppressPackageStartupMessages(library(CLL))
> data(sCLLex) # 官网读包的说明书
> sCLLex
ExpressionSet (storageMode: lockedEnvironment)
assayData: 12625 features, 22 samples
element names: exprs
protocolData: none
phenoData
sampleNames: CLL11.CEL CLL12.CEL ... CLL9.CEL (22 total)
varLabels: SampleID Disease
varMetadata: labelDescription
featureData: none
experimentData: use 'experimentData(object)'
Annotation: hgu95av2
>
> exprSet <- exprs(sCLLex)
> pd <- pData(sCLLex) # progres.-stable分组信息,在Rstudio右上角Environment中点pd看分组信息
> library(hgu95av2.db)
> ids <- toTable(hgu95av2SYMBOL) # 在Rstudio右上角Environment中点击ids变量
> #然后在左上边窗口的搜索框中输入TP53,取当中对应的探针,例如1939_at
>
> expr.1939 <- exprSet['1939_at', ] #看图表达量差异最大,通常保留,好说明问题
> expr.1974 <- exprSet['1974_s_at', ]
> expr.31618 <- exprSet['31618_at', ]
> # 变量对分组
> boxplot(expr.1939 ~ pd$Disease)
> boxplot(expr.1974 ~ pd$Disease)
> boxplot(expr.31618 ~ pd$Disease)
网友评论