1.R脚本
rm(list = ls())
#gc()
options(stringsAsFactors = F)
library(data.table)
library(tidyr)
library(dplyr)
library(pwr)
library(ggplot2)
library(ggpubr)
#get the working directory
getwd()
#新建6个向量,基于不同的“原子类型”
v1 <- letters[1:6];class(v1)
v2 <- c(1.2,3.4,5,6.7);class(v2)
v3 <- 1L:6L;class(v3)
v4 <- c(T,F,T,T,F,F);class(v4)
v5 <- 2i + 1;class(v5)
v6 <- charToRaw("Biotrainee");class(v6)
#新建一些数据结构,数据框data1,矩阵data2,三维数组data3,列表data4
data1 <- data.frame(c1 = paste0("gene",1:4),
c2 = 1:4,
c3 = 2:5,
c4 = 3:6,
row.names = paste0("r",1:4));data1
class(colnames(data1))
data2 <- matrix(seq(24,2,-2),nrow = 4,dimnames = list(paste0("r",seq(4)),paste0("c",seq(3))));data2
data3 <- array(1:12,c(2,3,2));data3
data4 <- list(data1,data2,data3);data4
#get the subset of data frame & matrix
(sub1_data1 <- data1[1:2,])
(sub2_data1 <- data1[,2:3])
(sub1_data2 <- data2[1:2,])
(sub2_data2 <- data2[,2:3])
#加载内置数据集`rivers`并加以描述
d <- rivers;head(d,20)
class(d)
length(d)
#统计描述,极值、中位数、四分位数等信息
(summary(d))
#GSE111229
name_folder <- "RunInfoTable"
name_fold <- "SraRunTable.txt"
runinfo <- data.frame(fread(paste(name_folder,name_fold,sep = "/")))
dim(runinfo);colnames(runinfo)
#The class of each column in `runinfo`
sapply(runinfo,class)
#Read `sample`
sample <- data.frame(fread("sample.csv"))
dim(sample);colnames(sample)
#The class of each column in `sample`
sapply(sample,class)
#merge
merge_GSE111229 <- merge(runinfo,sample,by.x = "Sample_Name",by.y = "Accession")
#########################################
##############Visualization##############
#########################################
mbs <- runinfo$MBases
#Boxplot of MBases
boxplot(mbs)
#fivenum of MBases
fivenum(mbs)
#Hist of MBases
hist(mbs,col = "orange",ylab = "freq of MBases",
main = "MBases of RunInfoTable--GSE111229")
#Density of MBases
density(mbs)
plot(density(mbs))
######################################
###############分组统计################
######################################
#merge_GSE111229内拆分
sep_merge <- separate(merge_GSE111229,Title,c("sep1","sep2","plate","sep3"),sep = "_",remove = F)
select_merge <- select(sep_merge,c("MBases","plate"))#简化数据框,只取两列
colnames(select_merge)
#查看分组
table(sep_merge$plate)##根据plate,分为两组,数量相同,各占一半
#显著性水平为0.05,以plate分组,检验MBases是否有显著差异
t.test(select_merge$MBases~select_merge$plate)
#结论:p=0.02<0.05,拒绝原假设,即MBases两组数据均值有显著差异
###############分组作图################
boxplot(select_merge$MBases~select_merge$plate)
#ggplot2
ggplot(select_merge,aes(x=plate,y=MBases))+geom_boxplot()
#ggpubr
ggboxplot(select_merge, x = "plate", y = "MBases", color = "plate", palette = "jco", add = "jitter")
#随机取384条
samp_merge <- sample_frac(select_merge,0.5)
#与前面plate合并成新数据框
comb_sample <- rbind(samp_merge,select_merge);nrow(comb_sample)
2.运行结果及解析
运行结果-part1
> rm(list = ls())
> #gc()
> options(stringsAsFactors = F)
> library(data.table)
> library(tidyr)
> library(dplyr)
> library(pwr)
> library(ggplot2)
> library(ggpubr)
> #get the working directory
> getwd()
[1] "E:/mingliya/Biotrainee/R_basics"
> #新建6个向量,基于不同的“原子类型”
> v1 <- letters[1:6];class(v1)
[1] "character"
> v2 <- c(1.2,3.4,5,6.7);class(v2)
[1] "numeric"
> v3 <- 1L:6L;class(v3)
[1] "integer"
> v4 <- c(T,F,T,T,F,F);class(v4)
[1] "logical"
> v5 <- 2i + 1;class(v5)
[1] "complex"
> v6 <- charToRaw("Biotrainee");class(v6)
[1] "raw"
> #新建一些数据结构,数据框data1,矩阵data2,三维数组data3,列表data4
> data1 <- data.frame(c1 = paste0("gene",1:4),
+ c2 = 1:4,
+ c3 = 2:5,
+ c4 = 3:6,
+ row.names = paste0("r",1:4));data1
c1 c2 c3 c4
r1 gene1 1 2 3
r2 gene2 2 3 4
r3 gene3 3 4 5
r4 gene4 4 5 6
> class(colnames(data1))
[1] "character"
> data2 <- matrix(seq(24,2,-2),nrow = 4,dimnames = list(paste0("r",seq(4)),paste0("c",seq(3))));data2
c1 c2 c3
r1 24 16 8
r2 22 14 6
r3 20 12 4
r4 18 10 2
> data3 <- array(1:12,c(2,3,2));data3
, , 1
[,1] [,2] [,3]
[1,] 1 3 5
[2,] 2 4 6
, , 2
[,1] [,2] [,3]
[1,] 7 9 11
[2,] 8 10 12
> data4 <- list(data1,data2,data3);data4
[[1]]
c1 c2 c3 c4
r1 gene1 1 2 3
r2 gene2 2 3 4
r3 gene3 3 4 5
r4 gene4 4 5 6
[[2]]
c1 c2 c3
r1 24 16 8
r2 22 14 6
r3 20 12 4
r4 18 10 2
[[3]]
, , 1
[,1] [,2] [,3]
[1,] 1 3 5
[2,] 2 4 6
, , 2
[,1] [,2] [,3]
[1,] 7 9 11
[2,] 8 10 12
> #get the subset of data frame & matrix
> (sub1_data1 <- data1[1:2,])
c1 c2 c3 c4
r1 gene1 1 2 3
r2 gene2 2 3 4
> (sub2_data1 <- data1[,2:3])
c2 c3
r1 1 2
r2 2 3
r3 3 4
r4 4 5
> (sub1_data2 <- data2[1:2,])
c1 c2 c3
r1 24 16 8
r2 22 14 6
> (sub2_data2 <- data2[,2:3])
c2 c3
r1 16 8
r2 14 6
r3 12 4
r4 10 2
运行结果--part2
#加载内置数据集`rivers`并加以描述
> d <- rivers;head(d,20)
[1] 735 320 325 392 524 450 1459 135 465 600 330 336 280 315 870 906 202 329 290 1000
> class(d)
[1] "numeric"
> is.vector(d)
[1] TRUE
> length(d)
[1] 141
> #统计描述,极值、中位数、四分位数等信息
> (summary(d))
Min. 1st Qu. Median Mean 3rd Qu. Max.
135.0 310.0 425.0 591.2 680.0 3710.0
> #GSE111229
> name_folder <- "RunInfoTable"
> name_fold <- "SraRunTable.txt"
> runinfo <- data.frame(fread(paste(name_folder,name_fold,sep = "/")))
> dim(runinfo)
[1] 768 31
> #The class of each column in `runinfo`
> sapply(runinfo,class)
BioSample Experiment MBases MBytes Run
"character" "character" "integer" "integer" "character"
SRA_Sample Sample_Name Assay_Type AssemblyName AvgSpotLen
"character" "character" "character" "character" "integer"
BioProject Center_Name Consent DATASTORE_filetype DATASTORE_provider
"character" "character" "character" "character" "character"
InsertSize Instrument LibraryLayout LibrarySelection LibrarySource
"integer" "character" "character" "character" "character"
LoadDate Organism Platform ReleaseDate SRA_Study
"character" "character" "character" "character" "character"
age cell_type marker_genes source_name strain
"character" "character" "character" "character" "character"
tissue
"character"
> #Read `sample`
> sample <- data.frame(fread("sample.csv"))
> dim(sample)
[1] 768 12
> #The class of each column in `sample`
> sapply(sample,class)
Accession Title Sample.Type Taxonomy Channels
"character" "character" "character" "character" "integer"
Platform Series Supplementary.Types Supplementary.Links SRA.Accession
"character" "character" "character" "character" "character"
Contact Release.Date
"character" "character"
> #######merge#######
> merge_GSE111229 <- merge(runinfo,sample,by.x = "Sample_Name",by.y = "Accession")
运行结果--part3
> ###############Visualization############
> mbs <- runinfo$MBases
> #Boxplot of MBases
> boxplot(mbs)
> #fivenum of MBases
> fivenum(mbs)
[1] 0 8 12 16 74
> #Hist of MBases
> hist(mbs,col = "orange",ylab = "freq of MBases",
+ main = "MBases of RunInfoTable--GSE111229")
> #Density of MBases
> density(mbs)
Call:
density.default(x = mbs)
Data: mbs (768 obs.); Bandwidth 'bw' = 1.423
x y
Min. :-4.269 Min. :0.0000000
1st Qu.:16.366 1st Qu.:0.0000353
Median :37.000 Median :0.0003001
Mean :37.000 Mean :0.0121039
3rd Qu.:57.634 3rd Qu.:0.0142453
Max. :78.269 Max. :0.0665647
> plot(density(mbs))
> ###############分组统计################
> #merge_GSE111229内拆分
> sep_merge <- separate(merge_GSE111229,Title,c("sep1","sep2","plate","sep3"),sep = "_",remove = F)
> select_merge <- select(sep_merge,c("MBases","plate"))#简化数据框,只取两列
> colnames(select_merge)
[1] "MBases" "plate"
> #查看分组
> table(sep_merge$plate)##根据plate,分为两组,数量相同,各占一半
0048 0049
384 384
> #显著性水平为0.05,以plate分组,检验MBases是否有显著差异
> t.test(select_merge$MBases~select_merge$plate)
Welch Two Sample t-test
data: select_merge$MBases by select_merge$plate
t = 2.3019, df = 728.18, p-value = 0.02162
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
0.1574805 1.9831445
sample estimates:
mean in group 0048 mean in group 0049
13.08854 12.01823
> #结论:p=0.02<0.05,拒绝原假设,即MBases两组数据均值有显著差异
> ###############分组作图################
> boxplot(select_merge$MBases~select_merge$plate)
> #ggplot2
> ggplot(select_merge,aes(x=plate,y=MBases))+geom_boxplot()
> #ggpubr
> ggboxplot(select_merge, x = "plate", y = "MBases", color = "plate", palette = "jco", add = "jitter")
> #随机取384条
> samp_merge <- sample_frac(select_merge,0.5)
> #与前面plate合并成新数据框
> comb_sample <- rbind(samp_merge,select_merge);nrow(comb_sample)
[1] 1152
p.s.如有误望大家不吝赐教,提出错误和建议。作者:Leah 如需转载,请注明出处。
网友评论