美文网首页
R作业初级

R作业初级

作者: 熊逸Byron | 来源:发表于2019-06-29 08:50 被阅读0次

    初级题目

    R语言练习题-初级


    正式开始我们的旅程

    library(tidyverse)
    library(ggpubr)
    
    1. 软件安装以及R包安装
      参考:http://www.bio-info-trainee.com/3727.html
    # # 先注释掉,避免在Rmarkdown中运行
    # rm(list = ls()) 
    # options()$repos 
    # options()$BioC_mirror
    # options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/")
    # options("repos" = c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/"))
    # options()$repos 
    # options()$BioC_mirror
    # 
    # # https://bioconductor.org/packages/release/bioc/html/GEOquery.html
    # if (!requireNamespace("BiocManager", quietly = TRUE))
    #  install.packages("BiocManager")
    # BiocManager::install("KEGG.db",ask = F,update = F)
    # BiocManager::install(c("GSEABase","GSVA","clusterProfiler" ),ask = F,update = F)
    # BiocManager::install(c("GEOquery","limma","impute" ),ask = F,update = F)
    # BiocManager::install(c("genefu","org.Hs.eg.db","hgu133plus2.db" ),ask = F,update = F)
    # 
    # # 下面代码被我注释了,意思是这些代码不需要运行,因为它过时了,很多旧教程就忽略
    # # source("https://bioconductor.org/biocLite.R") 
    # # library('BiocInstaller') 
    # # options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/") 
    # # BiocInstaller::biocLite("GEOquery")
    # # BiocInstaller::biocLite(c("limma"))
    # # BiocInstaller::biocLite(c("impute"))
    # 
    # options()$repos
    # install.packages('WGCNA')
    # install.packages(c("FactoMineR", "factoextra"))
    # install.packages(c("ggplot2", "pheatmap","ggpubr"))
    # library("FactoMineR")
    # library("factoextra")
    # 
    # library(GSEABase)
    # library(GSVA)
    # library(clusterProfiler)
    # library(genefu)
    # library(ggplot2)
    # library(ggpubr)
    # library(hgu133plus2.db)
    # library(limma)
    # library(org.Hs.eg.db)
    # library(pheatmap)
    
    1. 打开 Rstudio 告诉我它的工作目录
    getwd()
    

    3.新建6个向量,基于不同的原子类型。(重点是字符串,数值,逻辑值)

    a1 <- 1:10
    a1
    class(a1)
    a2 <- c('hello','the','world')
    class(a2)
    a3 <- c(T,T,T,F,F,T,T,F)
    a3
    class(a3)
    a4 <- 607L
    a4
    class(a4)
    a5 <- seq(from = 0, to = 20, by =2)
    a5
    class(a5)
    a6 <- c(13.14,14)
    a6
    class(a6)
    

    4.新建一些数据结构,比如矩阵,数组,数据框,列表等重点是数据框,矩阵)

    c <- c(1,2,3)
    list <- list(c(1,2,3),'3,14')
    M = matrix( c('a','a','b','c','b','a'), nrow = 2, ncol = 3, byrow = TRUE)
    a <- array(c('a','b'),dim = c(3,3,2))
    f <- factor(c)
    df <- data.frame(gene=paste0("gene",1:15),
                     s1=rnorm(n=15),s2=rnorm(n=15),s3=rnorm(n=15),s4=rnorm(n=15),s5=rnorm(n=15))
    # 取df的第1,3行,取第4,6列
    df[c(1,3),]
    df[,c(4,6)]
    

    5.使用data函数来加载R内置数据集 rivers,其他数据集:

    data("rivers")
    #Lengths of Major North American Rivers
    
    head(rivers)
    tail(rivers)
    
    # 数据集的长
    length(rivers)
    # structure 显示对象内部结构
    str(rivers) 
    # 获取描述性统计量(最小值/最大值/四分位数/数值型变量/因子向量/逻辑型向量)
    summary(rivers) 
    rm(list = ls())
    
    1. 下载 https://www.ncbi.nlm.nih.gov/sra?term=SRP133642 里面的 RunInfo Table 文件读入到R里面,了解这个数据框,多少列,每一列都是什么属性的元素
      打开链接https://www.ncbi.nlm.nih.gov/sra?term=SRP133642
      点击Send results to Run selector链接
      点击RunInfo Table按钮即可下载RunInfo Table文件
    rm(list = ls())
    options(stringsAsFactors = F)
    Table <- read.table(file = "SraRunTable.txt",header = T,sep = '\t')
    class(Table)
    dim(Table) # 查看data.frame维度
    ncol(Table) #查看data.frame列数
    str(Table)
    
    1. 下载 https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE111229 里面的样本信息sample.csv读入到R里面,了解这个数据框,多少列,每一列都是什么属性的元素
      7.1 打开GEO主页https://www.ncbi.nlm.nih.gov/geo/,点击Samples链接
      7.2 在搜索栏中输入GSE111229,点击Search按钮,然后点击Export按钮
      7.3 在弹出的对话框点击Export按钮,得到sample.csv
    # 读取样本信息
    sample <-read.csv("sample.csv")
    colnames(sample)
    dim(Table)
    ## [1] 768  31
    dim(sample)
    ## [1] 768  12
    d = merge(Table,sample,by.x = "Sample_Name",by.y = "Accession")
    # merge() 函数 此时Table的Sample_Name列和sample的Accession列内容相同,合并这一列
    dim(d)
    library(tidyverse)
    d = sample %>% 
      dplyr::rename("Sample_Name" = "Accession") %>%
      left_join(Table,by = "Sample_Name")
    dim(d)
    
    1. 对前面读取的 RunInfo Table 文件在R里面探索其MBases列,包括 箱线图(boxplot)和五分位数(fivenum),还有频数图(hist),以及密度图(density)
    # Mbases样本的碱基数
    # 举例:
    boxplot(Table$MBases, main = "boxplot of MBases")
    plot(fivenum(Table$MBases), main = "fivenum of MBases")
    hist(Table$MBases, main = "hist of MBases")
    plot(density(Table$MBases,na.rm=T), main = "density of MBases")
    
    
    1. 把前面读取的样本信息表格的样本名字根据下划线分割看第3列元素的统计情况。第三列代表该样本所在的plate
    title = sample$Title
    plate = unlist(lapply(title,function(x){stringr::str_split(x,"_")[[1]][3]}))
    table(plate) 
    
    1. 根据plate把关联到的 RunInfo Table 信息的MBases列分组检验是否有统计学显著的差异
    # plate 指384孔PCR板,编号分别是48号和49号
    
    t.test(Table$MBases~plate)
    
    1. 分组绘制箱线图(boxplot),频数图(hist),以及密度图(density)
    boxplot(d$MBases~plate)
    typeof(plate)
    e = d[,c("MBases","Title")]
    e$plate = plate
    hist(e$MBases,freq = F, breaks = "sturges")
    plot(density(e$MBases,na.rm=T))
    # 比较简陋,可以尝试用ggplot2和ggpubr画图包
    
    1. 使用ggplot2把上面的图进行重新绘制
    suppressMessages(library(ggplot2))
    e$plate = plate
    e$num=c(1:768)
    colnames(e)
    # 箱线图
    ggplot(e,aes(x=plate,y=MBases)) + geom_boxplot()   
    # 频数图
    ggplot(e,aes(x=MBases)) + geom_histogram(fill="lightblue",colour="grey") + facet_grid(plate ~ .) # 频数图
    ggplot(e,aes(x=MBases,fill=plate))+geom_histogram()
    # 密度图
    ggplot(e,aes(y=MBases,x=num)) + geom_point() + stat_density2d(aes(alpha=..density..),geom = "raster",contour = F)+ facet_grid(plate ~ .)  
    ggplot(e,aes(x=MBases,fill=plate))+geom_density()
    
    1. 使用ggpubr把上面的图进行重新绘制
    suppressMessages(library(ggpubr))
    ggboxplot(e, x="plate", y="MBases", color = "plate", palette = "aaas",add = "jitter") + stat_compare_means(method = "t.test")
    gghistogram(e, x="MBases", fill = "plate",palette = c("#f4424e", "#41a6f4"))
    ggdensity(e, x="MBases", fill = "plate", color = "plate", add = "mean",palette = c("#f4424e", "#41a6f4"))
    
    1. 随机取384个MBases信息,跟前面的两个plate的信息组合成新的数据框,第一列是分组,第二列是MBases,总共是384*3行数据
    # sample() 函数 随机抽样
    data <- e[sample(nrow(e),384),][,c(3,1,2)]
    str(data)
    

    感谢

    https://www.jianshu.com/p/c07e67e2c757
    https://www.jianshu.com/p/2e5a5192f219
    欢迎关注生信菜鸟团、生信技能树!!!

    相关文章

      网友评论

          本文标题:R作业初级

          本文链接:https://www.haomeiwen.com/subject/cnsucctx.html