美文网首页TCGA
R可视化:基础图形可视化之Distribution(三)

R可视化:基础图形可视化之Distribution(三)

作者: 生信学习者2 | 来源:发表于2021-01-30 11:33 被阅读0次

    图形可视化是数据分析中了解数据分布、波动和相关性等属性必不可少的手段。常用的数据分布可视化图形主要有:小提琴图、核密度曲线图、直方图、箱线图和山脊图等。更多知识分享请到 https://zouhua.top/

    小提琴图Violin

    # Libraries
    library(ggplot2)
    library(dplyr)
    library(hrbrthemes)
    library(viridis)
    
    # Simulated measurements for four groups of very different sizes.
    # Group "B" is assembled from two normal components (means 13 and 18),
    # so its violin shows two modes.
    data <- data.frame(
      name = c(rep("A", 500), rep("B", 500), rep("B", 500), rep("C", 20), rep("D", 100)),
      value = c(
        rnorm(500, 10, 5), rnorm(500, 13, 1), rnorm(500, 18, 1),
        rnorm(20, 25, 4), rnorm(100, 12, 1)
      )
    )

    # Number of observations per group; shown below each group name on the axis
    sample_size <- data %>%
      group_by(name) %>%
      summarize(num = n())

    # Violin plot with a narrow boxplot drawn inside each violin
    data %>%
      # Name the join key explicitly instead of relying on the implicit
      # natural join (which also prints a "Joining, by" message).
      left_join(sample_size, by = "name") %>%
      mutate(myaxis = paste0(name, "\n", "n=", num)) %>%
      ggplot(aes(x = myaxis, y = value, fill = name)) +
        geom_violin(width = 1.4) +
        geom_boxplot(width = 0.1, color = "grey", alpha = 0.2) +
        scale_fill_viridis(discrete = TRUE) +
        theme_ipsum() +
        theme(
          legend.position = "none",
          plot.title = element_text(size = 11)
        ) +
        ggtitle("A Violin wrapping a boxplot") +
        xlab("")
    
    # Libraries
    library(ggplot2)
    library(dplyr)
    library(tidyr)
    library(forcats)
    library(hrbrthemes)
    library(viridis)
    
    # Read the survey table from the local CSV copy (one column per phrase)
    data <- read.table("dataset/viz/probly.csv", header = TRUE, sep = ",")

    # Reshape wide -> long, turn the dots in the column names back into spaces,
    # round the values to whole numbers and keep only the phrases to be plotted
    data <- data %>%
      gather(key = "text", value = "value") %>%
      mutate(
        text = gsub("\\.", " ", text),
        value = round(as.numeric(value), 0)
      ) %>%
      filter(
        text %in% c(
          "Almost Certainly", "Very Good Chance", "We Believe", "Likely",
          "About Even", "Little Chance", "Chances Are Slight", "Almost No Chance"
        )
      )

    # Horizontal violins: the phrases are reordered by their values with
    # fct_reorder() and the axes are swapped with coord_flip()
    p <- ggplot(
      mutate(data, text = fct_reorder(text, value)),
      aes(x = text, y = value, fill = text, color = text)
    ) +
      geom_violin(width = 2.1, size = 0.2) +
      scale_fill_viridis(discrete = TRUE) +
      scale_color_viridis(discrete = TRUE) +
      theme_ipsum() +
      theme(legend.position = "none") +
      coord_flip() +
      xlab("") +
      ylab("Assigned Probability (%)")

    p
    

    核密度图 density chart

    library(ggplot2)
    library(hrbrthemes)
    library(dplyr)
    library(tidyr)
    library(viridis)
    
    # Read the survey table and reshape it to long format: one row per answer,
    # with dots in the phrase names replaced by spaces and values rounded
    data <- read.table("dataset/viz/probly.csv", header = TRUE, sep = ",")
    data <- data %>%
      gather(key = "text", value = "value") %>%
      mutate(
        text = gsub("\\.", " ", text),
        value = round(as.numeric(value), 0)
      )

    # Position of the in-plot text label for each of the four plotted phrases
    annot <- data.frame(
      text = c("Almost No Chance", "About Even", "Probable", "Almost Certainly"),
      x = c(5, 53, 65, 79),
      y = c(0.15, 0.4, 0.06, 0.1)
    )

    # Overlaid density curves, labelled directly on the plot instead of a legend.
    # The rows kept are exactly the phrases listed in `annot`.
    ggplot(
      filter(data, text %in% annot$text),
      aes(x = value, color = text, fill = text)
    ) +
      geom_density(alpha = 0.6) +
      scale_fill_viridis(discrete = TRUE) +
      scale_color_viridis(discrete = TRUE) +
      geom_text(
        data = annot,
        aes(x = x, y = y, label = text, color = text),
        hjust = 0, size = 4.5
      ) +
      theme_ipsum() +
      theme(legend.position = "none") +
      ylab("") +
      xlab("Assigned Probability (%)")
    
    # library
    library(ggplot2)
    library(ggExtra)
     
    # Base scatter plot of mtcars: weight against mileage, with both colour and
    # point size mapped to the cylinder count; the legend is hidden
    p <- ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, color = cyl, size = cyl)) +
      geom_point() +
      theme(legend.position = "none")

    # Marginal histograms; `size = 10` sets the main plot's size relative to
    # the marginal plots
    p1 <- ggMarginal(p, size = 10, type = "histogram")

    # Marginal histograms with a custom fill; 10 bins for the x margin only
    p2 <- ggMarginal(
      p,
      type = "histogram",
      fill = "slateblue",
      xparams = list(bins = 10)
    )

    # Marginal plot on the x axis only (no `type` given, so ggMarginal's
    # default type is used)
    p3 <- ggMarginal(p, margins = "x", color = "purple", size = 4)

    # Arrange the four variants in a 2-column grid labelled A-D
    cowplot::plot_grid(
      p, p1, p2, p3,
      ncol = 2,
      align = "hv",
      labels = LETTERS[1:4]
    )
    

    直方图 histogram

    # library
    library(ggplot2)
    library(dplyr)
    library(hrbrthemes)
    
    # Two normal samples in long format: 1000 draws around 0 ("variable 1")
    # followed by 1000 draws around 4 ("variable 2")
    data <- data.frame(
      type = rep(c("variable 1", "variable 2"), each = 1000),
      value = c(rnorm(1000), rnorm(1000, mean = 4))
    )

    # Semi-transparent histograms drawn on top of each other
    # (position = 'identity' keeps the bars from being stacked)
    p <- ggplot(data, aes(x = value, fill = type)) +
      geom_histogram(color = "#e9ecef", alpha = 0.6, position = 'identity') +
      scale_fill_manual(values = c("#69b3a2", "#404080")) +
      theme_ipsum() +
      labs(fill = "")
    p
    
    # Libraries
    library(ggplot2)
    library(hrbrthemes)
    
    # Two normal samples stored side by side (wide format)
    data <- data.frame(
      var1 = rnorm(1000),
      var2 = rnorm(1000, mean = 2)
    )

    # Mirror density chart: var1 is drawn above the axis, var2 is reflected
    # below it by negating the computed density.
    # - No global x aesthetic is set: the data has no column `x`, and every
    #   layer supplies its own x.
    # - Labels use annotate() so each is drawn once; geom_label() with constant
    #   aesthetics would draw one label per data row.
    p <- ggplot(data) +
      # Top
      geom_density(aes(x = var1, y = after_stat(density)), fill = "#69b3a2") +
      annotate("label", x = 4.5, y = 0.25, label = "variable1", color = "#69b3a2") +
      # Bottom
      geom_density(aes(x = var2, y = -after_stat(density)), fill = "#404080") +
      annotate("label", x = 4.5, y = -0.25, label = "variable2", color = "#404080") +
      theme_ipsum() +
      xlab("value of x")

    # Mirror histogram: same layout, with histograms scaled to density
    p1 <- ggplot(data) +
      # Top
      geom_histogram(aes(x = var1, y = after_stat(density)), fill = "#69b3a2") +
      annotate("label", x = 4.5, y = 0.25, label = "variable1", color = "#69b3a2") +
      # Bottom
      geom_histogram(aes(x = var2, y = -after_stat(density)), fill = "#404080") +
      annotate("label", x = 4.5, y = -0.25, label = "variable2", color = "#404080") +
      theme_ipsum() +
      xlab("value of x")

    # Show both charts side by side, labelled A and B
    cowplot::plot_grid(p, p1, ncol = 2, align = "hv",
                       labels = LETTERS[1:2])
    

    箱线图 boxplot

    # Library
    library(ggplot2)
    library(dplyr)
    library(forcats)
    
    # Example data: eight compass directions, each with one distinct value
    # drawn at random from 1..10
    data <- data.frame(
      name = c(
        "north", "south", "south-east", "north-west",
        "south-west", "north-east", "west", "east"
      ),
      val = sample(1:10, 8)
    )

    # Horizontal bars with the factor levels ordered by `val`
    p1 <- ggplot(
      mutate(data, name = fct_reorder(name, val)),
      aes(x = name, y = val)
    ) +
      geom_bar(stat = "identity", fill = "#f68060", alpha = .6, width = .4) +
      coord_flip() +
      xlab("") +
      theme_bw()

    # Same chart with the level order reversed
    p2 <- ggplot(
      mutate(data, name = fct_reorder(name, desc(val))),
      aes(x = name, y = val)
    ) +
      geom_bar(stat = "identity", fill = "#f68060", alpha = .6, width = .4) +
      coord_flip() +
      xlab("") +
      theme_bw()
    
    # Boxes ordered by the median of `hwy` within each class.
    # The reordered factor is mapped to x directly: wrapping it in
    # reorder(class, hwy) would re-sort the levels by the mean and discard the
    # median ordering.
    p3 <- mpg %>%
      mutate(class = fct_reorder(class, hwy, .fun = median)) %>%
      ggplot(aes(x = class, y = hwy, fill = class)) +
        geom_boxplot() +
        geom_jitter(color = "black", size = 0.4, alpha = 0.9) +
        theme(legend.position = "none") +
        xlab("")

    # Boxes ordered by the number of observations in each class, with the
    # group mean marked by a red point.
    # The point layer comes after geom_boxplot() so the filled boxes do not
    # cover it; `fun` replaces the deprecated `fun.y` argument.
    p4 <- mpg %>%
      mutate(class = fct_reorder(class, hwy, .fun = length)) %>%
      ggplot(aes(x = class, y = hwy, fill = class)) +
        geom_boxplot() +
        stat_summary(fun = mean, geom = "point", shape = 20, size = 6,
                     color = "red", fill = "red") +
        theme(legend.position = "none") +
        xlab("")
    
    # Lollipop chart ordered by value. arrange() only sorts the rows; rebuilding
    # the factor with the sorted names as its levels is what fixes the order on
    # the axis.
    p5 <- data %>%
      arrange(val) %>%
      mutate(name = factor(name, levels = name)) %>%
      ggplot(aes(x = name, y = val)) +
        geom_segment(aes(xend = name, yend = 0)) +
        geom_point(size = 4, color = "orange") +
        coord_flip() +
        theme_bw() +
        xlab("")

    # Lollipop chart with the level order written out by hand
    p6 <- data %>%
      arrange(val) %>%
      mutate(
        name = factor(
          name,
          levels = c(
            "north", "north-east", "east", "south-east",
            "south", "south-west", "west", "north-west"
          )
        )
      ) %>%
      ggplot(aes(x = name, y = val)) +
        geom_segment(aes(xend = name, yend = 0)) +
        geom_point(size = 4, color = "orange") +
        theme_bw() +
        xlab("")

    # Combine the six charts into a 2-column grid labelled A-F
    cowplot::plot_grid(
      p1, p2, p3, p4, p5, p6,
      ncol = 2,
      align = "hv",
      labels = LETTERS[1:6]
    )
    
    library(dplyr)
    # Dummy data: four groups of unequal size (20, 8, 30 and 80 observations)
    names <- c(rep("A", 20), rep("B", 8), rep("C", 30), rep("D", 80))
    value <- c(
      sample(2:5, 20, replace = TRUE), sample(4:10, 8, replace = TRUE),
      sample(1:7, 30, replace = TRUE), sample(3:8, 80, replace = TRUE)
    )
    data <- data.frame(names, value)
    # Grouping column as a factor, so nlevels() and table() below work per group
    data$names <- factor(data$names)

    # Draw the boxplot. The return value is kept in `boundaries`;
    # boundaries$stats holds the whisker/box statistics, one column per group.
    boundaries <- boxplot(data$value ~ data$names, col = "#69b3a2", ylim = c(1, 11))

    # Add the sample size 0.5 units above the last row of boundaries$stats
    # (the upper whisker end) of each box
    nbGroup <- nlevels(data$names)
    text(
      x = seq_len(nbGroup),
      y = boundaries$stats[nrow(boundaries$stats), ] + 0.5,
      labels = paste0("n = ", table(data$names))
    )
    

    山脊图 ridgeline

    # library
    library(ggridges)
    library(ggplot2)
    library(dplyr)
    library(tidyr)
    library(forcats)
    
    # Read the survey table from the local CSV copy and reshape it to long
    # format, keeping only the phrases to be plotted
    data <- read.table("dataset/viz/probly.csv", header = TRUE, sep = ",")
    data <- data %>%
      gather(key = "text", value = "value") %>%
      mutate(text = gsub("\\.", " ", text)) %>%
      mutate(value = round(as.numeric(value), 0)) %>%
      filter(text %in% c("Almost Certainly", "Very Good Chance", "We Believe", "Likely",
                         "About Even", "Little Chance", "Chances Are Slight", "Almost No Chance"))

    # Ridgeline made of binned histograms.
    # The probability values are on the x axis, so the axis title goes in
    # xlab(); the y axis lists the phrases and needs no title.
    p1 <- data %>%
      mutate(text = fct_reorder(text, value)) %>%
      ggplot(aes(y = text, x = value, fill = text)) +
        geom_density_ridges(alpha = 0.6, stat = "binline", bins = 20) +
        theme_ridges() +
        theme(
          legend.position = "none",
          panel.spacing = unit(0.1, "lines"),
          strip.text.x = element_text(size = 8)
        ) +
        xlab("Assigned Probability (%)") +
        ylab("")

    # Ridgeline made of smoothed densities (same axis titles as p1)
    p2 <- data %>%
      mutate(text = fct_reorder(text, value)) %>%
      ggplot(aes(y = text, x = value, fill = text)) +
        geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
        theme_ridges() +
        theme(
          legend.position = "none",
          panel.spacing = unit(0.1, "lines"),
          strip.text.x = element_text(size = 8)
        ) +
        xlab("Assigned Probability (%)") +
        ylab("")

    # Show both ridgeline variants side by side, labelled A and B
    cowplot::plot_grid(p1, p2,
                       ncol = 2, align = "hv",
                       labels = LETTERS[1:2])
    

    参考

    1. The R Graph Gallery

    参考文章如引起任何侵权问题,可以与我联系,谢谢。

    相关文章

      网友评论

        本文标题:R可视化:基础图形可视化之Distribution(三)

        本文链接:https://www.haomeiwen.com/subject/ocbhtltx.html