本节我们使用TidyTuesday 2021-02-02发布的项目数据集来进行数据分析;该数据集是关于历史悠久的黑人大学(HBCU)的入学情况,通过绘制经典哑铃图来进行数据可视化;各位看官老爷细细品味,喜欢请关注个人公众号R语言数据分析指南持续分享更多优质资源
点击原文链接获取完整代码
加载R包
library(tidyverse)
library(ggtext)
library(tidytuesdayR)
下载数据集
tidytuesdayR包可以轻松的访问每周的TidyTuesday项目数据集
rm(list=ls())
tuesdata <- tidytuesdayR::tt_load('2021-02-02')
hbcu_all <- tuesdata$hbcu_all
# A tibble: 6 x 12
Year `Total enrollme… Males Females `4-year` `2-year`
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1976 222613 104669 117944 206676 15937
2 1980 233557 106387 127170 218009 15548
3 1982 228371 104897 123474 212017 16354
4 1984 227519 102823 124696 212844 14675
5 1986 223275 97523 125752 207231 16044
6 1988 239755 100561 139194 223250 16505
# … with 6 more variables: `Total - Public` <dbl>, `4-year -
# Public` <dbl>, `2-year - Public` <dbl>, `Total - Private` <dbl>,
# `4-year - Private` <dbl>, `2-year - Private` <dbl>
我们可以看到不同年度的入学人数及其类别
数据清洗
重点研究了从1990年开始的入学情况。使用该dplyr包进行数据处理,得到的数据集中包含每年男性和女性入学数据以及两者之间的数字差异
dat_gender <- hbcu_all %>%
filter(Year >= 1990) %>%
select(Year, Males, Females) %>%
mutate(diff = Females - Males) %>%
pivot_longer(cols = c(Males, Females)) %>%
rename(Gender = name,Enrollments = value)
head(dat_gender)
Year diff Gender Enrollments
<dbl> <dbl> <chr> <dbl>
1 1990 46838 Males 105157
2 1990 46838 Females 151995
3 1991 48451 Males 110442
4 1991 48451 Females 158893
为了稍后绘制两组之间的范围,我们需要两个小标题,每个小标题仅包含1个性别的数据
# A tibble: 6 x 4
Year diff Gender Enrollments
<dbl> <dbl> <chr> <dbl>
1 1990 46838 Females 151995
2 1991 48451 Females 158893
3 1992 50297 Females 164919
4 1993 50062 Females 166459
5 1994 52059 Females 166065
6 1995 53451 Females 166088
绘制基础哑铃图
p <- ggplot(dat_gender)+
geom_segment(data = Males,aes(x = Enrollments, y = Year,
yend = Females$Year,
xend = Females$Enrollments),
color = "#aeb6bf",size = 4.5,alpha = .5) +
geom_point(aes(x = Enrollments, y = Year, color = Gender),
size = 4, show.legend = TRUE)
p
数据清洗II
为了使图更加美观我们需要在行上添加标签,并为每组的均值和标准差添加阴影。因此需要进一步整理数据以计算所有组在所有年份中的平均+/- 标准偏差
dat_gender %>%
group_by(Gender) %>%
summarise(mean = mean(Enrollments),
SE = sd(Enrollments)) %>%
mutate(meanpos = mean + 1 *SE,
meanneg = mean - 1 *SE)-> stats
stats_males <- stats %>%
filter(Gender == "Males")
stats_females <- stats %>%
filter(Gender == "Females")
head(stats)
## # A tibble: 2 x 5
## Gender mean SE meanpos meanneg
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Females 177866. 13908. 191774. 163958.
## 2 Males 115789. 6065. 121854. 109724.
至于范围之间的标签,添加一个小标题,以保持每年各组之间的差异,并在列中指示标签的位置
diff <- dat_gender %>%
filter(Gender == "Males") %>%
mutate(x_pos = Enrollments + (diff/2))
head(diff)
## # A tibble: 6 x 5
## Year diff Gender Enrollments x_pos
## <dbl> <dbl> <chr> <dbl> <dbl>
## 1 1990 46838 Males 105157 128576
## 2 1991 48451 Males 110442 134668.
在点范围上添加标签
p_labelled <- p + geom_text(data = diff,
aes(label = paste("D: ",diff), x = x_pos, y = Year),
fill = "white",
color = "#4a4e4d",
size = 2.5,
family = "Segoe UI Semibold")
p_labelled
添加分面并加上平均值和标准偏差
ggplot(dat_gender)+
geom_rect(xmin = stats_males$meanneg,
xmax = stats_males$meanpos,
ymin = 2016, ymax = 1989, fill = "#762a83", alpha = .05)+
geom_vline(xintercept = stats_males$mean,
linetype = "solid", size = .5,
alpha = .8, color = "#762a83")+
geom_rect(xmin = stats_females$meanneg,
xmax = stats_females$meanpos,
ymin = 2016, ymax = 1989,
fill = "#009688", alpha = .05)+
geom_vline(xintercept = stats_females$mean,
color = "#009688", linetype = "solid",
size = .5, alpha = .8)+
geom_segment(data = Males, aes(x = Enrollments, y = Year,
yend = Females$Year,
xend = Females$Enrollments),
color = "#aeb6bf", size = 4.5, alpha = .5)+
geom_point(aes(x = Enrollments, y = Year, color = Gender),
size = 4, show.legend = FALSE) +
scale_color_manual(values = c("#009688","#762a83"))+
geom_text(data = diff, aes(label = paste("∆",diff),
x = x_pos, y = Year),fill = "white",
color = "#4a4e4d", size = 2.5) +
geom_text(x = stats_females$mean - 1500,y = 1990,
label = "MEAN", angle = 90, size = 2.5, color = "#009688")+
geom_text(x = stats_females$meanpos -1500, y = 1990, label = "STDEV",
angle = 90, size = 2.5, color = "#009688")+
facet_grid(Year ~ ., scales = "free", switch = "y")+
scale_size_continuous(range = c(2, 8), breaks = c(2,4,6,7,8),)+
theme_minimal()+xlab(NULL)+ylab(NULL)+
theme(panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.ticks.x = element_line(color = "#4a4e4d"),
text = element_text(color = "#4a4e4d"),
strip.text.y.left = element_text(angle = 0),
panel.background = element_rect(fill = "white", color = "white"),
strip.background = element_rect(fill = "white", color = "white"),
strip.text = element_text(color = "#4a4e4d"),
plot.background = element_rect(fill = "white"),
panel.spacing = unit(0, "lines"),
plot.margin = margin(1,1,.5,1, "cm"))
喜欢请关注个人公众号R语言数据分析指南,在此先行拜谢了
网友评论