中国老年人婚姻状况图（数据转换及拼图）

作者: 冬之心 | 来源:发表于2019-08-23 16:22 被阅读0次

中国老年人婚姻状况图（数据转换及拼图）
数字格式转换、老年婚姻状况图及拼图
2019-04-11函数参数传递，数据类型及转换
03-08
2.4-全栈Java笔记:基本类型转化及常见错误和问题
2018-04-09
CINRAD数据处理
数制及数据转换
婚恋大数据及(闽南婚姻状况)
JS 里的数据类型转换

knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)

自然数转为比例

语法

rescale(x, to = c(0, 1), from = range(x, na.rm = TRUE, finite = TRUE), ...)

参数

x：continuous vector of values to manipulate. 要操作的数值向量
to： output range (numeric vector of length two) 输出范围
from：input range (vector of length two). If not given, is calculated from the range of x 输入范围，默认为x的范围
... ：other arguments passed on to methods

例子

library(scales)

x <- c(127.31, 5787.47, 3703.58, 78.61)

#转成（0,1)区间，输入范围为 [0，sum(x)]  ，实际上算的是各值占比。
rescale(x, to=c(0,1), from=c(0,sum(x)))

#转成（0，100）区间
rescale(x, to=c(0,100), from=c(0,sum(x)))

小数点转成百分比

语法

percent(x, accuracy = NULL, scale = 100, prefix = "",
suffix = "%", big.mark = " ", decimal.mark = ".", trim = TRUE,
...)

参数

accuracy: Number to round to, NULL for automatic guess. 数值精确度。不设置，函数会自动判断设定小数位。
scale：A scaling factor: x will be multiply by scale before formating (useful if the underlying data is on another scale, e.g. for computing percentages or thousands). 刻度，默认为100。
prefix, suffix：Symbols to display before and after value. 前缀，后缀。后缀默认为%
big.mark：Character used between every 3 digits to separate thousands. 大数的分隔符。
decimal.mark：The character to be used to indicate the numeric decimal point. 小数点的分隔符。
trim：Logical, if FALSE, values are right-justified to a common width 修剪，默认为TRUE。如果是FALSE，值右对齐到一个公共宽度（所有值等宽）。如果是TRUE，值的前导空格就会删除。

percent(rescale(x, to=c(0,1), from=c(0,sum(x[1:4]))))
[1] "0%" "17%" "33%" "50%" "67%" "83%" "100%" "117%" "133%" "150%" "167%" "833%"
percent(rescale(x, to=c(0,1), from=c(0,sum(x[1:4]))), trim=FASLE)
[1] " 0%" " 17%" " 33%" " 50%" " 67%" " 83%" "100%" "117%" "133%" "150%" "167%" "833%"

例子


#不保留小数位
percent(rescale(x, to=c(0,1), from=c(0,sum(x[1:4]))), accuracy = 1)

#保留两位小数
percent(rescale(x, to=c(0,1), from=c(0,sum(x[1:4]))), accuracy = .01)

中国老年人婚姻状况变化

library(reshape2)
library(tidyverse)
library(scales)

# 录入数据，并整理成数据框
marriage <- c("未婚","有配偶","丧偶","离婚")
y1990 <- c(127.31, 5787.47, 3703.58, 78.61)
y2000 <- c(212.17, 8616.39, 3885.58,84.26)
y2010 <- c(313.68, 12459.03, 4747.92, 138.08)


marriage2 <- data.frame(marriage, y1990, y2000, y2010)

marriage3 <- melt(marriage2, id.vars="marriage", variable.name="year", value.name="population")

marriage3$marriage <- factor(marriage3$marriage, levels= c("未婚","有配偶","丧偶","离婚"), c("未婚","有配偶","丧偶","离婚"))

marriage3$year <- str_replace(marriage3$year,"y","")



#根据人数，计算比例
y1990p <- percent(rescale(y1990, to =c(0,1), from=c(0,sum(y1990))),accuracy=.01, suffix = "")
y2000p <- percent(rescale(y2000, to =c(0,1), from=c(0,sum(y2000))),accuracy=.01, suffix = "")
y2010p <- percent(rescale(y2010, to =c(0,1), from=c(0,sum(y2010))),accuracy=.01, suffix = "")

marriage_P <- data.frame(marriage, y1990p, y2000p, y2010p)
marriage_P2 <- melt(marriage_P, id.vars="marriage", variable.name = "year", value.name = "percent")

marriage_P2$marriage <- factor(marriage_P2$marriage, levels= c("未婚","有配偶","丧偶","离婚"), c("未婚","有配偶","丧偶","离婚"))

#删除year值中的"y"和“p"，只提取年份。
marriage_P2$year <- str_sub(marriage_P2$year, 2,5)

# 不知道为什么，如果直接把marriage_P2整个表合并进来，不会改变原变量的数据类型。但是如果只合并marriage_P2$percent,会改变percent变量的数据类型，变成因子型。
marriage4 <- cbind(marriage3, marriage_P2$percent)

names(marriage4)[4] <- "percent"

#因子型 转数值型，不能直接转，一定要先转成字符型，再转成数值型。

marriage4$percent <- as.numeric(as.character(marriage4$percent))

library(ggplot2)

ggplot(marriage4, aes(x=year, y=percent, group=marriage) ) + 
  geom_col(aes(fill=marriage), position="dodge") + 
  geom_text(aes(label=percent, y= percent+0.5), position = position_dodge(width = 0.9), vjust=0) + 
  labs(x=NULL, y=NULL, fill="婚姻类型", title="比例变化图") +
  theme(legend.position = c(0.8, 0.8))

ggplot(marriage4, aes(x=year, y=population, group=marriage)) + 
  geom_line(aes(colour=marriage), size=2) +
  geom_point(aes(shape=marriage),size=2) + 
  facet_wrap(.~marriage, scales="free") + 
  labs(x=NULL, y="人口数（万人）", title="人口变化图") + 
  theme(legend.position = "none")

Rplot04.png

Rplot021.png

拼图

拼图包常用有三个：

gridExtra包的grid.arrange()函数
ggpubr包的ggarange()函数
cowplot包的ggdraw()+draw_plot()函数

参见:


library(cowplot)

p1<- ggplot(marriage4, aes(x=year, y=percent, group=marriage) ) + 
     geom_col(aes(fill=marriage), position="dodge") + 
     geom_text(aes(label=percent, y= percent+0.5), position = position_dodge(width = 0.9), vjust=0) + 
     labs(x=NULL, y="比例(%)", fill="婚姻类型") + 
     theme(legend.position = c(0.92, 0.85), legend.background = element_blank())

p2 <- ggplot(marriage4, aes(x=year, y=population, group=marriage)) + 
  geom_line(aes(colour=marriage), size=2) +
  geom_point(aes(shape=marriage),size=2) + 
  facet_wrap(.~marriage, scales="free") + 
  labs(x=NULL, y="人口数（万人）") + 
  theme(legend.position = "none")

ggdraw() + 
  draw_plot(p1, 0,0.1,0.5,0.85) + 
  draw_plot(p2, 0.5,0.1,0.5,0.85) + 
  draw_plot_label(c("比例图","人口图"),x=c(0,0.5), y=c(1,1)) +
  draw_plot_label("数据来源：中国人口普查 制图：李亮", x=0.63, y=0.1, size=8)

ggdraw() 在ggplot图的上面调协一个绘图层。
draw_plot(plot, x = 0, y = 0, width = 1, height = 1, scale = 1,
hjust = 0, vjust = 0) 在ggdraw画布上的某个地方放置一个plot图
- (0, 0, 0.5, 1) 左半图；(0.5, 0, 0.5, 1) 右半图
- (0, 0.5 , 1, 0.5) 上半图；(0, 0, 1, 0.5) 下半图
draw_plot_label(label, x = 0, y = 1, hjust = -0.5, vjust = 1.5,
size = 16, fontface = "bold", family = NULL, color = NULL,
colour, ...) 给图添加标签。

Rplot06.png

程序改进

利用通道分组计算新值。


marriage <- c("未婚","有配偶","丧偶","离婚")
y1990 <- c(127.31, 5787.47, 3703.58, 78.61)
y2000 <- c(212.17, 8616.39, 3885.58,84.26)
y2010 <- c(313.68, 12459.03, 4747.92, 138.08)

marriage2 <- data.frame(marriage, y1990, y2000, y2010)
marriage3 <- melt(marriage2, id.vars="marriage", variable.name="year", value.name="population")
marriage3$marriage <- factor(marriage3$marriage, levels= c("未婚","有配偶","丧偶","离婚"), c("未婚","有配偶","丧偶","离婚"))
marriage3$year <- str_replace(marriage3$year,"y","")

# 按year分组，计算各婚姻类别人口占某一year组人口的百分比。
marriage3 <- marriage3 %>%
 group_by(year) %>%
 mutate(percent= percent(rescale(population, to= c(0,1), from=c(0,sum(population))), accuracy=0.01, suffix=""))

marriage3

合并数据框时应注意数据类型

y1990p <- percent(rescale(y1990, to =c(0,1), from=c(0,sum(y1990))),accuracy=.01, suffix = "")
y2000p <- percent(rescale(y2000, to =c(0,1), from=c(0,sum(y2000))),accuracy=.01, suffix = "")
y2010p <- percent(rescale(y2010, to =c(0,1), from=c(0,sum(y2010))),accuracy=.01, suffix = "")

# 注意percent转化出来的是字符型列表
# 注意字符型列表转成数据框时，默认会变成因子，给后面数据处理带来麻烦。因此要加参数stringsAsFactors=FALSE

marriage_P <- data.frame(marriage, y1990p, y2000p, y2010p, stringsAsFactors=FALSE)
marriage_P2 <- melt(marriage_P, id.vars="marriage", variable.name = "year", value.name = "percent")
marriage_P2 <- as_tibble(marriage_P2)
marriage_P2
# 注意melt()函数在数据框转置时measure.vars变成的新变量是因子型，如本例中的year


# 使用tidyr包中的gather()函数, 默认factor_key = FALSE, 即Key值被存为字符型。如果TRUE，则存为因子型。
marriage_P <- data.frame(marriage, y1990p, y2000p, y2010p, stringsAsFactors=FALSE)
marriage_P2 <- gather(marriage_P, key = "year", value = "percent", - marriage)
marriage_P2 <- as_tibble(marriage_P2)
marriage_P2