Structure of data
help(mtcars) # open the documentation page for the mtcars dataset (column list follows)
[, 1] mpg Miles/(US) gallon 油耗
[, 2] cyl Number of cylinders 气缸
[, 3] disp Displacement (cu.in.) 排量
[, 4] hp Gross horsepower 马力
[, 5] drat Rear axle ratio 后轴传动比
[, 6] wt Weight (1000 lbs)
[, 7] qsec 1/4 mile time
[, 8] vs Engine (0 = V-shaped, 1 = straight)
[, 9] am Transmission (0 = automatic, 1 = manual) 手动、自动
[,10] gear Number of forward gears 几个前进档位
[,11] carb Number of carburetors 化油器数量

dim(mtcars) # number of rows and columns of the data frame

names(mtcars) # column (variable) names

查看变量数据类型
str(mtcars) # compact structure: the type of every variable plus its first few values

隔离变量
使用 DataName$VariableName 从数据框中选择一个变量,并将结果存储在向量中。
# Pull the mpg column out of the data frame as a plain numeric vector.
# Note: this global `mpg` shadows the ggplot2::mpg dataset once ggplot2 is attached.
mpg <- mtcars[["mpg"]]
请注意,RStudio具有代码补全功能,会自动预测您要输入的命令。当您键入 mtcars$ 时,所有变量的名称都会出现。

# Inspect the isolated vector: its class and structure ...
class(mpg)
str(mpg)

# ... how many values it holds ...
length(mpg)

# ... their total ...
sum(mpg)

# ... and the values in ascending order (the default for sort()).
sort(mpg)

降序
# Descending order. TRUE is spelled out because `T` is an ordinary variable
# that can be reassigned, whereas TRUE is a reserved word.
sort(mpg, decreasing = TRUE)
对变量的5个最低值求和。
sum(sort(mpg)[1:5]) # sort ascending, keep the first five values, add them up
选择子集
# Positional indexing: single elements, several elements at once, and the
# same values read straight from the data frame by [row, column].
mpg[1]
mpg[5]
mpg[c(1,5)]
mtcars$mpg[c(1,5)]
mtcars[1,1]
mtcars[5,1] # mpg is the 1st column

变更分类
您可能不同意R的初始分类,并且想要更改它。

# carb is stored as numeric; convert it to a factor to treat it as categorical.
class(mtcars$carb)
carbF <- factor(mtcars$carb)
class(carbF)

# Mixed numeric/text input: c() coerces every element to character, so the
# factor levels are the strings "16", "18" and "Unknown".
ageCanVote <- factor(setNames(
  c(16, 18, 18, "Unknown"),
  c("Austria", "Australia", "Afghanistan", "Zambia")
))
as.numeric(ageCanVote) # Mistake: returns the internal integer level codes, not the ages
as.numeric(as.character(ageCanVote)) # Converts properly; "Unknown" becomes NA with a coercion warning

绘图

ggplot安装
# Install ggplot2 only when it is missing, so re-running these notes does not
# download and reinstall the package every time.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)
柱状图绘制
查看32个样本的前进挡个数
# One bar per car (32 bars), each as tall as that car's number of forward gears.
# The pasted console prompt "> " was removed: it is not valid R syntax.
barplot(mtcars$gear)

查看值的统计数量(每个值有几个)
# Frequency table: how many cars have each number of forward gears.
table(mtcars$gear)

counts <- table(mtcars$gear)
barplot(height = counts)

help("barplot")

# Same bars with readable category labels and a fill colour.
barplot(
  height = counts,
  names.arg = c("3 Gears", "4 Gears", "5 Gears"),
  col = "lightblue"
)

# las = 2 draws axis labels perpendicular to the axis.
par(las = 2)
barplot(
  height = counts,
  names.arg = c("3 Gears", "4 Gears", "5 Gears"),
  col = "lightblue"
)
# las = 1 draws all axis labels horizontally.
par(las = 1)
barplot(
  height = counts,
  names.arg = c("3 Gears", "4 Gears", "5 Gears"),
  col = "lightblue"
)


# Two-way table: cylinders (rows) by gears (columns).
counts1 <- table(mtcars$cyl, mtcars$gear)
# Stacked bars, legend taken from the table's row names.
barplot(
  height = counts1,
  names.arg = c("3 Gears", "4 Gears", "5 Gears"),
  col = c("lightblue", "lightgreen", "lightyellow"),
  legend = rownames(counts1)
)
# Stacked bars with hand-written legend labels.
barplot(
  height = counts1,
  names.arg = c("3 Gears", "4 Gears", "5 Gears"),
  col = c("lightblue", "lightgreen", "lightyellow"),
  legend = c("4 cyl", "6 cyl", "8 cyl")
)
# beside = TRUE places the cylinder groups side by side instead of stacking them.
barplot(
  height = counts1,
  names.arg = c("3 Gears", "4 Gears", "5 Gears"),
  col = c("lightblue", "lightgreen", "lightyellow"),
  legend = c("4 cyl", "6 cyl", "8 cyl"),
  beside = TRUE
)



library(ggplot2)
# Bar chart of cylinder counts: cyl is mapped as a factor (qualitative) onto the x axis.
p <- ggplot(data = mtcars, mapping = aes(x = factor(cyl)))
p + geom_bar() # Produce a barplot

library(ggplot2)
# Same data with cyl mapped onto the y axis instead of x.
p <- ggplot(data = mtcars, mapping = aes(y = factor(cyl)))
p + geom_bar()


https://blog.csdn.net/qq_42458954/article/details/82356061


# mpg data: the fuel-economy dataset that ships with ggplot2.
# It is referenced as ggplot2::mpg because these notes earlier assign a global
# numeric vector named `mpg` (mtcars$mpg), which would otherwise be picked up
# here instead of the dataset and make ggplot() fail.
p1 <- ggplot(ggplot2::mpg, aes(class))
# Select the mpg data and use `class` (its last column) as the x axis.
p1 + geom_bar() # (1) Produce a barplot
p1 + geom_bar(aes(weight = displ)) # (2) Produce a barplot whose bar heights are weighted by displ
说明:g1= ggplot(mpg, aes(x=class))

g1 + geom_bar() # note: the assigned plot object can be used directly in a "+" chain
常用的 aes 参数有:
-
x= / y= :这是基本参数。
-
fill= / color= :一般指定一个因子,让 ggplot2 自动根据因子的水平数分配颜色并绘图。
-
shape= :类似上,不过是自动分配点样式。


双重条形图
# Map fill to a factor (drv) and let ggplot2 assign one colour per level.
p1 + geom_bar(mapping = aes(fill = drv))
# (4) Customising (3): reversed stacking order, flipped coordinates, legend on top.
p1 +
  geom_bar(mapping = aes(fill = drv), position = position_stack(reverse = TRUE)) +
  coord_flip() +
  theme(legend.position = "top")

直方图
# Frequency histogram of car weights (bar height = count).
hist(mtcars$wt)

# Probability histogram (bar height = density, total area 1).
# FALSE is spelled out: `F` is an ordinary variable that can be reassigned.
hist(mtcars$wt, freq = FALSE)

我们将考虑概率直方图(第二个),这意味着直方图的总面积为1。
# Probability histogram with explicit bin edges every 0.5 (1000 lbs) from 0 to 6.
# `breaks` is written in full instead of the partially matched `br`, and
# FALSE instead of the reassignable `F`.
hist(
  mtcars$wt,
  breaks = seq(0, 6, by = 0.5),
  freq = FALSE,
  col = "lightgreen",
  xlab = "weight of cars (1000 lbs)",
  main = "Histogram of Weights of Cars US 1973-74"
)
br(breaks 的简写)=seq(0,6,by=0.5) 指定分组边界;seq 的参数是(开始, 结束, by=步长)。

ggplot直方图
使用aes(y=after_stat(density))(旧写法为aes(y=..density..),自 ggplot2 3.4.0 起已弃用)将原始直方图转换为概率直方图。
p <- ggplot(data = mtcars, aes(x = wt)) # Select the mtcars data, and focus on wt (quantitative) on x axis
# Each `+` must end a line: a line that starts with `+` is parsed as a new
# statement, so the axis labels would never be added to the plot.
# after_stat(density) replaces the deprecated ..density.. notation.
p +
  geom_histogram(aes(y = after_stat(density)), binwidth = 0.5) +
  xlab("Weight") +
  ylab("Density") # Produce a histogram with x and y axis labels

箱线图
箱线图是定量变量的另一个摘要。
产生一个单一的汽车的重量箱线图。
# Single boxplot of car weights (vertical by default).
boxplot(mtcars$wt)

# Same boxplot drawn horizontally. TRUE is spelled out because `T` is an
# ordinary variable that can be reassigned.
boxplot(mtcars$wt, horizontal = TRUE)

在箱线图中使用 boxplot(y ~ x1),表示将 x1 视作分组变量,按组输出 y 的箱线图
# Weight grouped by number of cylinders: one box per cyl value.
boxplot(formula = mtcars$wt ~ mtcars$cyl)
# Same plot with readable group names and a y-axis label.
boxplot(
  formula = mtcars$wt ~ mtcars$cyl,
  names = c("4 cyl", "6 cyl", "8 cyl"),
  ylab = "Weight of cars (1000 lbs)"
)

ggplot箱线图
# Select the mtcars data with wt (quantitative) on the y axis; x is a constant
# empty string, so nothing is split along the x axis.
p <- ggplot(mtcars, mapping = aes(x = "", y = wt))
p + geom_boxplot() # Produce a boxplot

双箱线图
# Select the mtcars data: wt (quantitative) on the y axis, cyl (qualitative) on the x axis.
p <- ggplot(mtcars, mapping = aes(x = factor(cyl), y = wt))
p + geom_boxplot() # Produce a boxplot of wt, one box per cyl value

geom_jitter绘制带有少量随机噪声的点。我们使用它来调查小数据集中的过度绘图。有时候,数据量大,我们需要显示每一个数据点,可以很直观的看到数据的分布情况。在文章中这种图是很常见的。我们只需要在上面的箱形图中加入geom_jitter函数就可以啦
# Boxplots per cyl with the individual observations overlaid as jittered points.
p <- ggplot(mtcars, mapping = aes(x = factor(cyl), y = wt))
p +
  geom_boxplot() +
  geom_jitter()

p <- ggplot(mtcars, mapping = aes(x = factor(cyl), y = wt))
# Horizontal layout: coord_flip() swaps the x and y axes and takes no special arguments.
p +
  geom_boxplot() +
  coord_flip()
# Notched boxplot.
p + geom_boxplot(notch = TRUE)
# Styling of the outlier points.
p + geom_boxplot(outlier.colour = "green", outlier.size = 3)
notch参数:如果为假(默认),则制作标准盒图。如果为真,做一个缺口盒图。凹槽用来比较组;如果两个盒子的凹槽不重叠,说明中位数有显著差异。notchwidth参数:如果指定notch参数为TRUE的话,指定切口宽度,默认0.5。

# Base-graphics notched boxplot of mpg for each number of cylinders.
boxplot(
  formula = mpg ~ cyl,
  data = mtcars,
  notch = TRUE,
  col = "green",
  xlab = "Number of Cylinders",
  ylab = "MPG",
  main = "Boxplot for MPG by Number of Cylinders"
)


# Fill the boxes by a factor: first by cyl itself, then by transmission (am).
p + geom_boxplot(mapping = aes(fill = factor(cyl)))
p + geom_boxplot(mapping = aes(fill = factor(am)))
1

2

Mosaicplot 马赛克图 暂时不讲
# Contingency table of forward gears (rows) by transmission type (columns).
counts2 <- table(mtcars$gear, mtcars$am)
# Plotting a two-way table draws a mosaic plot.
plot(x = counts2)

https://cran.r-project.org/web/packages/ggmosaic/vignettes/ggmosaic.html
散点图
# Plain scatterplot of fuel economy against weight.
plot(x = mtcars$wt, y = mtcars$mpg)

# Same scatterplot with axis labels, colour and a solid point symbol.
plot(
  x = mtcars$wt,
  y = mtcars$mpg,
  xlab = "Car Weight",
  ylab = "Miles per Gallon",
  col = "darkred",
  pch = 19
)

# Overlay the least-squares regression line of mpg on wt.
abline(reg = lm(mtcars$mpg ~ mtcars$wt))

lm 用于拟合线性模型。它可用于进行回归、单层(single stratum)方差分析和协方差分析。
Usage
lm(formula, data, subset, weights, na.action,
method = "qr", model = TRUE, x = FALSE, y = FALSE, qr = TRUE,
singular.ok = TRUE, contrasts = NULL, offset, ...)
Arguments | 参数
- formula:指要拟合的模型形式,
- data:是一个数据框,包含了用于拟合模型的数据。
Example | 例子
lm(mtcars$mpg~mtcars$wt)

abline 函数的作用是在一张图表上添加直线(参考线), 可以是一条斜线,通过x或y轴的交点和斜率来确定位置;也可以是一条水平或者垂直的线,只需要指定与x轴或y轴交点的位置就可以了

# Plotting a whole data frame draws a scatterplot matrix of all its variables.
plot(x = mtcars)

# Scatterplot matrix restricted to four variables via a one-sided formula.
pairs(~ mpg + disp + drat + wt, data = mtcars)
公式用在回归模型中写作 lm(y ~ x1 + x2):“~”右边为自变量,左边为因变量。

pairs(mtcars) # scatterplot matrix of every pair of variables in mtcars
ggplot中散点图的使用
# Select the mtcars data: wt (quantitative) on the x axis, mpg (quantitative) on the y axis.
p <- ggplot(data = mtcars, mapping = aes(x = wt, y = mpg))
p + geom_point() # Produce a scatterplot of mpg vs wt

p + geom_point(mapping = aes(colour = factor(cyl))) # Colour the points by cyl (qualitative)
下面几个颜色绘制方法等价
aes(col = x)
aes(fg = x)
aes(color = x)
aes(colour = x)


p + geom_point(aes(shape = factor(cyl))) # Shape the points by cyl (qualitative)

# Same mapping, drawn with hollow (non-solid) point shapes.
p + geom_point(aes(shape = factor(cyl))) + scale_shape(solid = FALSE)

p + geom_point(aes(size = qsec)) # Size the points by qsec (quantitative)

p + geom_point(aes(colour = cyl)) + scale_colour_gradient(low = "blue") # Colour the points by cyl (quantitative)

# Install plotly only when it is missing, so re-running these notes does not
# download and reinstall the package every time.
if (!requireNamespace("plotly", quietly = TRUE)) {
  install.packages("plotly")
}
library("plotly")
# Interactive scatterplot of wt against mpg. mode = "markers" is stated
# explicitly so plotly does not have to guess the scatter mode.
p1 <- plot_ly(mtcars, x = ~mpg, y = ~wt, type = "scatter", mode = "markers")
print(p1)

ggplot2
函数可用以下的代码模板概括:
ggplot(data = <DATA>) +
<GEOM_FUNCTION>(
mapping = aes(<MAPPING>),
stat = <STAT>,
position = <POSITION>
) +
<COORDINATE_FUNCTION> +
<FACET_FUNCTION>
其中,这7个参数分别为数据集(DATA)、几何对象( GEOM_FUNCTION)、映射集合(MAPPING)、统计变换(STAT)、位置调整(POSITION)、坐标系(COORDINATE_FUNCTION)和分面模式(FACET_FUNCTION)。
这套模板也可以作为我们学习ggplot2的总领,搞清楚每幅图中的这7个参数并能熟练地运用它们,在可视化这个领域就有了“初窥门径”的境界了。
废话不多说,接下来就拆开这套函数,将7个参数逐个击破。
关于要输入ggplot的数据其实应该在可视化之前就处理好了,但是要注意的是输入的是数据类型必须要是data.frame(or other object coercible by fortify() ?)。
在上游处理好数据后,便可以使用ggplot进行可视化处理。
映射集合
在使用aes()函数确定x、y轴的关系以及数据后,就可以作图了。以下用ggplot2包中自带的数据集mpg进行绘图(散点图)
displ 和 hwy 是 数据集mpg中的列名
# Scatterplot of highway mileage against engine displacement.
# The dataset is referenced as ggplot2::mpg because these notes earlier assign
# a global numeric vector named `mpg` (mtcars$mpg) that would otherwise be
# used here instead of the dataset.
ggplot(data = ggplot2::mpg) + geom_point(aes(x = displ, y = hwy))
# hwy: highway miles per gallon
# displ: engine displacement, in litres

网友评论