探索单一变量

作者: esskeetit | 来源:发表于2018-01-09 15:19 被阅读0次

探索单一变量
机器学习实战-数据探索(变量变换、生成)
R-数据分析系列-数据去重
Objective-Csetter和getter方法
机器学习实战-数据探索(异常值处理)
chap3.5 群组几何对象
探索多个变量
算法笔记（19）自动特征选择及Python代码实现
dart学习笔记（1）

What to Do First?

# Show the current working directory and the files it contains.
getwd()
# Parentheses are required: a bare `list.files` only prints the function itself.
list.files()

# Read the tab-separated data file into the data frame `pf`.
pf <- read.csv("pseudo_facebook.tsv", sep = "\t")

Histogram of Users' Birthdays

# List the columns of pf, attach ggplot2, and summarise dob_day.
names(pf)
library(ggplot2)
summary(pf$dob_day)

# qplot form: histogram of dob_day with 31 bins and x breaks at 1 through 31.
qplot(dob_day, data = pf, bins = 31) +
  scale_x_continuous(breaks = seq_len(31))

# ggplot form of the same histogram.
ggplot(pf, aes(x = dob_day)) +
  geom_histogram(bins = 31) +
  scale_x_continuous(breaks = seq_len(31))

Faceting

# qplot form: dob_day histogram split into one panel per dob_month,
# laid out in 3 columns.
qplot(dob_day, data = pf, bins = 31) +
  scale_x_continuous(breaks = seq_len(31)) +
  facet_wrap(~ dob_month, ncol = 3)

# ggplot form of the same faceted histogram.
ggplot(pf, aes(x = dob_day)) +
  geom_histogram(bins = 31) +
  scale_x_continuous(breaks = seq_len(31)) +
  facet_wrap(~ dob_month, ncol = 3)

facet_grid(vertical~horizontal)

传递两个或多个变量时使用facet_grid

Friend Count

# Histogram of friend_count with default binning (qplot form).
qplot(friend_count, data = pf)

# Same histogram using ggplot().
ggplot(pf, aes(x = friend_count)) +
  geom_histogram()

Limiting the Axes

限制轴，避免长尾数据

# Variant 1: restrict the x axis to [0, 1000] through qplot's xlim argument.
qplot(friend_count, data = pf, xlim = c(0, 1000))

# Variant 2: the same limits applied as a scale layer.
qplot(friend_count, data = pf) +
  scale_x_continuous(limits = c(0, 1000))

# Variant 3: ggplot() syntax with the same scale layer.
ggplot(pf, aes(x = friend_count)) +
  geom_histogram() +
  scale_x_continuous(limits = c(0, 1000))

Adjusting the Bin Width

# friend_count histogram with bins of width 25 and x breaks every 50,
# limited to [0, 1000] (qplot form).
qplot(friend_count, data = pf, binwidth = 25) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50))

# ggplot form of the same plot.
ggplot(pf, aes(x = friend_count)) +
  geom_histogram(binwidth = 25) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50))

Faceting Friend Count

# friend_count histogram, one panel per value of gender (qplot form).
qplot(friend_count, data = pf, binwidth = 25) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  facet_wrap(~ gender)

# ggplot form of the same faceted histogram.
ggplot(pf, aes(x = friend_count)) +
  geom_histogram(binwidth = 25) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  facet_wrap(~ gender)

Omitting NA Values

R 将缺失值表现为NA

# Keep only rows whose gender is not NA, then facet the friend_count
# histogram by gender (qplot form).
qplot(friend_count, data = subset(pf, !is.na(gender)), binwidth = 25) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  facet_wrap(~ gender)

# ggplot form using the same subset.
ggplot(subset(pf, !is.na(gender)), aes(x = friend_count)) +
  geom_histogram(binwidth = 25) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  facet_wrap(~ gender)

na.omit(pf)将去掉数据集中所有包含NA的条目

# Same faceted histogram, but na.omit(pf) drops every row that has an NA
# in any column, not just in gender (qplot form).
qplot(friend_count, data = na.omit(pf), binwidth = 25) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  facet_wrap(~ gender)

# ggplot form using the same na.omit(pf) data.
ggplot(na.omit(pf), aes(x = friend_count)) +
  geom_histogram(binwidth = 25) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  facet_wrap(~ gender)

通过上述生成的直方图，很难判断哪个性别的平均好友数更多

Statistics 'by' Gender

# Count the rows per gender value.
table(pf$gender)
# Apply summary() to friend_count separately within each gender.
by(pf$friend_count, pf$gender, summary)

Tenure

Notes:

color为16进制颜色代码，参见https://en.wikipedia.org/wiki/Web_colors

# Histogram of tenure with bin width 30, outline colour 'Black' and
# fill '#099DD9' (qplot form; the constants are wrapped in I()).
qplot(
  tenure,
  data = pf,
  binwidth = 30,
  color = I("Black"),
  fill = I("#099DD9")
)

# ggplot form: the colours are set directly on the histogram layer.
ggplot(pf, aes(x = tenure)) +
  geom_histogram(binwidth = 30, color = "Black", fill = "#099DD9")

create a histogram of tenure by year?

# Histogram of tenure divided by 365, with bin width 1 (qplot form).
qplot(
  tenure / 365,
  data = pf,
  binwidth = 1,
  color = I("Black"),
  fill = I("#099DD9")
)

# ggplot form of the same histogram.
ggplot(pf, aes(x = tenure / 365)) +
  geom_histogram(binwidth = 1, color = "Black", fill = "#099DD9")

Labeling Plots

# Histogram of tenure / 365 with explicit axis labels (qplot form).
# qplot() accepts xlab/ylab directly as arguments.
qplot(x = tenure / 365, data = pf,
      xlab = "Number of years using Facebook",
      ylab = "Number of users in sample",
      color = I("Black"), fill = I("#099DD9")) +
  scale_x_continuous(breaks = seq(1, 7, 1), limits = c(0, 7))

# ggplot form. ggplot() has no xlab/ylab arguments (they would be silently
# swallowed by `...`), so the labels are added as xlab()/ylab() layers.
ggplot(aes(x = tenure / 365), data = pf) +
  geom_histogram(color = "Black", fill = "#099DD9") +
  scale_x_continuous(breaks = seq(1, 7, 1), limits = c(0, 7)) +
  xlab("Number of years using Facebook") +
  ylab("Number of users in sample")

User Ages

# Summary statistics of age.
summary(pf$age)

# Histogram of age with bin width 1, x limited to [0, 113] and breaks
# every 5 (qplot form).
qplot(
  age,
  data = pf,
  binwidth = 1,
  color = I("Black"),
  fill = I("#099DD9")
) +
  scale_x_continuous(breaks = seq(0, 113, 5), limits = c(0, 113))

# ggplot form of the same histogram.
ggplot(pf, aes(x = age)) +
  geom_histogram(binwidth = 1, color = "Black", fill = "#099DD9") +
  scale_x_continuous(breaks = seq(0, 113, 5), limits = c(0, 113))

Transforming Data

Notes:

# Compare summaries of friend_count on the raw, log10(x + 1) and sqrt scales.
summary(pf$friend_count)
summary(log10(pf$friend_count + 1))
summary(sqrt(pf$friend_count))

# gridExtra provides grid.arrange() for stacking several plots.
library(gridExtra)

# One histogram per scale, with the transformation applied to the data itself.
p1 <- qplot(friend_count, data = pf)
p2 <- qplot(log10(pf$friend_count + 1), data = pf)
p3 <- qplot(sqrt(pf$friend_count), data = pf)

# Show the three histograms in a single column.
grid.arrange(p1, p2, p3, ncol = 1)

使用ggplot的版本

# Base histogram of friend_count; the transformed versions reuse it and
# only swap the x scale.
p1 <- ggplot(pf, aes(x = friend_count)) +
  geom_histogram()
p2 <- p1 + scale_x_log10()
p3 <- p1 + scale_x_sqrt()

# Show the three histograms in a single column.
grid.arrange(p1, p2, p3, ncol = 1)

Add a Scaling Layer

# logScale transforms the data before plotting; countScale plots the raw
# counts and applies a log10 x scale as a layer instead.
logScale <- qplot(log10(pf$friend_count), data = pf)
countScale <- ggplot(pf, aes(x = friend_count)) +
  geom_histogram() +
  scale_x_log10()

# Put both plots side by side.
grid.arrange(logScale, countScale, ncol = 2)

# qplot form with the log10 scale added as a layer.
qplot(pf$friend_count, data = pf) +
  scale_x_log10()

上面两幅图的区别在于X轴上的标记不同

频数多边形

# Frequency polygons of friend_count coloured by gender (rows with NA gender
# removed). y is each bin's count divided by the summed count, i.e. a
# proportion rather than a raw count.
qplot(x = friend_count, y = ..count.. / sum(..count..),
      data = subset(pf, !is.na(gender)),
      xlab = "Friend count",
      ylab = "Proportion of users with that friend count",
      binwidth = 10, geom = "freqpoly", color = gender) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50))

# ggplot form. ggplot() has no xlab/ylab arguments (they would be silently
# swallowed by `...`), so the labels are added as layers. The y label says
# "Proportion" because the plotted value is count / sum(count), not a percent.
ggplot(aes(x = friend_count, y = ..count.. / sum(..count..)),
       data = subset(pf, !is.na(gender))) +
  geom_freqpoly(aes(color = gender), binwidth = 10) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  xlab("好友数量") +
  ylab("Proportion of users with that friend count")

# Frequency polygons of www_likes by gender on a log10 x axis. Only one x
# scale is added: a scale_x_continuous() placed before scale_x_log10() would
# simply be replaced by it (with a message).
qplot(x = www_likes, data = subset(pf, !is.na(gender)),
      geom = "freqpoly", color = gender) +
  scale_x_log10()

ggplot(aes(x = www_likes), data = subset(pf, !is.na(gender))) +
  geom_freqpoly(aes(color = gender)) +
  scale_x_log10()

Likes on the Web

# Total www_likes within each gender.
by(pf$www_likes, pf$gender, sum)

Box Plots

# Box plot of friend_count for each gender, NA genders removed (qplot form).
qplot(
  gender, friend_count,
  data = subset(pf, !is.na(gender)),
  geom = "boxplot"
)

# ggplot form of the same box plot.
ggplot(subset(pf, !is.na(gender)), aes(x = gender, y = friend_count)) +
  geom_boxplot()

Adjust the code to focus on users who have friend counts between 0 and 1000.

# Box plots of friend_count by gender restricted to [0, 1000].
# Variant 1: limits passed through qplot's ylim argument.
qplot(x = gender, y = friend_count,
      data = subset(pf, !is.na(gender)),
      geom = "boxplot", ylim = c(0, 1000))

# Variant 2: limits set on the y scale. The argument is spelled out as
# `limits`; the abbreviation `lim` only works through partial matching.
qplot(x = gender, y = friend_count,
      data = subset(pf, !is.na(gender)),
      geom = "boxplot") +
  scale_y_continuous(limits = c(0, 1000))

# Variant 3: ggplot() syntax with the same y-scale limits.
ggplot(aes(x = gender, y = friend_count),
       data = subset(pf, !is.na(gender))) +
  geom_boxplot() +
  scale_y_continuous(limits = c(0, 1000))

使用coord_cartesian

# Box plot of friend_count by gender with the visible y range set to
# [0, 1000] through coord_cartesian() (qplot form).
qplot(
  gender, friend_count,
  data = subset(pf, !is.na(gender)),
  geom = "boxplot"
) +
  coord_cartesian(ylim = c(0, 1000))

# ggplot form of the same plot.
ggplot(subset(pf, !is.na(gender)), aes(x = gender, y = friend_count)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 1000))

Box Plots, Quartiles, and Friendships

# Same box plot with the visible y range narrowed to [0, 250] (qplot form).
qplot(
  gender, friend_count,
  data = subset(pf, !is.na(gender)),
  geom = "boxplot"
) +
  coord_cartesian(ylim = c(0, 250))

# ggplot form of the same plot.
ggplot(subset(pf, !is.na(gender)), aes(x = gender, y = friend_count)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 250))

# Numeric summary of friend_count per gender, for comparison with the boxes.
by(pf$friend_count, pf$gender, summary)

coord_cartesian的结果和表输出的结果一致（包括中位数等）

# Column names, mean friendships_initiated per gender, and overall summary.
names(pf)
by(pf$friendships_initiated, pf$gender, mean)
summary(pf$friendships_initiated)

# Box plot of friendships_initiated by gender, visible y range [0, 200]
# (qplot form).
qplot(
  gender, friendships_initiated,
  data = subset(pf, !is.na(gender)),
  geom = "boxplot"
) +
  coord_cartesian(ylim = c(0, 200))

# ggplot form of the same plot.
ggplot(
  subset(pf, !is.na(gender)),
  aes(x = gender, y = friendships_initiated)
) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 200))

箱线图帮助我们理解数据的分布，感知异常值

Getting Logical 符合逻辑

# Summarise mobile_likes, then the logical vector "mobile_likes > 0".
summary(pf$mobile_likes)
summary(pf$mobile_likes > 0)

# Add a factor column mobile_check_in: 1 where mobile_likes > 0, else 0.
pf$mobile_check_in <- factor(ifelse(pf$mobile_likes > 0, 1, 0))
summary(pf$mobile_check_in)

What percent of users check in using mobile?

# Share of rows whose mobile_check_in equals 1.
sum(pf$mobile_check_in == 1) / nrow(pf)

习题集
1.对数据的基本了解

# Load the diamonds data set, then open it in the viewer, print its
# structure, and open its help page.
data(diamonds)
View(diamonds)
str(diamonds)
help("diamonds")

2.价格直方图

# Histogram of price with bin width 300, x limited to [0, 20000] and
# breaks every 2000 (qplot form).
qplot(price, data = diamonds, binwidth = 300) +
  scale_x_continuous(limits = c(0, 20000), breaks = seq(0, 20000, 2000))

# ggplot form of the same histogram.
ggplot(diamonds, aes(x = price)) +
  geom_histogram(binwidth = 300) +
  scale_x_continuous(limits = c(0, 20000), breaks = seq(0, 20000, 2000))

3.钻石数量

# Rows with price below 500; dim() reports the row and column counts.
lessthan500 <- subset(diamonds, price < 500)
dim(lessthan500)

# Rows with price below 250.
lessthan250 <- subset(diamonds, price < 250)
dim(lessthan250)

# Rows with price of at least 15000.
morethan15000 <- subset(diamonds, price >= 15000)
dim(morethan15000)

4.廉价钻石

# Histogram of price with bin width 100, x limited to [0, 2000] and
# breaks every 100 (qplot form).
qplot(price, data = diamonds, binwidth = 100) +
  scale_x_continuous(limits = c(0, 2000), breaks = seq(0, 2000, 100))

# ggplot form of the same histogram.
ggplot(diamonds, aes(x = price)) +
  geom_histogram(binwidth = 100) +
  scale_x_continuous(limits = c(0, 2000), breaks = seq(0, 2000, 100))

# Write a plot to priceHistogram.png (no plot is passed explicitly).
ggsave("priceHistogram.png")

5.the histogram of diamond prices by cut.

# Histogram of price (bin width 1000), one panel per cut in 5 columns
# (qplot form).
qplot(price, data = diamonds, binwidth = 1000) +
  scale_x_continuous(limits = c(0, 20000), breaks = seq(0, 20000, 4000)) +
  facet_wrap(~ cut, ncol = 5)

# ggplot form of the same faceted histogram.
ggplot(diamonds, aes(x = price)) +
  geom_histogram(binwidth = 1000) +
  scale_x_continuous(limits = c(0, 20000), breaks = seq(0, 20000, 4000)) +
  facet_wrap(~ cut, ncol = 5)

6.切工-价格

# Maximum, minimum and median price within each cut.
by(diamonds$price, diamonds$cut, max)
by(diamonds$price, diamonds$cut, min)
by(diamonds$price, diamonds$cut, median)

7.由切工决定的每克拉价格：在facet_wrap中使用scales='free_y'，可使分面后每个图的y轴标度各自独立

# Histogram of price per carat on a log10 x axis, one panel per cut.
# scales = 'free_y' lets each panel have its own y axis (ggplot form).
ggplot(diamonds, aes(x = (price / carat))) +
  geom_histogram() +
  facet_wrap(~ cut, scales = "free_y") +
  scale_x_log10()

# qplot form of the same plot.
qplot((price / carat), data = diamonds) +
  facet_wrap(~ cut, scales = "free_y") +
  scale_x_log10()

8.价格箱线图

# Box plot of price for each color, visible y range [0, 10000] (qplot form).
qplot(color, price, data = diamonds, geom = "boxplot") +
  coord_cartesian(ylim = c(0, 10000))

# ggplot form of the same box plot.
ggplot(diamonds, aes(x = color, y = price)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 10000))

9.四分位数以及IQR

# Price quantiles for the diamonds whose color is 'D' and 'J'.
quantile(diamonds$price[diamonds$color == "D"])
quantile(diamonds$price[diamonds$color == "J"])

# Interquartile range of price for the same two colors.
IQR(diamonds$price[diamonds$color == "D"])
IQR(diamonds$price[diamonds$color == "J"])

10.由颜色表示的每克拉价格箱线图

# Box plot of price per carat for each color, visible y range [0, 8000]
# (ggplot form).
ggplot(diamonds, aes(x = color, y = price / carat)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 8000))

# qplot form of the same box plot.
qplot(color, price / carat, data = diamonds, geom = "boxplot") +
  coord_cartesian(ylim = c(0, 8000))

11.克拉频率多边形


# Frequency polygon of carat with bin width 0.01 (qplot form).
# qplot() accepts xlab/ylab directly as arguments.
qplot(x = carat, data = diamonds,
      xlab = "carat",
      ylab = "frequency",
      binwidth = 0.01, geom = "freqpoly") +
  scale_x_continuous(breaks = seq(0, 5, 0.2)) +
  scale_y_continuous(breaks = seq(0, 12000, 2000))

# ggplot form. ggplot() has no xlab/ylab arguments (they would be silently
# swallowed by `...`), so the labels are added as xlab()/ylab() layers.
ggplot(aes(x = carat), data = diamonds) +
  geom_freqpoly(binwidth = 0.01) +
  scale_x_continuous(breaks = seq(0, 5, 0.2)) +
  scale_y_continuous(breaks = seq(0, 12000, 2000)) +
  xlab("carat") +
  ylab("frequency")

# Carat values that occur more than 2000 times; the frequency table is
# computed once and reused for the filter.
carat_counts <- table(diamonds$carat)
carat_counts[carat_counts > 2000]

探索单一变量
What to Do First? Histogram of Users' Birthdays Faceting ...
机器学习实战-数据探索(变量变换、生成)
《机器学习实战-数据探索(1、变量识别；2、单变量分析；3、双变量分析)》机器学习实战-数据探索(缺失值处理) ...
R-数据分析系列-数据去重
在进行数据分析的过程中，我们常常会根据不同变量做去重处理，有单一变量和多变量处理两种情况单一变量去重 resul...
Objective-Csetter和getter方法
setter和getter方法在OC里，为单一实例变量赋值的方法称作setter(设置器)。获取单一实例变量值的...
机器学习实战-数据探索(异常值处理)
《机器学习实战-数据探索(1、变量识别；2、单变量分析；3、双变量分析)》机器学习实战-数据探索(缺失值处理)上...
chap3.5 群组几何对象
分组变量由多个变量定义而非单一变量，则用interaction合并各个分组变量 3.5.3 修改默认分组直接添加...
探索多个变量
Third Qualitative Variable 在以性别为分类的年龄箱线图中，加入每个性别的平均年龄原箱线图...
JS基本语法 1.变量（variable）变量声明声明、赋值分解单一var 2.命名规则（1）变量名...
算法笔记（19）自动特征选择及Python代码实现
自动特征选择常用方法包括使用单一变量法进行特征选择、基于模型的特征选择、迭代式特征选择。使用单一变量法进行特征选...
dart学习笔记（1）
变量创建变量并初始化变量：var name = 'Bob' ; 如果对象不限于单一类型：dynamic name...

探索单一变量

What to Do First?

Histogram of Users' Birthdays

Faceting

Friend Count

Limiting the Axes

Adjusting the Bin Width

Faceting Friend Count

Omitting NA Values

Statistics 'by' Gender

Tenure

create a histogram of tenure by year?

Labeling Plots

User Ages

Transforming Data

Add a Scaling Layer

频数多边形

Likes on the Web

Box Plots

Adjust the code to focus on users who have friend counts between 0 and 1000.

Box Plots, Quartiles, and Friendships

Getting Logical 符合逻辑

相关文章

探索单一变量

机器学习实战-数据探索(变量变换、生成)

R-数据分析系列-数据去重

Objective-Csetter和getter方法

机器学习实战-数据探索(异常值处理)

chap3.5 群组几何对象

探索多个变量

算法笔记（19）自动特征选择及Python代码实现

dart学习笔记（1）

网友评论

延伸阅读

深度阅读

栏目导航

热点阅读