美文网首页
探索单一变量

探索单一变量

作者: esskeetit | 来源:发表于2018-01-09 15:19 被阅读0次

    What to Do First?

    # Check the working directory and what it contains before loading the data.
    getwd()
    # list.files must be called; the bare name only prints the function itself.
    list.files()
    # The data file is tab-separated, so the separator is overridden.
    pf <- read.csv("pseudo_facebook.tsv", sep = "\t")
    

    Histogram of Users' Birthdays

    # Load the plotting package, list the columns, and summarise dob_day.
    library(ggplot2)
    names(pf)
    summary(pf$dob_day)

    # One bin and one axis break per day of the month (1..31), qplot form.
    qplot(dob_day, data = pf, bins = 31) +
      scale_x_continuous(breaks = seq_len(31))

    # The same histogram written with the full ggplot() syntax.
    ggplot(data = pf, mapping = aes(x = dob_day)) +
      geom_histogram(bins = 31) +
      scale_x_continuous(breaks = seq_len(31))
    

    Faceting

    # Histogram of birth day, split into one panel per birth month (3 columns).
    qplot(x = dob_day, data = pf, bins = 31) +
      scale_x_continuous(breaks = 1:31) +
      facet_wrap(~dob_month, ncol = 3)

    # Equivalent plot using the full ggplot() syntax.
    ggplot(aes(x = dob_day), data = pf) +
      geom_histogram(bins = 31) +
      scale_x_continuous(breaks = 1:31) +
      facet_wrap(~dob_month, ncol = 3)

    # Syntax reminder only (not meant to be executed on its own):
    #   facet_grid(vertical ~ horizontal)
    

    传递两个或多个变量时使用facet_grid

    Friend Count

    # Histogram of friend_count with default binning, qplot form.
    qplot(friend_count, data = pf)

    # Same histogram using ggplot() plus an explicit histogram layer.
    ggplot(data = pf, mapping = aes(x = friend_count)) +
      geom_histogram()
    

    Limiting the Axes

    限制轴,避免长尾数据

    # Three ways to restrict the x axis to the 0-1000 range.

    # 1. The xlim argument of qplot().
    qplot(friend_count, data = pf, xlim = c(0, 1000))

    # 2. A continuous x scale layer added to qplot().
    qplot(friend_count, data = pf) +
      scale_x_continuous(limits = c(0, 1000))

    # 3. The same scale layer on a ggplot() histogram.
    ggplot(data = pf, mapping = aes(x = friend_count)) +
      geom_histogram() +
      scale_x_continuous(limits = c(0, 1000))
    

    Adjusting the Bin Width

    # Bins of width 25 with axis breaks every 50 friends, qplot form.
    qplot(friend_count, data = pf, binwidth = 25) +
      scale_x_continuous(breaks = seq(0, 1000, by = 50), limits = c(0, 1000))

    # Same settings with ggplot(); binwidth moves into the histogram layer.
    ggplot(data = pf, mapping = aes(x = friend_count)) +
      geom_histogram(binwidth = 25) +
      scale_x_continuous(breaks = seq(0, 1000, by = 50), limits = c(0, 1000))
    

    Faceting Friend Count

    # Friend-count histograms (bin width 25, 0-1000) with one panel per gender.
    qplot(friend_count, data = pf, binwidth = 25) +
      scale_x_continuous(breaks = seq(0, 1000, by = 50), limits = c(0, 1000)) +
      facet_wrap(~gender)

    # ggplot() version of the same faceted histogram.
    ggplot(data = pf, mapping = aes(x = friend_count)) +
      geom_histogram(binwidth = 25) +
      scale_x_continuous(breaks = seq(0, 1000, by = 50), limits = c(0, 1000)) +
      facet_wrap(~gender)
    

    Omitting NA Values

    R 将缺失值表示为 NA

    # Keep only rows whose gender is known, then facet by gender (qplot form).
    qplot(friend_count, data = subset(pf, !is.na(gender)), binwidth = 25) +
      scale_x_continuous(breaks = seq(0, 1000, by = 50), limits = c(0, 1000)) +
      facet_wrap(~gender)

    # ggplot() version using the same gender filter.
    ggplot(data = subset(pf, !is.na(gender)),
           mapping = aes(x = friend_count)) +
      geom_histogram(binwidth = 25) +
      scale_x_continuous(breaks = seq(0, 1000, by = 50), limits = c(0, 1000)) +
      facet_wrap(~gender)
    

    na.omit(pf)将去掉数据集中所有包含NA的条目

    # na.omit() removes every row containing an NA in any column,
    # not only rows with a missing gender.
    qplot(friend_count, data = na.omit(pf), binwidth = 25) +
      scale_x_continuous(breaks = seq(0, 1000, by = 50), limits = c(0, 1000)) +
      facet_wrap(~gender)

    # ggplot() version on the same NA-free data.
    ggplot(data = na.omit(pf), mapping = aes(x = friend_count)) +
      geom_histogram(binwidth = 25) +
      scale_x_continuous(breaks = seq(0, 1000, by = 50), limits = c(0, 1000)) +
      facet_wrap(~gender)
    

    通过上述生成的直方图,很难判断哪个性别的平均好友数更多


    Statistics 'by' Gender

    # Number of users per gender.
    table(pf$gender)
    # Summary statistics of friend_count computed separately for each gender.
    by(pf$friend_count, pf$gender, FUN = summary)
    

    Tenure

    Notes:

    color为16进制颜色代码,参见https://en.wikipedia.org/wiki/Web_colors

    # Tenure histogram with a bin width of 30.
    # I() passes the colours to qplot() as literal values, not mapped variables.
    qplot(tenure, data = pf, binwidth = 30,
          fill = I("#099DD9"), color = I("Black"))

    # In ggplot() the fixed colours are set directly on the histogram layer.
    ggplot(data = pf, mapping = aes(x = tenure)) +
      geom_histogram(binwidth = 30, fill = "#099DD9", color = "Black")
      
    

    create a histogram of tenure by year?

    # Divide tenure by 365 and use a bin width of 1 for the yearly histogram.
    qplot(tenure / 365, data = pf, binwidth = 1,
          fill = I("#099DD9"), color = I("Black"))

    # ggplot() version of the yearly tenure histogram.
    ggplot(data = pf, mapping = aes(x = tenure / 365)) +
      geom_histogram(binwidth = 1, fill = "#099DD9", color = "Black")
    

    Labeling Plots

    # Yearly tenure histogram with axis labels; qplot() accepts xlab/ylab.
    qplot(x = tenure / 365, data = pf,
          xlab = "Number of years using Facebook",
          ylab = "Number of users in sample",
          color = I("Black"), fill = I("#099DD9")) +
      scale_x_continuous(breaks = seq(1, 7, 1), limits = c(0, 7))

    # ggplot() has no xlab/ylab arguments and silently ignores them,
    # so the labels are added as separate layers instead.
    ggplot(aes(x = tenure / 365), data = pf) +
      geom_histogram(color = "Black", fill = "#099DD9") +
      scale_x_continuous(breaks = seq(1, 7, 1), limits = c(0, 7)) +
      xlab("Number of years using Facebook") +
      ylab("Number of users in sample")
      
    

    User Ages

    # Summary of age; the axis below spans 0-113 with breaks every 5.
    summary(pf$age)

    # Age histogram with a bin width of 1, qplot form.
    qplot(age, data = pf, binwidth = 1,
          fill = I("#099DD9"), color = I("Black")) +
      scale_x_continuous(limits = c(0, 113), breaks = seq(0, 113, by = 5))

    # ggplot() version of the age histogram.
    ggplot(data = pf, mapping = aes(x = age)) +
      geom_histogram(binwidth = 1, fill = "#099DD9", color = "Black") +
      scale_x_continuous(limits = c(0, 113), breaks = seq(0, 113, by = 5))
      
    

    Transforming Data

    Notes:

    # gridExtra provides grid.arrange() for stacking several plots.
    library(gridExtra)

    # Numeric summaries of the raw, log10(x + 1) and square-root values.
    summary(pf$friend_count)
    summary(log10(pf$friend_count + 1))
    summary(sqrt(pf$friend_count))

    # One histogram per version of the variable, stacked in a single column.
    p1 <- qplot(x = friend_count, data = pf)
    p2 <- qplot(x = log10(pf$friend_count + 1), data = pf)
    p3 <- qplot(x = sqrt(pf$friend_count), data = pf)
    grid.arrange(p1, p2, p3, ncol = 1)
    
    

    使用ggplot的版本

    # Build the base histogram once, then derive the transformed versions
    # by adding a log10 or square-root x scale to it.
    p1 <- ggplot(data = pf, mapping = aes(x = friend_count)) +
      geom_histogram()
    p2 <- p1 + scale_x_log10()
    p3 <- p1 + scale_x_sqrt()

    # Stack the three plots in one column.
    grid.arrange(p1, p2, p3, ncol = 1)
    

    Add a Scaling Layer

    # Left plot: the variable itself is log10-transformed before plotting.
    logScale <- qplot(x = log10(pf$friend_count), data = pf)

    # Right plot: the raw variable is plotted and a log10 scale layer is added.
    countScale <- ggplot(data = pf, mapping = aes(x = friend_count)) +
      geom_histogram() +
      scale_x_log10()

    # Show both side by side for comparison.
    grid.arrange(logScale, countScale, ncol = 2)

    # qplot() form of the scale-layer approach.
    qplot(x = pf$friend_count, data = pf) +
      scale_x_log10()
    
    

    上面两幅图的区别在于X轴上的标记不同


    频数多边形

    # Frequency polygon of friend_count, one line per gender; y is each bin's
    # count divided by the total count. Rows with missing gender are excluded.
    qplot(x = friend_count, y = ..count.. / sum(..count..),
          data = subset(pf, !is.na(gender)),
          xlab = "Friend count",
          ylab = "Proportion of users with that friend count",
          binwidth = 10, geom = "freqpoly", color = gender) +
      scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50))

    # ggplot() has no xlab/ylab arguments and silently ignores them,
    # so the labels are added as separate layers instead.
    ggplot(aes(x = friend_count, y = ..count.. / sum(..count..)),
           data = subset(pf, !is.na(gender))) +
      geom_freqpoly(aes(color = gender), binwidth = 10) +
      scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
      xlab("好友数量") +
      ylab("Percentage of users with that friend count")
    
    # Frequency polygon of www_likes by gender on a log10 x axis.
    # Only one x scale can be active, so the log10 scale is the only one added;
    # a preceding scale_x_continuous() would be replaced with a message.
    qplot(x = www_likes, data = subset(pf, !is.na(gender)),
          geom = "freqpoly", color = gender) +
      scale_x_log10()

    # ggplot() version of the same plot.
    ggplot(aes(x = www_likes), data = subset(pf, !is.na(gender))) +
      geom_freqpoly(aes(color = gender)) +
      scale_x_log10()
      
    

    Likes on the Web

    # Total www_likes for each gender.
    by(pf$www_likes, pf$gender, FUN = sum)
    

    Box Plots

    # Box plot of friend_count per gender, excluding rows with missing gender.
    qplot(gender, friend_count,
          geom = "boxplot",
          data = subset(pf, !is.na(gender)))

    # ggplot() version of the same box plot.
    ggplot(data = subset(pf, !is.na(gender)),
           mapping = aes(x = gender, y = friend_count)) +
      geom_boxplot()
    
    
    

    Adjust the code to focus on users who have friend counts between 0 and 1000.

    # Restrict the y axis to 0-1000 by limiting the scale.
    # NOTE: per the ggplot2 documentation, scale limits drop observations
    # outside the range before the box plot statistics are computed.

    # 1. The ylim argument of qplot().
    qplot(x = gender, y = friend_count,
          data = subset(pf, !is.na(gender)),
          geom = "boxplot", ylim = c(0, 1000))

    # 2. A y scale layer on qplot(); the argument is spelled out as `limits`
    #    rather than relying on partial matching of `lim`.
    qplot(x = gender, y = friend_count,
          data = subset(pf, !is.na(gender)),
          geom = "boxplot") +
      scale_y_continuous(limits = c(0, 1000))

    # 3. The same scale layer on a ggplot() box plot.
    ggplot(aes(x = gender, y = friend_count),
           data = subset(pf, !is.na(gender))) +
      geom_boxplot() +
      scale_y_continuous(limits = c(0, 1000))
    

    使用coord_cartesian

    # Show the 0-1000 range by setting limits on the coordinate system
    # instead of on the y scale.
    qplot(gender, friend_count,
          geom = "boxplot",
          data = subset(pf, !is.na(gender))) +
      coord_cartesian(ylim = c(0, 1000))

    # ggplot() version with the same coordinate limits.
    ggplot(data = subset(pf, !is.na(gender)),
           mapping = aes(x = gender, y = friend_count)) +
      geom_boxplot() +
      coord_cartesian(ylim = c(0, 1000))
    

    Box Plots, Quartiles, and Friendships

    # Narrow the view to 0-250 so the boxes themselves are easier to read.
    qplot(gender, friend_count,
          geom = "boxplot",
          data = subset(pf, !is.na(gender))) +
      coord_cartesian(ylim = c(0, 250))

    # ggplot() version with the same view.
    ggplot(data = subset(pf, !is.na(gender)),
           mapping = aes(x = gender, y = friend_count)) +
      geom_boxplot() +
      coord_cartesian(ylim = c(0, 250))

    # Numeric summary per gender, for comparison with the box plots.
    by(pf$friend_count, pf$gender, FUN = summary)
    

    coord_cartesian的结果和表输出的结果一致(包括中位数等)

    # Column names, mean friendships_initiated per gender, and overall summary.
    names(pf)
    by(pf$friendships_initiated, pf$gender, FUN = mean)
    summary(pf$friendships_initiated)

    # Box plot of friendships_initiated per gender, viewed over 0-200.
    qplot(gender, friendships_initiated,
          geom = "boxplot",
          data = subset(pf, !is.na(gender))) +
      coord_cartesian(ylim = c(0, 200))

    # ggplot() version of the same box plot.
    ggplot(data = subset(pf, !is.na(gender)),
           mapping = aes(x = gender, y = friendships_initiated)) +
      geom_boxplot() +
      coord_cartesian(ylim = c(0, 200))
    
    

    箱线图帮助我们理解数据的分布,感知异常值


    Getting Logical 符合逻辑

    # Summarise mobile_likes, then the logical vector "has any mobile likes".
    summary(pf$mobile_likes)
    summary(pf$mobile_likes > 0)

    # Create the column, then fill it with a 0/1 factor:
    # 1 when mobile_likes is greater than 0, otherwise 0.
    pf$mobile_check_in <- NA
    pf$mobile_check_in <- factor(ifelse(pf$mobile_likes > 0, 1, 0))
    summary(pf$mobile_check_in)
    

    What percentage of users check in using mobile?

    # Fraction of rows whose mobile_check_in equals 1.
    with(pf, sum(mobile_check_in == 1) / length(mobile_check_in))
    

    习题集
    1.对数据的基本了解

    # Load the diamonds data set, then inspect it: data viewer, structure,
    # and its help page.
    data("diamonds")
    View(diamonds)
    str(diamonds)
    help("diamonds")
    

    2.价格直方图

    # Price histogram: bin width 300, axis 0-20000 with breaks every 2000.
    qplot(price, data = diamonds, binwidth = 300) +
      scale_x_continuous(breaks = seq(0, 20000, by = 2000),
                         limits = c(0, 20000))

    # ggplot() version of the price histogram.
    ggplot(data = diamonds, mapping = aes(x = price)) +
      geom_histogram(binwidth = 300) +
      scale_x_continuous(breaks = seq(0, 20000, by = 2000),
                         limits = c(0, 20000))
    

    3.钻石数量

    # The first element of dim() is the number of diamonds in each subset.

    # Price below 500.
    lessthan500 <- subset(diamonds, price < 500)
    dim(lessthan500)

    # Price below 250.
    lessthan250 <- subset(diamonds, price < 250)
    dim(lessthan250)

    # Price of 15000 or more.
    morethan15000 <- subset(diamonds, price >= 15000)
    dim(morethan15000)
    
    

    4.廉价钻石

    # Price histogram for the 0-2000 range: bin width 100, breaks every 100.
    qplot(price, data = diamonds, binwidth = 100) +
      scale_x_continuous(breaks = seq(0, 2000, by = 100), limits = c(0, 2000))

    # ggplot() version of the same histogram.
    ggplot(data = diamonds, mapping = aes(x = price)) +
      geom_histogram(binwidth = 100) +
      scale_x_continuous(breaks = seq(0, 2000, by = 100), limits = c(0, 2000))

    # Save the most recently displayed plot to a PNG file.
    ggsave("priceHistogram.png")
    

    5.the histogram of diamond prices by cut.

    # Price histograms (bin width 1000) with one panel per cut, five per row.
    qplot(price, data = diamonds, binwidth = 1000) +
      scale_x_continuous(breaks = seq(0, 20000, by = 4000),
                         limits = c(0, 20000)) +
      facet_wrap(~cut, ncol = 5)

    # ggplot() version of the faceted price histogram.
    ggplot(data = diamonds, mapping = aes(x = price)) +
      geom_histogram(binwidth = 1000) +
      scale_x_continuous(breaks = seq(0, 20000, by = 4000),
                         limits = c(0, 20000)) +
      facet_wrap(~cut, ncol = 5)
    

    6.切工-价格

    # Highest, lowest and median price within each cut.
    by(diamonds$price, diamonds$cut, FUN = max)
    by(diamonds$price, diamonds$cut, FUN = min)
    by(diamonds$price, diamonds$cut, FUN = median)
    

    7.由切工决定的每克拉价格,使用scales,可使分隔后每个图的y轴标度不一样

    # Price-per-carat histograms per cut on a log10 x axis.
    # scales = "free_y" gives each panel its own y-axis range.
    ggplot(data = diamonds, mapping = aes(x = (price / carat))) +
      geom_histogram() +
      facet_wrap(~cut, scales = "free_y") +
      scale_x_log10()

    # qplot() version of the same faceted histogram.
    qplot(x = (price / carat), data = diamonds) +
      facet_wrap(~cut, scales = "free_y") +
      scale_x_log10()
    
    

    8.价格箱线图

    # Box plot of price for each diamond color, viewed over 0-10000.
    qplot(color, price, data = diamonds, geom = "boxplot") +
      coord_cartesian(ylim = c(0, 10000))

    # ggplot() version of the same box plot.
    ggplot(data = diamonds, mapping = aes(x = color, y = price)) +
      geom_boxplot() +
      coord_cartesian(ylim = c(0, 10000))
    

    9.四分位数以及IQR

    # Extract the prices of color D and color J diamonds once each.
    price_d <- subset(diamonds, color == "D")$price
    price_j <- subset(diamonds, color == "J")$price

    # Quartiles for each color.
    quantile(price_d)
    quantile(price_j)

    # Interquartile range for each color.
    IQR(price_d)
    IQR(price_j)
    

    10.由颜色表示的每克拉价格箱线图

    # Box plot of price per carat for each color, viewed over 0-8000.
    ggplot(data = diamonds, mapping = aes(x = color, y = price / carat)) +
      geom_boxplot() +
      coord_cartesian(ylim = c(0, 8000))

    # qplot() version of the same box plot.
    qplot(color, price / carat, data = diamonds, geom = "boxplot") +
      coord_cartesian(ylim = c(0, 8000))
    

    11.克拉频率多边形

    
    # Frequency polygon of carat with a bin width of 0.01.
    qplot(x = carat, data = diamonds,
          xlab = "carat",
          ylab = "frequency",
          binwidth = 0.01, geom = "freqpoly") +
      scale_x_continuous(breaks = seq(0, 5, 0.2)) +
      scale_y_continuous(breaks = seq(0, 12000, 2000))

    # ggplot() has no xlab/ylab arguments and silently ignores them,
    # so the labels are added as separate layers instead.
    ggplot(aes(x = carat), data = diamonds) +
      geom_freqpoly(binwidth = 0.01) +
      scale_x_continuous(breaks = seq(0, 5, 0.2)) +
      scale_y_continuous(breaks = seq(0, 12000, 2000)) +
      xlab("carat") +
      ylab("frequency")

    # Carat values that occur more than 2000 times; tabulate once and reuse.
    carat_counts <- table(diamonds$carat)
    carat_counts[carat_counts > 2000]
    

    相关文章

      网友评论

          本文标题:探索单一变量

          本文链接:https://www.haomeiwen.com/subject/pypynxtx.html