美文网首页
探索多个变量

探索多个变量

作者: esskeetit | 来源:发表于2018-01-16 10:55 被阅读0次
    # ggplot2 supplies ggplot() and the geoms used by every plot in this file;
    # attach it up front so the snippets below run in a fresh session.
    library(ggplot2)

    # Show the working directory and its contents to confirm the data file is there.
    getwd()
    list.files()
    # The file is tab-separated, hence sep = "\t".
    pf <- read.csv("pseudo_facebook.tsv", sep = "\t")
    

    Third Qualitative Variable

    在以性别为分类的年龄箱线图中,加入每个性别的平均年龄
    原箱线图:

    # Box plot of age for each gender; rows with a missing gender are dropped.
    ggplot(
      data = subset(pf, !is.na(gender)),
      mapping = aes(x = gender, y = age)
    ) +
      geom_boxplot()
    

    添加后的箱线图:

    # Same box plot, with each gender's mean age overlaid as a point (shape 4).
    ggplot(
      data = subset(pf, !is.na(gender)),
      mapping = aes(x = gender, y = age)
    ) +
      geom_boxplot() +
      stat_summary(geom = "point", fun.y = mean, shape = 4)
    

    年龄、朋友数、性别三个变量:

    # Median friend count at each age, drawn as one coloured line per gender.
    ggplot(
      data = subset(pf, !is.na(gender)),
      mapping = aes(x = age, y = friend_count)
    ) +
      geom_line(mapping = aes(color = gender), stat = "summary", fun.y = median)
    

    按年龄和性别对数据进行分组,并计算每个组里的平均好友数,中位数好友数和每个组的数据条目数

    # Detach plyr only when it is actually attached: detach() raises an error
    # for a package that is not on the search path.
    # NOTE(review): presumably plyr is removed so that its summarise() does not
    # mask dplyr's - confirm.
    if ("package:plyr" %in% search()) {
      detach("package:plyr", unload = TRUE)
    }
    library(dplyr)

    # One row per (age, gender) pair: mean and median friend count plus the
    # number of rows in the group, sorted by age.
    pf.fc_by_age_gender <- pf %>%
      filter(!is.na(gender)) %>%
      group_by(age, gender) %>%
      summarise(
        friend_count_mean = mean(friend_count),
        friend_count_median = median(as.numeric(friend_count)),
        n = n()
      ) %>%
      ungroup() %>%
      arrange(age)

    head(pf.fc_by_age_gender)
    

    用上面的分组创建图表

    # Plot the precomputed per-group medians: one line per gender across age.
    ggplot(
      data = pf.fc_by_age_gender,
      mapping = aes(x = age, y = friend_count_median)
    ) +
      geom_line(mapping = aes(color = gender))
    

    Thinking in Ratios

    女性用户的好友数是男性用户好友数的几倍?
    要回答这个问题,先重塑我们的数据
    pf.fc_by_age_gender是长格式数据,我们要把它转化成宽格式数据,
    每一行包括:
    年龄
    对应该年龄的男性用户的好友数(中位数)
    对应该年龄的女性用户的好友数(中位数)

    library(reshape2)
    # Long -> wide: one row per age and one column per gender level, each cell
    # holding that group's median friend count.
    pf.fc_by_age_gender.wide <- dcast(
      data = pf.fc_by_age_gender,
      formula = age ~ gender,
      value.var = "friend_count_median"
    )
    head(pf.fc_by_age_gender.wide)
    

    函数dcast()中的d表示输出的数据结构为dataframe
    如果要输出矩阵或者数组,应使用acast()


    Ratio Plot

    横轴:年龄
    纵轴:女性好友数中位数 / 男性好友数中位数(即 female/male)

    # Female-to-male ratio of median friend count by age. The dashed line
    # (linetype 2) at y = 1 marks the point where both medians are equal.
    ggplot(
      data = pf.fc_by_age_gender.wide,
      mapping = aes(x = age, y = female / male)
    ) +
      geom_line() +
      geom_hline(yintercept = 1, linetype = 2, alpha = 0.3)
    

    探索四个变量:年龄,性别,好友数,使用时长tenure

    以2014为基准年,添加[加入时间]这个变量

    # Join year relative to 2014: subtract tenure converted to years, then round
    # down. (assumes tenure is measured in days - TODO confirm)
    pf$year_joined <- with(pf, floor(2014 - tenure / 365))
    

    floor为向下取整,返回不大于该数字的最大整数

    Cut a Variable

    切割变量year_joined,分为以下几组:
    2004-2009,2009-2011,2011-2012,2012-2014

    # Inspect the distribution of join years before bucketing.
    summary(pf$year_joined)
    table(pf$year_joined)
    # Bucket the years into the intervals (2004,2009], (2009,2011],
    # (2011,2012] and (2012,2014].
    pf$year_joined.buckets <- cut(
      pf$year_joined,
      breaks = c(2004, 2009, 2011, 2012, 2014)
    )
    # useNA = "ifany" also counts years that fell outside every interval.
    table(pf$year_joined.buckets, useNA = "ifany")
    

    use variable year_joined.buckets to create a line graph

    # Median friend count by age, one line per join-year bucket; rows without a
    # bucket are excluded.
    ggplot(
      data = subset(pf, !is.na(year_joined.buckets)),
      mapping = aes(x = age, y = friend_count)
    ) +
      geom_line(
        mapping = aes(color = year_joined.buckets),
        stat = "summary",
        fun.y = "median"
      )
    

    the parameter linetype can take the values 0-6:
    0 = blank,
    1 = solid,
    2 = dashed
    3 = dotted
    4 = dotdash
    5 = longdash
    6 = twodash


    Plot the Grand Mean

    # Mean friend count by age for each join-year bucket, plus the overall
    # (grand) mean across buckets drawn as a dashed line (linetype 2).
    ggplot(
      data = subset(pf, !is.na(year_joined.buckets)),
      mapping = aes(x = age, y = friend_count)
    ) +
      geom_line(
        mapping = aes(color = year_joined.buckets),
        stat = "summary",
        fun.y = mean
      ) +
      geom_line(stat = "summary", fun.y = mean, linetype = 2)
    

    Friending Rate

    # Summary of friends per unit of tenure. Uses tenure >= 1, the same filter
    # as the rate plots in this file, so accounts with tenure exactly 1 are
    # kept while zero-tenure rows (division by zero) are still excluded.
    with(subset(pf, tenure >= 1), summary(friend_count / tenure))
    

    Friendships Initiated

    # Mean friendships initiated per unit of tenure, by tenure, one line per
    # join-year bucket; tenure >= 1 avoids dividing by zero.
    ggplot(
      data = subset(pf, tenure >= 1),
      mapping = aes(x = tenure, y = friendships_initiated / tenure)
    ) +
      geom_line(
        mapping = aes(color = year_joined.buckets),
        stat = "summary",
        fun.y = mean
      )
    

    偏差-方差权衡

    
    # Baseline: mean initiation rate at every individual tenure value.
    ggplot(
      data = subset(pf, tenure >= 1),
      mapping = aes(x = tenure, y = friendships_initiated / tenure)
    ) +
      geom_line(mapping = aes(color = year_joined.buckets), stat = "summary", fun.y = mean)

    # Tenure rounded to the nearest multiple of 7 before averaging.
    ggplot(
      data = subset(pf, tenure > 0),
      mapping = aes(x = 7 * round(tenure / 7), y = friendships_initiated / tenure)
    ) +
      geom_line(mapping = aes(color = year_joined.buckets), stat = "summary", fun.y = mean)

    # Tenure rounded to the nearest multiple of 30.
    ggplot(
      data = subset(pf, tenure > 0),
      mapping = aes(x = 30 * round(tenure / 30), y = friendships_initiated / tenure)
    ) +
      geom_line(mapping = aes(color = year_joined.buckets), stat = "summary", fun.y = mean)

    # Tenure rounded to the nearest multiple of 90: the coarsest binning.
    ggplot(
      data = subset(pf, tenure > 0),
      mapping = aes(x = 90 * round(tenure / 90), y = friendships_initiated / tenure)
    ) +
      geom_line(mapping = aes(color = year_joined.buckets), stat = "summary", fun.y = mean)

    # Alternative to manual binning: let geom_smooth() fit a smoother per bucket.
    ggplot(
      data = subset(pf, tenure >= 1),
      mapping = aes(x = tenure, y = friendships_initiated / tenure)
    ) +
      geom_smooth(mapping = aes(color = year_joined.buckets))
    
    

    the Yogurt Data Set

    # Confirm the working directory, then load the yogurt purchase records.
    getwd()
    yo <- read.csv(file = "yogurt.csv")
    View(yo)
    # Treat the id column as a categorical label rather than a number.
    yo[["id"]] <- factor(yo[["id"]])
    str(yo)
    

    酸奶价格直方图

    # Histogram of yogurt prices with a fixed fill colour.
    ggplot(data = yo, mapping = aes(x = price)) +
      geom_histogram(fill = I("#FF6374"))
    

    不同的酸奶价格

    # The distinct prices, how many there are, and how often each one occurs.
    unique(yo[["price"]])
    length(unique(yo[["price"]]))
    table(yo[["price"]])
    

    将一条购买记录中不同口味的酸奶数量加总,汇总成新变量all_purchases

    names(yo)
    # Add all_purchases: the sum of the five flavour columns for each record.
    yo <- transform(
      yo,
      all_purchases = strawberry + blueberry + pina.colada + plain + mixed.berry
    )
    

    all_purchases histogram

    # Histogram of the per-record total, one bin per count.
    ggplot(data = yo, mapping = aes(x = all_purchases)) +
      geom_histogram(binwidth = 1)
    

    随时间变化的价格

    # Price against time; jitter plus a low alpha reduce overplotting.
    ggplot(data = yo, mapping = aes(x = time, y = price)) +
      geom_jitter(shape = 21, fill = I("#F79420"), alpha = 1 / 10)
      
    

    Sampling Observations

    对于酸奶数据集,我们可能需要更详细地调查小样本的家庭


    Looking at Samples of Households

    # Fix the seed so the same 16 ids are drawn on every run.
    set.seed(4230)
    sample.ids <- sample(x = levels(yo$id), size = 16)
    sample.ids

    # One panel per sampled id: price over time as a line, with open circles
    # (pch 1) sized by the number of items in each purchase.
    ggplot(
      data = subset(yo, id %in% sample.ids),
      mapping = aes(x = time, y = price)
    ) +
      facet_wrap(~id) +
      geom_line() +
      geom_point(mapping = aes(size = all_purchases), pch = 1)
        
    

    Scatterplot Matrix 散点图矩阵

    library(GGally)
    theme_set(theme_minimal(20))
    # Seed the generator so the 1000-row sample below is reproducible.
    set.seed(1836)
    # Keep columns 2 through 15 of pf.
    pf_subset <- pf[, 2:15]
    names(pf_subset)
    # Pairwise plot matrix on a random sample of 1000 rows.
    ggpairs(pf_subset[sample.int(nrow(pf_subset), 1000), ])
    

    set.seed确保得到可重复的结果


    Even More Variables

    nci <- read.table("nci.tsv")
    colnames(nci)
    # Rename the columns to 1..ncol(nci). The previous code assigned the vector
    # to a new variable called `colnames`, which left nci's column names untouched.
    colnames(nci) <- seq_len(ncol(nci))
    

    Heat Maps

    library(reshape2)
    # Melt the first 200 rows of nci into long form: one row per
    # (matrix row, matrix column, value) triple.
    nci.long.samp <- melt(as.matrix(nci[1:200, ]))
    names(nci.long.samp) <- c("gene", "case", "value")
    head(nci.long.samp)

    # Heat map: one tile per gene/case pair, coloured on a 100-step
    # blue-to-red gradient.
    ggplot(
      data = nci.long.samp,
      mapping = aes(x = case, y = gene, fill = value)
    ) +
      geom_tile() +
      scale_fill_gradientn(colours = colorRampPalette(c("blue", "red"))(100))
    

    习题集

    1.带有分面和颜色的价格直方图
    scale_fill_brewer(type = 'qual')可以修改颜色的编码方式

    data(diamonds)
    View(diamonds)
    # Price histograms on a log10 x axis, one facet per colour grade, bars
    # filled by cut using a qualitative palette.
    ggplot(data = diamonds, mapping = aes(x = price, fill = cut)) +
      geom_histogram(bins = 35) +
      facet_wrap(~color) +
      scale_x_log10() +
      scale_fill_brewer(type = "qual")
    

    2.价格与台宽(table)的散点图,按切工(cut)着色

    names(diamonds)
    # Price against table, points coloured by cut. The x axis is restricted to
    # 50-80 with a break every 2 units. `limits` is spelled out in full rather
    # than relying on partial matching of the abbreviated `lim`.
    p1 <- ggplot(aes(x = table, y = price), data = diamonds) +
      geom_point(aes(color = cut)) +
      scale_color_brewer(type = "qual") +
      scale_x_continuous(breaks = seq(50, 80, 2), limits = c(50, 80))

    # Same plot, with cut additionally mapped to fill in the plot-level aesthetics.
    p2 <- ggplot(aes(x = table, y = price, fill = cut), data = diamonds) +
      geom_point(aes(color = cut)) +
      scale_color_brewer(type = "qual") +
      scale_x_continuous(breaks = seq(50, 80, 2), limits = c(50, 80))

    # Stack the two plots for comparison.
    library(gridExtra)
    grid.arrange(p1, p2)
    

    3.价格与体积和钻石净度

    # v is the product of the x, y and z columns.
    diamonds$v <- with(diamonds, x * y * z)
    # Price (log10 scale) against v, coloured by clarity; the x axis stops at
    # the 99th percentile of v.
    ggplot(data = diamonds, mapping = aes(x = v, y = price, fill = clarity)) +
      geom_point(mapping = aes(color = clarity)) +
      xlim(0, quantile(diamonds$v, 0.99)) +
      scale_y_log10() +
      scale_color_brewer(type = "div")
    

    4.新建友谊的比例

    # Share of a user's friendships that the user initiated.
    pf$prop_initiated <- with(pf, friendships_initiated / friend_count)
    

    5.prop_initiated 与使用时长

    # Rebuild the join-year buckets used to colour the lines below.
    pf$year_joined <- with(pf, floor(2014 - tenure / 365))
    pf$year_joined.buckets <- cut(
      pf$year_joined,
      breaks = c(2004, 2009, 2011, 2012, 2014)
    )

    # Median prop_initiated by tenure per bucket, using only rows with no NA
    # in any column.
    ggplot(
      data = na.omit(pf),
      mapping = aes(x = tenure, y = prop_initiated)
    ) +
      geom_line(
        mapping = aes(color = year_joined.buckets),
        stat = "summary",
        fun.y = median
      )

    # Same summary on the full data frame, with na.rm = TRUE passed to the layer.
    ggplot(
      data = pf,
      mapping = aes(x = tenure, y = prop_initiated, color = year_joined.buckets)
    ) +
      geom_line(stat = "summary", fun.y = median, na.rm = TRUE)

    # As above, with a smoother overlaid for each bucket.
    ggplot(
      data = pf,
      mapping = aes(x = tenure, y = prop_initiated, color = year_joined.buckets)
    ) +
      geom_line(stat = "summary", fun.y = median, na.rm = TRUE) +
      geom_smooth()
      geom_smooth()
    

    6.最大的组均值 prop_initiated

    # Mean prop_initiated for the (2012,2014] bucket, skipping missing ratios.
    # (The unfinished `with(pf, year_joined.buckets=)` call that preceded this
    # had no expression to evaluate and failed at run time, so it was removed.)
    with(
      subset(pf, !is.na(prop_initiated) & year_joined.buckets == "(2012,2014]"),
      mean(prop_initiated)
    )
    # Summary statistics of prop_initiated for every bucket, for comparison.
    by(pf$prop_initiated, pf$year_joined.buckets, summary)
    

    7.经过分组、分面和填色的价格/克拉

    # Price per carat for each cut, jittered and coloured by colour grade, with
    # one facet per clarity level and a diverging palette.
    ggplot(data = diamonds, mapping = aes(x = cut, y = price / carat)) +
      geom_jitter(mapping = aes(color = color)) +
      facet_wrap(~clarity) +
      scale_color_brewer(type = "div")
    

    相关文章

      网友评论

          本文标题:探索多个变量

          本文链接:https://www.haomeiwen.com/subject/yafsoxtx.html