美文网首页
数据预处理:dplyr package sample

数据预处理:dplyr package sample

作者: MC1229 | 来源:发表于2016-08-22 01:35 被阅读63次

    library(dplyr)

    # Widen the console so wide printed output wraps less.
    options(width = 105)

    # Load the dataset from an RDS file (path is relative to the working
    # directory). The original had a comma instead of the extension dot.
    chicago <- readRDS("chicago.rds")

    # Quick inspection of the loaded object: dimensions, structure, names.
    dim(chicago)

    str(chicago)

    names(chicago)

    # SELECT
    # Keep every column from city through dptp (inclusive). The column range
    # must be passed inside select(), not after its closing parenthesis.
    head(select(chicago, city:dptp))

    # Keep every column except those from city through dptp.
    head(select(chicago, -(city:dptp)))

    # Without dplyr, the equivalent of the exclusion above would be:

    # i <- match("city", names(chicago))

    # j <- match("dptp", names(chicago))

    # head(chicago[, -(i:j)])

    # FILTER
    # Rows whose pm25tmean2 exceeds 30.
    chic.f <- chicago %>%
      filter(pm25tmean2 > 30)

    # Rows satisfying both conditions; comma-separated conditions in filter()
    # are combined with a logical AND. This overwrites the previous result.
    chic.f <- chicago %>%
      filter(pm25tmean2 > 30, tmpd > 80)

    head(chic.f)

    # ARRANGE
    # Sort rows by date, ascending.
    chicago <- chicago %>%
      arrange(date)

    # Re-sort rows by date, descending; this is the order that is kept.
    chicago <- chicago %>%
      arrange(desc(date))

    # Show the first and last rows of the sorted data.
    head(chicago)
    tail(chicago)

    # RENAME
    # rename() takes new_name = old_name pairs. The source column is
    # pm25tmean2 (the same column the filter step reads), not pm25mean2.
    chicago <- rename(chicago, pm25 = pm25tmean2, dewpoint = dptp)

    # MUTATE
    # Create a new variable: pm25 with its overall mean subtracted
    # (missing values are ignored when computing the mean).
    chicago <- chicago %>%
      mutate(pm25detrend = pm25 - mean(pm25, na.rm = TRUE))

    # GROUP_BY
    # Goal of this section: check whether the mean / maximum pollution levels
    # differ between cold and hot weather.

    # 1 * (tmpd > 80) yields 0/1 codes, which factor() labels "cold" / "hot".
    # The column must be named tempcat so it matches the group_by() call below.
    chicago <- mutate(
      chicago,
      tempcat = factor(1 * (tmpd > 80), labels = c("cold", "hot"))
    )

    hotcold <- group_by(chicago, tempcat)

    # First attempt: mean(pm25) is NA for any group containing missing values.
    summarize(
      hotcold,
      pm25 = mean(pm25),
      o3 = max(o3tmean2),
      no2 = median(no2tmean2)
    )

    # Second attempt: ignore the missing values in pm25.
    summarize(
      hotcold,
      pm25 = mean(pm25, na.rm = TRUE),
      o3 = max(o3tmean2),
      no2 = median(no2tmean2)
    )

    # SUMMARIZE
    # Goal of this section: check whether the data differ from year to year.

    # as.POSIXlt()$year counts years since 1900, so adding 1900 gives the
    # calendar year; this is a simple way to derive a year column.
    chicago <- chicago %>%
      mutate(year = as.POSIXlt(date)$year + 1900)

    years <- chicago %>%
      group_by(year)

    years %>%
      summarize(
        pm25 = mean(pm25, na.rm = TRUE),
        o3 = max(o3tmean2),
        no2 = median(no2tmean2)
      )

    # PIPELINE
    # Same per-group summary as above, by month, written as a single chain.
    # as.POSIXlt()$mon is zero-based, hence the + 1.
    chicago %>%
      mutate(month = as.POSIXlt(date)$mon + 1) %>%
      group_by(month) %>%
      summarize(
        pm25 = mean(pm25, na.rm = TRUE),
        o3 = max(o3tmean2),
        no2 = median(no2tmean2)
      )

    相关文章

      网友评论

          本文标题:数据预处理:dplyr package sample

          本文链接:https://www.haomeiwen.com/subject/dwrwsttx.html