美文网首页R/PythonR
dplyr和data.table

dplyr和data.table

作者: 茶苯海 | 来源:发表于2018-01-23 15:24 被阅读4次
    • .加载包
    library(dplyr)
    library(data.table)
    library(lubridate)
    library(jsonlite)
    library(tidyr)
    library(ggplot2)
    library(compare)
    

    使用jsonlite包中的fromJSON函数来下载数据集的JSON格式数据。

    spending=fromJSON("https://data.medicare.gov/api/views/nrth-mfg3/rows.json?accessType=DOWNLOAD")
    names(spending)
    
    • .数据处理
    meta <- spending$meta
    hospital_spending <- data.frame(spending$data)
    colnames(hospital_spending) <- make.names(meta$view$columns$name)
    
    ## 查看数据
    glimpse(hospital_spending)
    
    # select列筛选
    hospital_spending <- select(hospital_spending,-c(sid:meta))
    glimpse(hospital_spending)
    

    导入的所有数据列都是因子型数据。
    下面我们将列中数据为数值的列改为数值型数据:

    #因子型到数值型:先as.character再as.numeric
    
    cols = 6:11  #需要改变数据类型的列
    ## 这里操作可以把需要改变的列作为一个向量变量
    hospital_spending[,cols] <- lapply(hospital_spending[,cols],as.character)
    hospital_spending[,cols] <- lapply(hospital_spending[,cols],as.numeric)
    

    最后两列数据分别是数据收集的起始日期和结束日期。
    使用lubridate包来纠正这两列的数据类型:

    cols = 12:13;
    hospital_spending[,cols] <- lapply(hospital_spending[,cols],ymd_hms)
    

    检查数据列是否是我们想要的数据类型:

    sapply(hospital_spending,class)    #sapply的用法
    

    • .创建data.table类型数据

    使用data.table函数创建data.table类型数据:

    class(hospital_spending)
    
    hospital_spending_DT <- data.table(hospital_spending)
    class(hospital_spending_DT)
    
    • .选取数据集的某些列

    对于选取数据列,我们可以使用dplyr包中的select函数。
    另一方面,我们只需在data.table中指定对应的列名即可。

    选取一个变量

    from_dplyr <- select(hospital_spending,Hospital_Name)
    from_data_table <- hospital_spending_DT[,.(Hospital_Name)]
    
    ####from_data_table <- hospital_spending_DT[,Hospital_Name]
    ##直接这样也是可以的
    

    对比下dplyr和data.table给出的结果是否相同:

    compare(from_dplyr,from_data_table,allowAll = TRUE)
    

    删除一个变量

    from_dplyr <- select(hospital_spending,-Hospital_Name)
    from_data_table <- hospital_spending_DT[,Hospital_Name := NULL]
    ####另一种写法
    #from_data_table = hospital_spending_DT[,!c("Hospital.Name"),with=FALSE]
    
    compare(from_dplyr,from_data_table,allowAll = TRUE)
    

    对copy()函数所复制的输入对象得到的引用执行任何操作都不会对原始数据对象产生任何影响。如下所示:

    DT=copy(hospital_spending_DT)
    "Hospital_Name"%in% names(DT)
    
    #删除其中一列
    DT <- DT[,!c("Hospital_Name"),with = FALSE]
    "Hospital_Name"%in% names(DT)   #验证是不是在里面
    

    删除多个变量

    DT=copy(hospital_spending_DT)
    
    DT=DT[,c("Hospital_Name","State","Measure.Start.Date","Measure.End.Date"):=NULL] 
    
    c("Hospital_Name","State","Measure.Start.Date","Measure.End.Date") %in% names(DT)
    
    

    选取多个变量

    from_dplyr = select(hospital_spending, Hospital.Name,State,Measure.Start.Date,Measure.End.Date)
    from_data_table = hospital_spending_DT[,.(Hospital.Name,State,Measure.Start.Date,Measure.End.Date)]
    compare(from_dplyr,from_data_table, allowAll=TRUE)
    TRUE
    dropped attributes 
    

    删除多个变量

    现在,我们要删除hospital_spending数据框和data.table类型数据hospital_spending_DT中的变量Hospital.Name,State,Measure.Start.Date,Measure.End.Date:

    from_dplyr = select(hospital_spending, -c(Hospital.Name,State,Measure.Start.Date,Measure.End.Date))
    from_data_table = hospital_spending_DT[,!c("Hospital.Name","State","Measure.Start.Date","Measure.End.Date"),with=FALSE]
    compare(from_dplyr,from_data_table, allowAll=TRUE)
    TRUE
    dropped attributes
    

    dplyr包中有contains(),starts_with(),ends_with()三个函数,它们可以跟select()函数一起结合使用。对于data.table,我们则可以使用正则表达式。下面我们将选取所有列名包含字符“Date”的列,示例如下:

    from_dplyr = select(hospital_spending,contains("Date"))
    from_data_table = subset(hospital_spending_DT,select=grep("Date",names(hospital_spending_DT)))
    compare(from_dplyr,from_data_table, allowAll=TRUE)
    names(from_dplyr)
    

    重命名列名

    setnames(hospital_spending_DT,c("Hospital.Name", "Measure.Start.Date","Measure.End.Date"), c("Hospital","Start_Date","End_Date"))
    
    names(hospital_spending_DT)
    
    "Hospital" "Provider.Number." "State" "Period" "Claim.Type" "Avg.Spending.Per.Episode..Hospital." "Avg.Spending.Per.Episode..State." "Avg.Spending.Per.Episode..Nation." "Percent.of.Spending..Hospital." "Percent.of.Spending..State." "Percent.of.Spending..Nation." "Start_Date" "End_Date" 
    
    hospital_spending = rename(hospital_spending,Hospital= Hospital.Name, Start_Date=Measure.Start.Date,End_Date=Measure.End.Date)
    
    compare(hospital_spending,hospital_spending_DT, allowAll=TRUE)
    
    TRUE
      dropped attributes
    

    筛选行

    对于数据集特定行的筛选,我们可以使用dplyr包中的filter函数,它通过可能包含正则表达式的逻辑语句来实现该功能。在data.table中,我们只需使用逻辑语句就可以了。

    对单个变量进行筛选

    # selecting rows for California
    from_dplyr = filter(hospital_spending,State=='CA') 
    
    from_data_table = hospital_spending_DT[State=='CA']
    
    compare(from_dplyr,from_data_table, allowAll=TRUE)
    
    TRUE
      dropped attributes
    

    对多个变量进行筛选

    from_dplyr = filter(hospital_spending,State=='CA' & Claim.Type!="Hospice") 
    from_data_table = hospital_spending_DT[State=='CA' & Claim.Type!="Hospice"]
    
    compare(from_dplyr,from_data_table, allowAll=TRUE)
    
    TRUE
      dropped attributes
    
    from_dplyr = filter(hospital_spending,State %in% c('CA','MA',"TX")) 
    from_data_table = hospital_spending_DT[State %in% c('CA','MA',"TX")]
    
    unique(from_dplyr$State)
    CA MA TX 
    
    compare(from_dplyr,from_data_table, allowAll=TRUE)
    
    TRUE
    dropped attributes
    

    数据排序

    我们使用dplyr包中的arrange()函数对数据行进行排序,可以实现对一个或多个变量的数据行进行排序。如果想实现降序,需使用如下代码所示的desc()函数。以下示例演示了如何对数据行进行升序和降序排序:

    #升序 
    from_dplyr = arrange(hospital_spending, State)
    from_data_table = setorder(hospital_spending_DT, State)
    
    compare(from_dplyr,from_data_table, allowAll=TRUE)
    
    #降序 
    from_dplyr = arrange(hospital_spending, desc(State))
    from_data_table = setorder(hospital_spending_DT, -State)
    
    compare(from_dplyr,from_data_table, allowAll=TRUE)
    
    

    对多变量进行排序

    以下代码实现了State变量升序,End_Date变量降序排序:

    from_dplyr = arrange(hospital_spending, State,desc(End_Date))
    from_data_table = setorder(hospital_spending_DT, State,-End_Date)
    
    compare(from_dplyr,from_data_table, allowAll=TRUE)
    
    

    添加或更新列

    在dplyr包中,使用mutate()函数来添加新列。在data.table包中,我们可以使用:=引用来添加或更新列:

    from_dplyr = mutate(hospital_spending, diff=Avg.Spending.Per.Episode..State. - Avg.Spending.Per.Episode..Nation.)
    from_data_table = copy(hospital_spending_DT)
    from_data_table = from_data_table[,diff := Avg.Spending.Per.Episode..State. - Avg.Spending.Per.Episode..Nation.]
    compare(from_dplyr,from_data_table, allowAll=TRUE)
    
    
    from_dplyr = mutate(hospital_spending, diff1=Avg.Spending.Per.Episode..State. - Avg.Spending.Per.Episode..Nation.,diff2=End_Date-Start_Date)
    from_data_table = copy(hospital_spending_DT)
    from_data_table = from_data_table[,c("diff1","diff2") := list(Avg.Spending.Per.Episode..State. - Avg.Spending.Per.Episode..Nation.,diff2=End_Date-Start_Date)]
    compare(from_dplyr,from_data_table, allowAll=TRUE)
    
    

    数据汇总

    我们可以使用dplyr包中的summarise()函数来创建概括性统计量:

    summarize(hospital_spending,mean=mean(Avg.Spending.Per.Episode..Nation.))
    mean 1820.409
    
    hospital_spending_DT[,.(mean=mean(Avg.Spending.Per.Episode..Nation.))]
    mean 1820.409
    
    summarize(hospital_spending,mean=mean(Avg.Spending.Per.Episode..Nation.),
                                maximum=max(Avg.Spending.Per.Episode..Nation.),
                                minimum=min(Avg.Spending.Per.Episode..Nation.),
                                median=median(Avg.Spending.Per.Episode..Nation.))
    mean     maximum   minimum  median
    1820.409  20025       0      109
    
    hospital_spending_DT[,.(mean=mean(Avg.Spending.Per.Episode..Nation.),
                            maximum=max(Avg.Spending.Per.Episode..Nation.),
                            minimum=min(Avg.Spending.Per.Episode..Nation.),
                            median=median(Avg.Spending.Per.Episode..Nation.))]
    mean      maximum   minimum  median
    1820.409  20025       0      109
    

    当然,我们也可以对各分组的数据块分别求概述性统计量。在dplyr中使用group_by()函数,data.table中指定by参数即可:

    head(hospital_spending_DT[,.(mean=mean(Avg.Spending.Per.Episode..Hospital.)),by=.(Hospital)])
    
    mygroup= group_by(hospital_spending,Hospital,State)
    from_dplyr = summarize(mygroup,mean=mean(Avg.Spending.Per.Episode..Hospital.))
    
    from_data_table=hospital_spending_DT[,.(mean=mean(Avg.Spending.Per.Episode..Hospital.)), by=.(Hospital,State)]
    
    compare(from_dplyr,from_data_table, allowAll=TRUE)
    

    链式操作

    在dplyr和data.table包中,我们可以使用链式操作来实现代码的连续性。在dplyr中,使用magrittr包中的%>%管道函数非常酷。%>%的功能是用于实现将一个函数的输出传递给下一个函数的第一个参数。在data.table中,我们可以使用%>%或[来实现链式操作。

    from_dplyr=hospital_spending %>% group_by(Hospital,State) %>% summarize(mean=mean(Avg.Spending.Per.Episode..Hospital.))
    
    from_data_table=hospital_spending_DT[,.(mean=mean(Avg.Spending.Per.Episode..Hospital.)), by=.(Hospital,State)]
    
    compare(from_dplyr,from_data_table, allowAll=TRUE)
    
    
    hospital_spending %>% group_by(State) %>% summarize(mean=mean(Avg.Spending.Per.Episode..Hospital.)) %>% 
    arrange(desc(mean)) %>% head(10) %>% 
            mutate(State = factor(State,levels = State[order(mean,decreasing =TRUE)])) %>% 
              ggplot(aes(x=State,y=mean))+geom_bar(stat='identity',color='darkred',fill='skyblue')+
              xlab("")+ggtitle('Average Spending Per Episode by State')+
              ylab('Average')+ coord_cartesian(ylim = c(3800, 4000))
    
    hospital_spending_DT[,.(mean=mean(Avg.Spending.Per.Episode..Hospital.)),
                                         by=.(State)][order(-mean)][1:10] %>% 
                mutate(State = factor(State,levels = State[order(mean,decreasing =TRUE)])) %>% 
               ggplot(aes(x=State,y=mean))+geom_bar(stat='identity',color='darkred',fill='skyblue')+
              xlab("")+ggtitle('Average Spending Per Episode by State')+
              ylab('Average')+ coord_cartesian(ylim = c(3800, 4000))
    

    相关文章

      网友评论

        本文标题:dplyr和data.table

        本文链接:https://www.haomeiwen.com/subject/qkpsaxtx.html