R语言基础之第五部分 总结数据信息

作者: 多美丽 | 来源:发表于2019-08-25 23:02 被阅读5次

    R语言基础之第五部分 总结数据信息

    1、以R自带数据集airquality为例

    > head(airquality)   #默认给出数据集前6行
      Ozone Solar.R Wind Temp Month Day
    1    41     190  7.4   67     5   1
    2    36     118  8.0   72     5   2
    3    12     149 12.6   74     5   3
    4    18     313 11.5   62     5   4
    5    NA      NA 14.3   56     5   5
    6    28      NA 14.9   66     5   6
    
    > head(airquality,10)    #给出数据集前10行
    > tail(airquality)      #默认给出数据集最后6行
    > summary(airquality)
       Ozone           Solar.R           Wind             Temp           Month      
     Min.   :  1.00   Min.   :  7.0   Min.   : 1.700   Min.   :56.00   Min.   :5.000  
     1st Qu.: 18.00   1st Qu.:115.8   1st Qu.: 7.400   1st Qu.:72.00   1st Qu.:6.000  
     Median : 31.50   Median :205.0   Median : 9.700   Median :79.00   Median :7.000  
     Mean   : 42.13   Mean   :185.9   Mean   : 9.958   Mean   :77.88   Mean   :6.993  
     3rd Qu.: 63.25   3rd Qu.:258.8   3rd Qu.:11.500   3rd Qu.:85.00   3rd Qu.:8.000  
     Max.   :168.00   Max.   :334.0   Max.   :20.700   Max.   :97.00   Max.   :9.000  
     NA's   :37       NA's   :7                                                       
          Day      
     Min.   : 1.0  
     1st Qu.: 8.0  
     Median :16.0  
     Mean   :15.8  
     3rd Qu.:23.0  
     Max.   :31.0  
    
    > str(airquality)    #查看变量属性
    'data.frame':   153 obs. of  6 variables:
     $ Ozone  : int  41 36 12 18 NA 28 23 19 8 NA ...
     $ Solar.R: int  190 118 149 313 NA NA 299 99 19 194 ...
     $ Wind   : num  7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
     $ Temp   : int  67 72 74 62 56 66 65 59 61 69 ...
     $ Month  : int  5 5 5 5 5 5 5 5 5 5 ...
     $ Day    : int  1 2 3 4 5 6 7 8 9 10 ...
    
    > table(airquality$Month)  #统计月份的频数
    
     5  6  7  8  9 
    31 30 31 31 30
    
    > table(airquality$Ozone)  #Ozone含缺失值,但table()默认不统计NA(如需统计可加 useNA = "ifany")
    
      1   4   6   7   8   9  10  11  12  13  14  16  18  19  20  21  22  23  24  27  28 
      1   1   1   3   1   3   1   3   2   4   4   4   4   1   4   4   1   6   2   1   3 
     29  30  31  32  34  35  36  37  39  40  41  44  45  46  47  48  49  50  52  59  61 
      1   2   1   3   1   2   2   2   2   1   1   3   2   1   1   1   1   1   1   2   1 
     63  64  65  66  71  73  76  77  78  79  80  82  84  85  89  91  96  97 108 110 115 
      1   2   1   1   1   2   1   1   2   1   1   1   1   2   1   1   1   2   1   1   1 
    118 122 135 168 
      1   1   1   1 
    
    > table(airquality$Month,airquality$Day)
       
        1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
      5 1 1 1 1 1 1 1 1 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
      6 1 1 1 1 1 1 1 1 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
      7 1 1 1 1 1 1 1 1 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
      8 1 1 1 1 1 1 1 1 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
      9 1 1 1 1 1 1 1 1 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
       
        31
      5  1
      6  0
      7  1
      8  1
      9  0
    
    > any(is.na(airquality$Month))    #Month变量是否有缺失值
    [1] FALSE
    
    > sum(airquality$Month)
    [1] 1070
    
    > all(airquality$Month < 11)   #是否所有的月份都是小于11月
    [1] TRUE
    

    2、以R自带数据集Titanic为例

    > Titanic
    , , Age = Child, Survived = No
    
          Sex
    Class  Male Female
      1st     0      0
      2nd     0      0
      3rd    35     17
      Crew    0      0
    
    , , Age = Adult, Survived = No
    
          Sex
    Class  Male Female
      1st   118      4
      2nd   154     13
      3rd   387     89
      Crew  670      3
    
    , , Age = Child, Survived = Yes
    
          Sex
    Class  Male Female
      1st     5      1
      2nd    11     13
      3rd    13     14
      Crew    0      0
    
    , , Age = Adult, Survived = Yes
    
          Sex
    Class  Male Female
      1st    57    140
      2nd    14     80
      3rd    75     76
      Crew  192     20
    
    > titanic = as.data.frame(Titanic)  
    > head(titanic,10) 
       Class    Sex   Age Survived Freq
    1    1st   Male Child       No    0
    2    2nd   Male Child       No    0
    3    3rd   Male Child       No   35
    4   Crew   Male Child       No    0
    5    1st Female Child       No    0
    6    2nd Female Child       No    0
    7    3rd Female Child       No   17
    8   Crew Female Child       No    0
    9    1st   Male Adult       No  118
    10   2nd   Male Adult       No  154
    
    > dim(titanic)
    [1] 32  5
    
    > summary(titanic)
      Class       Sex        Age     Survived      Freq       
     1st :8   Male  :16   Child:16   No :16   Min.   :  0.00  
     2nd :8   Female:16   Adult:16   Yes:16   1st Qu.:  0.75  
     3rd :8                                   Median : 13.50  
     Crew:8                                   Mean   : 68.78  
                                              3rd Qu.: 77.00  
                                              Max.   :670.00  
    
    > x= xtabs(Freq ~ Class + Age, data = titanic)  #交叉表
    > x
          Age
    Class  Child Adult
      1st      6   319
      2nd     24   261
      3rd     79   627
      Crew     0   885
    
    > ftable(x)   #扁平化
          Age Child Adult
    Class                
    1st           6   319
    2nd          24   261
    3rd          79   627
    Crew          0   885
    

    后续请参考:
    R语言基础之第部分 : 5种数据对象类型
    R语言基础之第部分 : 操纵数据取子集
    R语言基础之第部分:重要函数apply族函数的使用
    R语言基础之第部分 : 排序

    相关文章

      网友评论

        本文标题:R语言基础之第五部分 总结数据信息

        本文链接:https://www.haomeiwen.com/subject/irtlectx.html