美文网首页
R,笔记04

R,笔记04

作者: 按着易得 | 来源:发表于2018-12-16 00:46 被阅读0次

    数据处理

    > # 抽取数据去重复
    > de_dup <- function()
    + {
    +   i <- which(duplicated(iris))
    +   x <- iris[-i, ]
    +   #print(x)
    + }
    > head(de_dup())
      Sepal.Length Sepal.Width Petal.Length Petal.Width Species
    1          5.1         3.5          1.4         0.2  setosa
    2          4.9         3.0          1.4         0.2  setosa
    3          4.7         3.2          1.3         0.2  setosa
    4          4.6         3.1          1.5         0.2  setosa
    5          5.0         3.6          1.4         0.2  setosa
    6          5.4         3.9          1.7         0.4  setosa
    #或者
    iris[!duplicated(iris), ]
    

    去掉NA

    > head(airquality[complete.cases(airquality), ])
      Ozone Solar.R Wind Temp Month Day
    1    41     190  7.4   67     5   1
    2    36     118  8.0   72     5   2
    3    12     149 12.6   74     5   3
    4    18     313 11.5   62     5   4
    7    23     299  8.6   65     5   7
    8    19      99 13.8   59     5   8
    > # 或者na.omit(airquality)
    

    with() identical() within()函数

    > # 用with()函数计算鸢尾花,花萼与花瓣的长度比
    > rwith <- with(iris, Sepal.Length / Petal.Length)
    > head(rwith)
    [1] 3.642857 3.500000 3.615385 3.066667 3.571429 3.176471
    > 
    > # identical()基本作用是检测两个对象是否完全相同,相同返回TRUE,否则返回FALSE
    > 
    > # within函数与with类似,但主要用于列运算,将运算结果放入新列
    > myiris <- iris # 不破坏内建数据集
    > myiris <- within(myiris, lenth.ratio <- Sepal.Length / Petal.Length)
    > head(myiris)
      Sepal.Length Sepal.Width Petal.Length Petal.Width Species lenth.ratio
    1          5.1         3.5          1.4         0.2  setosa    3.642857
    2          4.9         3.0          1.4         0.2  setosa    3.500000
    3          4.7         3.2          1.3         0.2  setosa    3.615385
    4          4.6         3.1          1.5         0.2  setosa    3.066667
    5          5.0         3.6          1.4         0.2  setosa    3.571429
    6          5.4         3.9          1.7         0.4  setosa    3.176471
    

    分割数据

    > # 分割数据
    > # cut()将数据按取值范围切割成若干等宽区间(各区间内的个数不一定相等),处理后的数据是factor数据型态
    > # 将state.x77对象依人口数做分割,分成5个等宽区间
    > popu <- state.x77[, "Population"]
    > cutpopu <- cut(popu, 5)
    > head(cutpopu)
    [1] (344,4.53e+03]     (344,4.53e+03]     (344,4.53e+03]     (344,4.53e+03]     (1.7e+04,2.12e+04] (344,4.53e+03]    
    Levels: (344,4.53e+03] (4.53e+03,8.7e+03] (8.7e+03,1.29e+04] (1.29e+04,1.7e+04] (1.7e+04,2.12e+04]
    
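    > #分割时,可用labels给各区间命名。注意:labels按区间由小到大依次对应,
    > #所以下面的写法实际上把人口最少的区间标成了"high",人口最多的区间标成了"low";
    > #若想让人口最多的区间叫"high",应写成 labels = c("low", "4th", "3rd", "2nd", "high")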
    > cut(popu, 5, labels =  c ("high", "2nd", "3rd", "4th", "low"))
     [1] high high high high low  high high high 2nd  2nd  high high 3rd  2nd  high high high high high high 2nd  3rd  high high 2nd  high high high high 2nd  high low 
    [33] 2nd  high 3rd  high high 3rd  high high high high 3rd  high high 2nd  high high 2nd  high
    Levels: high 2nd 3rd 4th low
    > 
    > #要了解每一人口数分类有多少州
    > x.popu <- cut(popu, 5, labels =  c ("high", "2nd", "3rd", "4th", "low"))
    > table(x.popu)
    x.popu
    high  2nd  3rd  4th  low 
      34    9    5    0    2 
    

    合并数据

    准备数据库
    > mystates.x77 <- as.data.frame(state.x77)
    > mystates.x77$name <- rownames(state.x77) # 给新数据增加一个字段name
    > head(mystates.x77)
               Population Income Illiteracy Life Exp Murder HS Grad Frost   Area       name
    Alabama          3615   3624        2.1    69.05   15.1    41.3    20  50708    Alabama
    Alaska            365   6315        1.5    69.31   11.3    66.7   152 566432     Alaska
    Arizona          2212   4530        1.8    70.55    7.8    58.1    15 113417    Arizona
    Arkansas         2110   3378        1.9    70.66   10.1    39.9    65  51945   Arkansas
    California      21198   5114        1.1    71.71   10.3    62.6    20 156361 California
    Colorado         2541   4884        0.7    72.06    6.8    63.9   166 103766   Colorado
    > row.names(mystates.x77) <- NULL # 删除原来行名
    > head(mystates.x77)
      Population Income Illiteracy Life Exp Murder HS Grad Frost   Area       name
    1       3615   3624        2.1    69.05   15.1    41.3    20  50708    Alabama
    2        365   6315        1.5    69.31   11.3    66.7   152 566432     Alaska
    3       2212   4530        1.8    70.55    7.8    58.1    15 113417    Arizona
    4       2110   3378        1.9    70.66   10.1    39.9    65  51945   Arkansas
    5      21198   5114        1.1    71.71   10.3    62.6    20 156361 California
    6       2541   4884        0.7    72.06    6.8    63.9   166 103766   Colorado
    
    > #人口大于500万的选出来(原单位是千人数),同时新对象要有2个字段name 和 population
    > mypopu.states <- mystates.x77[mystates.x77$Population > 5000, c("name", "Population")]
    > mypopu.states
                 name Population
    5      California      21198
    9         Florida       8277
    13       Illinois      11197
    14        Indiana       5313
    21  Massachusetts       5814
    22       Michigan       9111
    30     New Jersey       7333
    32       New York      18076
    33 North Carolina       5441
    35           Ohio      10735
    38   Pennsylvania      11860
    43          Texas      12237
    
    > #选出人均收入(Income字段,单位美元)大于5000美元的。同时新对象要有2个字段name 和 Income
    > myincomes.states <- mystates.x77[mystates.x77$Income > 5000, c("name", "Income")]
    > myincomes.states
               name Income
    2        Alaska   6315
    5    California   5114
    7   Connecticut   5348
    13     Illinois   5107
    20     Maryland   5299
    28       Nevada   5149
    30   New Jersey   5237
    34 North Dakota   5087
    

    merge

    > # merge()交集合并。merge(x, y, all = FALSE),默认是交集合并
    > # 合并上述两个数据中人口数超500万的州和人均收入超5000美元的州
    > merge(mypopu.states, myincomes.states)
            name Population Income
    1 California      21198   5114
    2   Illinois      11197   5107
    3 New Jersey       7333   5237
    > 
    > # 取并集
    > merge(mypopu.states, myincomes.states, all = T)
                 name Population Income
    1          Alaska         NA   6315
    2      California      21198   5114
    3     Connecticut         NA   5348
    4         Florida       8277     NA
    5        Illinois      11197   5107
    6         Indiana       5313     NA
    7        Maryland         NA   5299
    8   Massachusetts       5814     NA
    9        Michigan       9111     NA
    10         Nevada         NA   5149
    11     New Jersey       7333   5237
    12       New York      18076     NA
    13 North Carolina       5441     NA
    14   North Dakota         NA   5087
    15           Ohio      10735     NA
    16   Pennsylvania      11860     NA
    17          Texas      12237     NA
    > 
    > # merge参数all.x = T, 保证第一个对象的元素在合并中都存在,第二个如没有则NA填充
    > merge(mypopu.states, myincomes.states, all.x = T)
                 name Population Income
    1      California      21198   5114
    2         Florida       8277     NA
    3        Illinois      11197   5107
    4         Indiana       5313     NA
    5   Massachusetts       5814     NA
    6        Michigan       9111     NA
    7      New Jersey       7333   5237
    8        New York      18076     NA
    9  North Carolina       5441     NA
    10           Ohio      10735     NA
    11   Pennsylvania      11860     NA
    12          Texas      12237     NA
    

    match

    > # match()类似于取两个对象交集,即第一对象x的某行数据若在第二个对象y中找到符合条件的数据,则返回第二个对象中
    > # 相应数据的位置,否则返回NA。所以match后会返回一个与第一个对象长度相同的向量。
    > 
    > # 找出符合人口数多于500万,同时人均收入超5000美元的行数据,在对象myincomes.states中的位置,返回的向量中非NA的数值即是要的结果。
    > my.index <- match(mypopu.states$name, myincomes.states$name)
    > my.index
     [1]  2 NA  4 NA NA NA  7 NA NA NA NA NA
    > 
    > # 提取出myincomes.states中人口数多于500万,同时人均收入超5000美元的州的数据。
    > myincomes.states[na.omit(my.index), ]
             name Income
    5  California   5114
    13   Illinois   5107
    30 New Jersey   5237
    > 
    > # %in%将返回与第一个对象长度相同的逻辑向量,在向量中为TRUE的元素是我们要的数据
    > my.index2 <- mypopu.states$name %in% myincomes.states$name
    > my.index2
     [1]  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
    > 
    > #抽出mypopu.states中人口多于500万,同时人均收入过5000美元的州数据
    > mypopu.states[my.index2, ]
             name Population
    5  California      21198
    13   Illinois      11197
    30 New Jersey       7333
    > 
    > # 换种做法
    > my.index <- match(mypopu.states$name, myincomes.states$name)
    > my.index3 <- !is.na(my.index) #my.index中不是NA的赋值给my.index3
    > my.index3
     [1]  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
    > mypopu.states[my.index3, ]
             name Population
    5  California      21198
    13   Illinois      11197
    30 New Jersey       7333
    

    排序

    > # 排序sort/order
    > # 数据框的排序,对state.info数据框依据Income字段执行升序排列。
    > mystate.info <- data.frame(Region = state.region, state.x77)
    > mystate.info
                          Region Population Income Illiteracy Life.Exp Murder HS.Grad Frost   Area
    Alabama                South       3615   3624        2.1    69.05   15.1    41.3    20  50708
    Alaska                  West        365   6315        1.5    69.31   11.3    66.7   152 566432
    Arizona                 West       2212   4530        1.8    70.55    7.8    58.1    15 113417
    Arkansas               South       2110   3378        1.9    70.66   10.1    39.9    65  51945
    California              West      21198   5114        1.1    71.71   10.3    62.6    20 156361
    Colorado                West       2541   4884        0.7    72.06    6.8    63.9   166 103766
    Connecticut        Northeast       3100   5348        1.1    72.48    3.1    56.0   139   4862
    Delaware               South        579   4809        0.9    70.06    6.2    54.6   103   1982
    Florida                South       8277   4815        1.3    70.66   10.7    52.6    11  54090
    Georgia                South       4931   4091        2.0    68.54   13.9    40.6    60  58073
    Hawaii                  West        868   4963        1.9    73.60    6.2    61.9     0   6425
    Idaho                   West        813   4119        0.6    71.87    5.3    59.5   126  82677
    Illinois       North Central      11197   5107        0.9    70.14   10.3    52.6   127  55748
    Indiana        North Central       5313   4458        0.7    70.88    7.1    52.9   122  36097
    Iowa           North Central       2861   4628        0.5    72.56    2.3    59.0   140  55941
    Kansas         North Central       2280   4669        0.6    72.58    4.5    59.9   114  81787
    Kentucky               South       3387   3712        1.6    70.10   10.6    38.5    95  39650
    Louisiana              South       3806   3545        2.8    68.76   13.2    42.2    12  44930
    Maine              Northeast       1058   3694        0.7    70.39    2.7    54.7   161  30920
    Maryland               South       4122   5299        0.9    70.22    8.5    52.3   101   9891
    Massachusetts      Northeast       5814   4755        1.1    71.83    3.3    58.5   103   7826
    Michigan       North Central       9111   4751        0.9    70.63   11.1    52.8   125  56817
    Minnesota      North Central       3921   4675        0.6    72.96    2.3    57.6   160  79289
    Mississippi            South       2341   3098        2.4    68.09   12.5    41.0    50  47296
    Missouri       North Central       4767   4254        0.8    70.69    9.3    48.8   108  68995
    Montana                 West        746   4347        0.6    70.56    5.0    59.2   155 145587
    Nebraska       North Central       1544   4508        0.6    72.60    2.9    59.3   139  76483
    Nevada                  West        590   5149        0.5    69.03   11.5    65.2   188 109889
    New Hampshire      Northeast        812   4281        0.7    71.23    3.3    57.6   174   9027
    New Jersey         Northeast       7333   5237        1.1    70.93    5.2    52.5   115   7521
    New Mexico              West       1144   3601        2.2    70.32    9.7    55.2   120 121412
    New York           Northeast      18076   4903        1.4    70.55   10.9    52.7    82  47831
    North Carolina         South       5441   3875        1.8    69.21   11.1    38.5    80  48798
    North Dakota   North Central        637   5087        0.8    72.78    1.4    50.3   186  69273
    Ohio           North Central      10735   4561        0.8    70.82    7.4    53.2   124  40975
    Oklahoma               South       2715   3983        1.1    71.42    6.4    51.6    82  68782
    Oregon                  West       2284   4660        0.6    72.13    4.2    60.0    44  96184
    Pennsylvania       Northeast      11860   4449        1.0    70.43    6.1    50.2   126  44966
    Rhode Island       Northeast        931   4558        1.3    71.90    2.4    46.4   127   1049
    South Carolina         South       2816   3635        2.3    67.96   11.6    37.8    65  30225
    South Dakota   North Central        681   4167        0.5    72.08    1.7    53.3   172  75955
    Tennessee              South       4173   3821        1.7    70.11   11.0    41.8    70  41328
    Texas                  South      12237   4188        2.2    70.90   12.2    47.4    35 262134
    Utah                    West       1203   4022        0.6    72.90    4.5    67.3   137  82096
    Vermont            Northeast        472   3907        0.6    71.64    5.5    57.1   168   9267
    Virginia               South       4981   4701        1.4    70.08    9.5    47.8    85  39780
    Washington              West       3559   4864        0.6    71.72    4.3    63.5    32  66570
    West Virginia          South       1799   3617        1.4    69.48    6.7    41.6   100  24070
    Wisconsin      North Central       4589   4468        0.7    72.48    3.0    54.5   149  54464
    Wyoming                 West        376   4566        0.6    70.29    6.9    62.9   173  97203
    > head(mystate.info)
               Region Population Income Illiteracy Life.Exp Murder HS.Grad Frost   Area
    Alabama     South       3615   3624        2.1    69.05   15.1    41.3    20  50708
    Alaska       West        365   6315        1.5    69.31   11.3    66.7   152 566432
    Arizona      West       2212   4530        1.8    70.55    7.8    58.1    15 113417
    Arkansas    South       2110   3378        1.9    70.66   10.1    39.9    65  51945
    California   West      21198   5114        1.1    71.71   10.3    62.6    20 156361
    Colorado     West       2541   4884        0.7    72.06    6.8    63.9   166 103766
    > state.info <- mystate.info[1:15, ]
    > inc.order <- order(state.info$Income) # 默认升序
    > state.info[inc.order, ]
                       Region Population Income Illiteracy Life.Exp Murder HS.Grad Frost   Area
    Arkansas            South       2110   3378        1.9    70.66   10.1    39.9    65  51945
    Alabama             South       3615   3624        2.1    69.05   15.1    41.3    20  50708
    Georgia             South       4931   4091        2.0    68.54   13.9    40.6    60  58073
    Idaho                West        813   4119        0.6    71.87    5.3    59.5   126  82677
    Indiana     North Central       5313   4458        0.7    70.88    7.1    52.9   122  36097
    Arizona              West       2212   4530        1.8    70.55    7.8    58.1    15 113417
    Iowa        North Central       2861   4628        0.5    72.56    2.3    59.0   140  55941
    Delaware            South        579   4809        0.9    70.06    6.2    54.6   103   1982
    Florida             South       8277   4815        1.3    70.66   10.7    52.6    11  54090
    Colorado             West       2541   4884        0.7    72.06    6.8    63.9   166 103766
    Hawaii               West        868   4963        1.9    73.60    6.2    61.9     0   6425
    Illinois    North Central      11197   5107        0.9    70.14   10.3    52.6   127  55748
    California           West      21198   5114        1.1    71.71   10.3    62.6    20 156361
    Connecticut     Northeast       3100   5348        1.1    72.48    3.1    56.0   139   4862
    Alaska               West        365   6315        1.5    69.31   11.3    66.7   152 566432
    > 
    > # 排序时增加次要键值,格式:order(主要键值,次要键值,……)
    > # 以state.info 数据框为例,将Region作为主要键值,Income作为次要键值,升序排。
    > inc.order2 <- order(state.info$Region, state.info$Income)
    > state.info[inc.order2, ]
                       Region Population Income Illiteracy Life.Exp Murder HS.Grad Frost   Area
    Connecticut     Northeast       3100   5348        1.1    72.48    3.1    56.0   139   4862
    Arkansas            South       2110   3378        1.9    70.66   10.1    39.9    65  51945
    Alabama             South       3615   3624        2.1    69.05   15.1    41.3    20  50708
    Georgia             South       4931   4091        2.0    68.54   13.9    40.6    60  58073
    Delaware            South        579   4809        0.9    70.06    6.2    54.6   103   1982
    Florida             South       8277   4815        1.3    70.66   10.7    52.6    11  54090
    Indiana     North Central       5313   4458        0.7    70.88    7.1    52.9   122  36097
    Iowa        North Central       2861   4628        0.5    72.56    2.3    59.0   140  55941
    Illinois    North Central      11197   5107        0.9    70.14   10.3    52.6   127  55748
    Idaho                West        813   4119        0.6    71.87    5.3    59.5   126  82677
    Arizona              West       2212   4530        1.8    70.55    7.8    58.1    15 113417
    Colorado             West       2541   4884        0.7    72.06    6.8    63.9   166 103766
    Hawaii               West        868   4963        1.9    73.60    6.2    61.9     0   6425
    California           West      21198   5114        1.1    71.71   10.3    62.6    20 156361
    Alaska               West        365   6315        1.5    69.31   11.3    66.7   152 566432
    > # 在排序结果中South排在Northeast和North Central之间,错了吗?没有错:state.region是一个因子(用class()可知)。
    > # 对因子而言,order()是按levels的顺序(Northeast、South、North Central、West)排序,而不是按字母顺序,所以应该小心。
    > 
    > # 混合排序。部分字段升序排,部分字段降序排:用xtfrm()可将原向量转为数值向量,想让某个字段降序排时,在xtfrm()前加上负号(-)即可
    > 
    > #以state.info为例,将Region作为主要键值升序排,Income作次要键值降序排。
    > mix.order <- order(state.info$Region, -xtfrm(state.info$Income))
    > state.info[mix.order, ]
                       Region Population Income Illiteracy Life.Exp Murder HS.Grad Frost   Area
    Connecticut     Northeast       3100   5348        1.1    72.48    3.1    56.0   139   4862
    Florida             South       8277   4815        1.3    70.66   10.7    52.6    11  54090
    Delaware            South        579   4809        0.9    70.06    6.2    54.6   103   1982
    Georgia             South       4931   4091        2.0    68.54   13.9    40.6    60  58073
    Alabama             South       3615   3624        2.1    69.05   15.1    41.3    20  50708
    Arkansas            South       2110   3378        1.9    70.66   10.1    39.9    65  51945
    Illinois    North Central      11197   5107        0.9    70.14   10.3    52.6   127  55748
    Iowa        North Central       2861   4628        0.5    72.56    2.3    59.0   140  55941
    Indiana     North Central       5313   4458        0.7    70.88    7.1    52.9   122  36097
    Alaska               West        365   6315        1.5    69.31   11.3    66.7   152 566432
    California           West      21198   5114        1.1    71.71   10.3    62.6    20 156361
    Hawaii               West        868   4963        1.9    73.60    6.2    61.9     0   6425
    Colorado             West       2541   4884        0.7    72.06    6.8    63.9   166 103766
    Arizona              West       2212   4530        1.8    70.55    7.8    58.1    15 113417
    Idaho                West        813   4119        0.6    71.87    5.3    59.5   126  82677
    

    公式符号等

    > # 公式符号,指的是统计学符号,基本的如下
    > # y ~ a y是a的函数
    > # y ~ a + b y是a和b的函数
    > # y ~ a - b y是a的函数但排除b
    > 
    > # 认识长格式数据(Long Format)与宽格式数据(Wide Format)
    > # reshape2扩展包的melt()函数/dcast()函数
    
    

    相关文章

      网友评论

          本文标题:R,笔记04

          本文链接:https://www.haomeiwen.com/subject/jjnzhqtx.html