美文网首页生信小白R语言编程进阶
R for data science ||使用stringr处理字符串

R for data science ||使用stringr处理字符串

作者: 周运来就是我 | 来源:发表于2019-08-02 04:49 被阅读14次

    对于非结构化和半结构化的数据,正则表达式可以用非常简练的语言来描述字符串中的模式。第一次见到正则表达式时,你可能会以为这是猫咪在键盘上踩出来的,但随着对它的理解逐渐加深,你就会体会到其中的深刻含义了。

    str_length(c("a", "R for data science", NA))
    #> [1]  1 18 NA
    

    字符串组合

    str_c("x", "y")
    #> [1] "xy"
    str_c("x", "y", "z")
    #> [1] "xyz"
    str_c("x", "y", sep = ", ")
    #> [1] "x, y"
    
    x <- c("abc", NA)
    str_c("|-", x, "-|")
    #> [1] "|-abc-|" NA
    str_c("|-", str_replace_na(x), "-|")
    #> [1] "|-abc-|" "|-NA-|"
    
    str_c("prefix-", c("a", "b", "c"), "-suffix")
    #> [1] "prefix-a-suffix" "prefix-b-suffix" "prefix-c-suffix"
    
    name <- "Hadley"
    time_of_day <- "morning"
    birthday <- FALSE
    
    str_c(
      "Good ", time_of_day, " ", name,
      if (birthday) " and HAPPY BIRTHDAY",
      "."
    )
    #> [1] "Good morning Hadley."
    
    str_c(c("x", "y", "z"), collapse = ", ")
    #> [1] "x, y, z"
    
    

    提取子字符串

    x <- c("Apple", "Banana", "Pear")
    str_sub(x, 1, 3)
    #> [1] "App" "Ban" "Pea"
    # negative numbers count backwards from end
    str_sub(x, -3, -1)
    #> [1] "ple" "ana" "ear"
    
    str_sub("a", 1, 5)
    #> [1] "a"
    
    str_sub(x, 1, 1) <- str_to_lower(str_sub(x, 1, 1))
    x
    #> [1] "apple"  "banana" "pear"
    

    区域设置

    # Turkish has two i's: with and without a dot, and it
    # has a different rule for capitalising them:
    str_to_upper(c("i", "ı"))
    #> [1] "I" "I"
    str_to_upper(c("i", "ı"), locale = "tr")
    #> [1] "İ" "I"
    
    x <- c("apple", "eggplant", "banana")
    
    str_sort(x, locale = "en")  # English
    #> [1] "apple"    "banana"   "eggplant"
    
    str_sort(x, locale = "haw") # Hawaiian
    #> [1] "apple"    "eggplant" "banana"
    
    正则表达式模式匹配
    x <- c("apple", "banana", "pear")
    str_view(x, "an")
    banana
    
    str_view(x, ".a.")
    banana
    pear
    
    锚定

    ^ to match the start of the string.
    $ to match the end of the string.

    x <- c("apple", "banana", "pear")
    str_view(x, "^a")
    apple
    
    str_view(x, "a$")
    
    banana
    
    字符类与字符选项

    \d: matches any digit.
    \s: matches any whitespace (e.g. space, tab, newline).
    [abc]: matches a, b, or c.

    # Look for a literal character that normally has special meaning in a regex
    str_view(c("abc", "a.c", "a*c", "a c"), "a[.]c")
    a.c
    
    str_view(c("grey", "gray"), "gr(e|a)y")
    grey
    gray
    
    
    重复

    ?: 0 or 1
    +: 1 or more
    *: 0 or more

    x <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII"
    str_view(x, "CC?")
    

    1888 is the longest year in Roman numerals: MD<CC>CLXXXVIII  (<> 标出匹配部分)

    str_view(x, "CC+")
    

    1888 is the longest year in Roman numerals: MD<CCC>LXXXVIII  (<> 标出匹配部分)

    str_view(x, 'C[LX]+')
    

    1888 is the longest year in Roman numerals: MDCC<CLXXX>VIII  (<> 标出匹配部分)

    {n}: exactly n
    {n,}: n or more
    {,m}: at most m
    {n,m}: between n and m
    str_view(x, "C{2}")
    
    分组与回溯引用
    str_view(fruit, "(..)\\1", match = TRUE)
    

    banana
    coconut
    cucumber
    jujube
    papaya
    salal berry

    匹配检测
    x <- c("apple", "banana", "pear")
    str_detect(x, "e")
    #> [1]  TRUE FALSE  TRUE
    
    # How many common words start with t?
    sum(str_detect(words, "^t"))
    #> [1] 65
    # What proportion of common words end with a vowel?
    mean(str_detect(words, "[aeiou]$"))
    #> [1] 0.277
    
    # Find all words containing at least one vowel, and negate
    no_vowels_1 <- !str_detect(words, "[aeiou]")
    # Find all words consisting only of consonants (non-vowels)
    no_vowels_2 <- str_detect(words, "^[^aeiou]+$")
    identical(no_vowels_1, no_vowels_2)
    #> [1] TRUE
    
    words[str_detect(words, "x$")]
    #> [1] "box" "sex" "six" "tax"
    str_subset(words, "x$")
    #> [1] "box" "sex" "six" "tax"
    
    df <- tibble(
      word = words, 
      i = seq_along(word)
    )
    df %>% 
      filter(str_detect(word, "x$"))
    #> # A tibble: 4 x 2
    #>   word      i
    #>   <chr> <int>
    #> 1 box     108
    #> 2 sex     747
    #> 3 six     772
    #> 4 tax     841
    
    x <- c("apple", "banana", "pear")
    str_count(x, "a")
    #> [1] 1 3 1
    
    # On average, how many vowels per word?
    mean(str_count(words, "[aeiou]"))
    #> [1] 1.99
    
    df %>% 
      mutate(
        vowels = str_count(word, "[aeiou]"),
        consonants = str_count(word, "[^aeiou]")
      )
    #> # A tibble: 980 x 4
    #>   word         i vowels consonants
    #>   <chr>    <int>  <int>      <int>
    #> 1 a            1      1          0
    #> 2 able         2      2          2
    #> 3 about        3      3          2
    #> 4 absolute     4      4          4
    #> 5 accept       5      2          4
    #> 6 account      6      3          4
    #> # … with 974 more rows
    
    str_count("abababa", "aba")
    #> [1] 2
    str_view_all("abababa", "aba")
    

    <aba>b<aba>  (<> 标出匹配部分;匹配不会重叠,所以只有 2 处)

    提取匹配内容
    length(sentences)
    #> [1] 720
    head(sentences)
    #> [1] "The birch canoe slid on the smooth planks." 
    #> [2] "Glue the sheet to the dark blue background."
    #> [3] "It's easy to tell the depth of a well."     
    #> [4] "These days a chicken leg is a rare dish."   
    #> [5] "Rice is often served in round bowls."       
    #> [6] "The juice of lemons makes fine punch."
    
    colours <- c("red", "orange", "yellow", "green", "blue", "purple")
    colour_match <- str_c(colours, collapse = "|")
    colour_match
    #> [1] "red|orange|yellow|green|blue|purple"
    
    has_colour <- str_subset(sentences, colour_match)
    matches <- str_extract(has_colour, colour_match)
    head(matches)
    #> [1] "blue" "blue" "red"  "red"  "red"  "blue"
    
    more <- sentences[str_count(sentences, colour_match) > 1]
    str_view_all(more, colour_match)
    

    It is hard to erase blue or red ink.
    The green light in the brown box flickered.
    The sky in the west is tinged with orange red.

    str_extract(more, colour_match)
    #> [1] "blue"   "green"  "orange"
    
    分组匹配
    noun <- "(a|the) ([^ ]+)"
    
    has_noun <- sentences %>%
      str_subset(noun) %>%
      head(10)
    has_noun %>% 
      str_extract(noun)
    #>  [1] "the smooth" "the sheet"  "the depth"  "a chicken"  "the parked"
    #>  [6] "the sun"    "the huge"   "the ball"   "the woman"  "a helps"
    
    tibble(sentence = sentences) %>% 
      tidyr::extract(
        sentence, c("article", "noun"), "(a|the) ([^ ]+)", 
        remove = FALSE
      )
    #> # A tibble: 720 x 3
    #>   sentence                                    article noun   
    #>   <chr>                                       <chr>   <chr>  
    #> 1 The birch canoe slid on the smooth planks.  the     smooth 
    #> 2 Glue the sheet to the dark blue background. the     sheet  
    #> 3 It's easy to tell the depth of a well.      the     depth  
    #> 4 These days a chicken leg is a rare dish.    a       chicken
    #> 5 Rice is often served in round bowls.        <NA>    <NA>   
    #> 6 The juice of lemons makes fine punch.       <NA>    <NA>   
    #> # … with 714 more rows
    
    替换匹配内容
    x <- c("apple", "pear", "banana")
    str_replace(x, "[aeiou]", "-")
    #> [1] "-pple"  "p-ar"   "b-nana"
    str_replace_all(x, "[aeiou]", "-")
    #> [1] "-ppl-"  "p--r"   "b-n-n-"
    
    x <- c("1 house", "2 cars", "3 people")
    str_replace_all(x, c("1" = "one", "2" = "two", "3" = "three"))
    #> [1] "one house"    "two cars"     "three people"
    
    sentences %>% 
      str_replace("([^ ]+) ([^ ]+) ([^ ]+)", "\\1 \\3 \\2") %>% 
      head(5)
    #> [1] "The canoe birch slid on the smooth planks." 
    #> [2] "Glue sheet the to the dark blue background."
    #> [3] "It's to easy tell the depth of a well."     
    #> [4] "These a days chicken leg is a rare dish."   
    #> [5] "Rice often is served in round bowls."
    
    拆分
    sentences %>%
      head(5) %>% 
      str_split(" ")
    #> [[1]]
    #> [1] "The"     "birch"   "canoe"   "slid"    "on"      "the"     "smooth" 
    #> [8] "planks."
    #> 
    #> [[2]]
    #> [1] "Glue"        "the"         "sheet"       "to"          "the"        
    #> [6] "dark"        "blue"        "background."
    #> 
    #> [[3]]
    #> [1] "It's"  "easy"  "to"    "tell"  "the"   "depth" "of"    "a"     "well."
    #> 
    #> [[4]]
    #> [1] "These"   "days"    "a"       "chicken" "leg"     "is"      "a"      
    #> [8] "rare"    "dish."  
    #> 
    #> [[5]]
    #> [1] "Rice"   "is"     "often"  "served" "in"     "round"  "bowls."
    
    "a|b|c|d" %>% 
      str_split("\\|") %>% 
      .[[1]]
    #> [1] "a" "b" "c" "d"
    
    sentences %>%
      head(5) %>% 
      str_split(" ", simplify = TRUE)
    #>      [,1]    [,2]    [,3]    [,4]      [,5]  [,6]    [,7]    
    #> [1,] "The"   "birch" "canoe" "slid"    "on"  "the"   "smooth"
    #> [2,] "Glue"  "the"   "sheet" "to"      "the" "dark"  "blue"  
    #> [3,] "It's"  "easy"  "to"    "tell"    "the" "depth" "of"    
    #> [4,] "These" "days"  "a"     "chicken" "leg" "is"    "a"     
    #> [5,] "Rice"  "is"    "often" "served"  "in"  "round" "bowls."
    #>      [,8]          [,9]   
    #> [1,] "planks."     ""     
    #> [2,] "background." ""     
    #> [3,] "a"           "well."
    #> [4,] "rare"        "dish."
    #> [5,] ""            ""
    

    r4ds

    相关文章

      网友评论

        本文标题:R for data science ||使用stringr处理

        本文链接:https://www.haomeiwen.com/subject/vabarctx.html