美文网首页
R语言package3: stringr

R语言package3: stringr

作者: 小程的学习笔记 | 来源:发表于2023-03-10 23:51 被阅读0次

    ❀ str_c:字符串拼接,与R语言自带的paste和paste0函数具有相似的作用

    str_c(..., sep = "", collapse = NULL)

    library(stringr)
    # 默认不使用分隔符(sep = "")直接拼接
    str_c("Letter: ", letters)
    ## [1] "Letter: a" "Letter: b" "Letter: c" "Letter: d" "Letter: e" "Letter: f" "Letter: g" "Letter: h" "Letter: i"
    ## [10] "Letter: j" "Letter: k" "Letter: l" "Letter: m" "Letter: n" "Letter: o" "Letter: p" "Letter: q" "Letter: r"
    ## [19] "Letter: s" "Letter: t" "Letter: u" "Letter: v" "Letter: w" "Letter: x" "Letter: y" "Letter: z"
    
    str_c(letters, " is for", "...")
    ## [1] "a is for..." "b is for..." "c is for..." "d is for..." "e is for..." "f is for..." "g is for..."
    ## [8] "h is for..." "i is for..." "j is for..." "k is for..." "l is for..." "m is for..." "n is for..."
    ## [15] "o is for..." "p is for..." "q is for..." "r is for..." "s is for..." "t is for..." "u is for..."
    ## [22] "v is for..." "w is for..." "x is for..." "y is for..." "z is for..."
    
    # 指定向量分隔符
    str_c("Letter", letters, sep = ": ")
    ## [1] "Letter: a" "Letter: b" "Letter: c" "Letter: d" "Letter: e" "Letter: f" "Letter: g" "Letter: h" "Letter: i"
    ## [10] "Letter: j" "Letter: k" "Letter: l" "Letter: m" "Letter: n" "Letter: o" "Letter: p" "Letter: q" "Letter: r"
    ## [19] "Letter: s" "Letter: t" "Letter: u" "Letter: v" "Letter: w" "Letter: x" "Letter: y" "Letter: z"
    
    str_c(letters, collapse = ", ")
    ## [1] "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z"
    
    # 与 paste() 的区别
    str_c(c("a", NA, "b"), "-d")
    ## [1] "a-d" NA    "b-d"
    
    paste0(c("a", NA, "b"), "-d")
    ## [1] "a-d"  "NA-d" "b-d"
    
    # 使用 str_replace_na() 将缺失值 NA 转换为字符串 "NA"
    str_c(str_replace_na(c("a", NA, "b")), "-d")
    ## [1] "a-d"  "NA-d" "b-d"
    

    ★ collapse:可选的字符串;若指定,则以它作为连接符把结果向量合并为单个字符串
    ★ str_c 类似于 paste0(),但遵循 tidyverse 的向量循环(recycling)规则和 NA 传播规则(任一输入为 NA 时结果为 NA)

    ❀ str_length:计算长度/宽度

    str_length(string) # 返回字符串中代码点的数量。

    str_length(letters)
    ## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
    
    str_length(NA)
    ## [1] NA
    
    str_length(factor("abc"))
    ## [1] 3
    
    str_length(c("i", "like", "programming", NA))
    ## [1]  1  4 11 NA
    

    ❀ str_subset:查找匹配元素

    str_subset(string, pattern, negate = FALSE)

    fruit <- c("apple", "banana", "pear", "pineapple")
    
    str_subset(fruit, "a")
    ## [1] "apple"     "banana"    "pear"      "pineapple"
    
    str_subset(fruit, "^a")
    ## [1] "apple"
    
    str_subset(fruit, "a$")
    ## [1] "banana"
    
    str_subset(fruit, "b")
    ## [1] "banana"
    
    str_subset(fruit, "[aeiou]")
    ## [1] "apple"     "banana"    "pear"      "pineapple"
    
    # 不匹配的元素
    str_subset(fruit, "^p", negate = TRUE)
    ## [1] "apple"  "banana"
    

    ★ negate:若为 TRUE,则返回不匹配 pattern 的元素

    ❀ str_sub:使用它们的位置获取和设置子串

    str_sub(string, start = 1L, end = -1L)
    str_sub(string, start = 1L, end = -1L, omit_na = FALSE) <- value
    str_sub_all(string, start = 1L, end = -1L)

    hw <- "Hadley Wickham"
    
    str_sub(hw, 1, 6) # 1和6分别是子串的起始和终止位置
    ## [1] "Hadley"
    
    str_sub(hw, end = 6)
    ## [1] "Hadley"
    
    str_sub(hw, 8)
    ## [1] "Wickham"
    
    # 正数表示从前往后数,负数表示从后往前数
    str_sub(hw, -7)
    ## [1] "Wickham"
    
    str_sub(hw, end = -7)
    ## [1] "Hadley W"
    
    # 1和8分别是两个子集的起始位置,6和14分别是两个子集的终止位置
    str_sub(hw, c(1, 8), c(6, 14))
    ## [1] "Hadley"  "Wickham"
    
    # 若想从多个字符串中提取多个位置,使用str_sub_all()
    x <- c("abcde", "ghifgh")
    str_sub(x, c(1, 2), c(2, 4))
    ## [1] "ab"  "hif"
    
    str_sub_all(x, start = c(1, 2), end = c(2, 4))
    ## [[1]]
    ## [1] "ab"  "bcd"
    ## 
    ## [[2]]
    ## [1] "gh"  "hif"
    
    # 使用str_sub()函数的赋值形式来修改字符串
    x <- "BBCDEF"
    str_sub(x, 1, 1) <- "A"; x
    ## [1] "ABCDEF"
    
    str_sub(x, -1, -1) <- "K"; x
    ## [1] "ABCDEK"
    
    str_sub(x, -2, -2) <- "GHIJ"; x
    ## [1] "ABCDGHIJK"
    
    str_sub(x, 2, -2) <- ""; x
    ## [1] "AK"
    

    ❀ str_locate:查找匹配的位置

    str_locate(string, pattern)
    str_locate_all(string, pattern)

    fruit <- c("apple", "banana", "pear", "pineapple")
    
    # 返回一个两列(start、end)的整数矩阵,每个输入元素对应一行,给出其第一个匹配的起止位置
    str_locate(fruit, "$")
    ##      start end
    ## [1,]     6   5
    ## [2,]     7   6
    ## [3,]     5   4
    ## [4,]    10   9
    
    str_locate(fruit, "e")
    ##      start end
    ## [1,]     5   5
    ## [2,]    NA  NA
    ## [3,]     2   2
    ## [4,]     4   4
    
    str_locate(fruit, c("a", "b", "p", "p"))
    ##      start end
    ## [1,]     1   1
    ## [2,]     1   1
    ## [3,]     1   1
    ## [4,]     1   1
    
    # 返回一个矩阵列表:每个输入元素对应一个矩阵,包含该元素中所有匹配的起止位置
    str_locate_all(fruit, "a")
    ## [[1]]
    ##      start end
    ## [1,]     1   1
    ## 
    ## [[2]]
    ##      start end
    ## [1,]     2   2
    ## [2,]     4   4
    ## [3,]     6   6
    ## 
    ## [[3]]
    ##      start end
    ## [1,]     3   3
    ## 
    ## [[4]]
    ##      start end
    ## [1,]     5   5
    
    str_locate_all(fruit, c("a", "b", "p", "p"))
    ## [[1]]
    ##      start end
    ## [1,]     1   1
    ## 
    ## [[2]]
    ##      start end
    ## [1,]     1   1
    ## 
    ## [[3]]
    ##      start end
    ## [1,]     1   1
    ## 
    ## [[4]]
    ##      start end
    ## [1,]     1   1
    ## [2,]     6   6
    ## [3,]     7   7
    

    ❀ str_extract:字符串提取

    str_extract(string, pattern, group = NULL)
    str_extract_all(string, pattern, simplify = FALSE)

    shopping_list <- c("apples x4", "bag of flour", "bag of sugar", "milk x2")
    str_extract(shopping_list, "\\d") # \\d 匹配一个数字字符, 等价于 [0-9]
    ## [1] "4" NA  NA  "2"
    
    str_extract(shopping_list, "[a-z]+")
    ## [1] "apples" "bag"    "bag"    "milk" 
    
    str_extract(shopping_list, "[a-z]{1,4}")
    ## [1] "appl" "bag"  "bag"  "milk"
    
    str_extract(shopping_list, "\\b[a-z]{1,4}\\b")  # \\b特指单词边界 
    ## [1] NA     "bag"  "bag"  "milk"
    
    str_extract(shopping_list, "([a-z]+) of ([a-z]+)")
    ## [1] NA             "bag of flour" "bag of sugar" NA 
    
    # 提取所有匹配项
    str_extract_all(shopping_list, "[a-z]+")
    ## [[1]]
    ## [1] "apples" "x"     
    ## 
    ## [[2]]
    ## [1] "bag"   "of"    "flour"
    ## 
    ## [[3]]
    ## [1] "bag"   "of"    "sugar"
    ## 
    ## [[4]]
    ## [1] "milk" "x"
    
    str_extract_all(shopping_list, "\\b[a-z]+\\b")
    ## [[1]]
    ## [1] "apples"
    ## 
    ## [[2]]
    ## [1] "bag"   "of"    "flour"
    ## 
    ## [[3]]
    ## [1] "bag"   "of"    "sugar"
    ## 
    ## [[4]]
    ## [1] "milk"
    
    str_extract_all(shopping_list, "\\d")
    ## [[1]]
    ## [1] "4"
    ## 
    ## [[2]]
    ## character(0)
    ## 
    ## [[3]]
    ## character(0)
    ## 
    ## [[4]]
    ## [1] "2"
    
    # 将结果简化为字符矩阵
    str_extract_all(shopping_list, "\\b[a-z]+\\b", simplify = TRUE)
    ##      [,1]     [,2] [,3]   
    ## [1,] "apples" ""   ""     
    ## [2,] "bag"    "of" "flour"
    ## [3,] "bag"    "of" "sugar"
    ## [4,] "milk"   ""   ""  
    
    str_extract_all(shopping_list, "\\d", simplify = TRUE)
    ##      [,1]
    ## [1,] "4" 
    ## [2,] ""  
    ## [3,] ""  
    ## [4,] "2"
    

    ❀ str_dup:字符串复制

    str_dup(string, times) # times表示重复的次数

    fruit <- c("apple", "pear", "banana")
    str_dup(fruit, 2)
    ## [1] "appleapple"   "pearpear"     "bananabanana"
    
    str_dup(fruit, 1:3)
    ## [1] "apple"              "pearpear"           "bananabananabanana"
    
    str_c("ba", str_dup("na", 0:5))
    ## [1] "ba"           "bana"         "banana"       "bananana"     "banananana"   "bananananana"
    

    ❀ str_count:字符串计数

    str_count(string, pattern = "")

    fruit <- c("apple", "banana", "pear", "pineapple")
    str_count(fruit, "a")
    ## [1] 1 3 1 1
    
    str_count(fruit, c("a", "b", "p", "p"))
    ## [1] 1 1 1 3
    

    ❀ str_split:字符串分割

    str_split(string, pattern, n = Inf, simplify = FALSE) # 接受一个字符向量并返回一个列表
    str_split_1(string, pattern) # 接受一个字符串并返回一个字符向量
    str_split_fixed(string, pattern, n) # 接受一个字符向量并返回一个矩阵
    str_split_i(string, pattern, i) # 接受一个字符向量并返回一个字符向量

    fruits <- c(
      "apples and oranges and pears and bananas",
      "pineapples and mangos and guavas"
    )
    
    str_split(fruits, " and ")
    ## [[1]]
    ## [1] "apples"  "oranges" "pears"   "bananas"
    ## 
    ## [[2]]
    ## [1] "pineapples" "mangos"     "guavas" 
    
    str_split(fruits, " and ", simplify = TRUE)
    ##      [,1]         [,2]      [,3]     [,4]     
    ## [1,] "apples"     "oranges" "pears"  "bananas"
    ## [2,] "pineapples" "mangos"  "guavas" ""  
    
    # 拆分单个字符串
    str_split_1(fruits[[1]], " and ")
    ## [1] "apples"  "oranges" "pears"   "bananas"
    
    # 指定 n 以限制可能匹配的数量
    str_split(fruits, " and ", n = 3)
    ## [[1]]
    ## [1] "apples"            "oranges"           "pears and bananas"
    ## 
    ## [[2]]
    ## [1] "pineapples" "mangos"     "guavas" 
    
    # 若 n 大于实际拆分出的片段数,则不会进行填充
    str_split(fruits, " and ", n = 5)
    ## [[1]]
    ## [1] "apples"  "oranges" "pears"   "bananas"
    ## 
    ## [[2]]
    ## [1] "pineapples" "mangos"     "guavas"  
    
    # 使用 str_split_fixed() 返回一个固定列数的字符矩阵
    str_split_fixed(fruits, " and ", 3)
    ##      [,1]         [,2]      [,3]               
    ## [1,] "apples"     "oranges" "pears and bananas"
    ## [2,] "pineapples" "mangos"  "guavas"
    
    # 仅从字符串中提取单个片段
    str_split_i(fruits, " and ", 1)
    ## [1] "apples"     "pineapples"
    
    str_split_i(fruits, " and ", 4)
    ## [1] "bananas" NA
    
    # 使用负数从末尾选择
    str_split_i(fruits, " and ", -1)
    ## [1] "bananas" "guavas"
    

    ❀ str_replace:字符串替换

    str_replace(string, pattern, replacement)
    str_replace_all(string, pattern, replacement)

    fruits <- c("one apple", "two pears", "three bananas")
    str_replace(fruits, "[aeiou]", "-")
    ## [1] "-ne apple"     "tw- pears"     "thr-e bananas"
    
    str_replace_all(fruits, "[aeiou]", "-")
    ## [1] "-n- -ppl-"     "tw- p--rs"     "thr-- b-n-n-s"
    
    str_replace(fruits, "([aeiou])", "\\1\\1") # \\1 引用第一个捕获组匹配到的内容
    ## [1] "oone apple"     "twoo pears"     "threee bananas"
    
    # str_replace() 对 string、pattern 和 replacement 三个参数都是向量化的
    str_replace(fruits, "[aeiou]", c("1", "2", "3"))
    ## [1] "1ne apple"     "tw2 pears"     "thr3e bananas"
    
    # 将多个模式和替换应用于同一字符串
    fruits %>%
      str_c(collapse = "---") %>%
      str_replace_all(c("one" = "1", "two" = "2", "three" = "3"))
    ## [1] "1 apple---2 pears---3 bananas"
    

    参考:

    1. https://www.rdocumentation.org/packages/stringr/versions/1.5.0

    相关文章

      网友评论

          本文标题:R语言package3: stringr

          本文链接:https://www.haomeiwen.com/subject/ysxrkdtx.html