美文网首页
R字符串处理2

R字符串处理2

作者: 一刀YiDao | 来源:发表于2016-09-02 15:49 被阅读49次

    1 、连接字符串:str_c()

    str_c(..., sep = "", collapse = NULL)
    
    

    sep:连接各个参数时,插入在相邻字符串之间的分隔符
    collapse:若不为NULL,则用该字符作为分隔符,把结果向量合并成一个字符串

    示例

    > str_c("Letter: ", letters)
     [1] "Letter: a" "Letter: b" "Letter: c" "Letter: d" "Letter: e" "Letter: f" "Letter: g" "Letter: h"
     [9] "Letter: i" "Letter: j" "Letter: k" "Letter: l" "Letter: m" "Letter: n" "Letter: o" "Letter: p"
    [17] "Letter: q" "Letter: r" "Letter: s" "Letter: t" "Letter: u" "Letter: v" "Letter: w" "Letter: x"
    [25] "Letter: y" "Letter: z"
    
    > str_c("Letter", letters, sep = ": ")
     [1] "Letter: a" "Letter: b" "Letter: c" "Letter: d" "Letter: e" "Letter: f" "Letter: g" "Letter: h"
     [9] "Letter: i" "Letter: j" "Letter: k" "Letter: l" "Letter: m" "Letter: n" "Letter: o" "Letter: p"
    [17] "Letter: q" "Letter: r" "Letter: s" "Letter: t" "Letter: u" "Letter: v" "Letter: w" "Letter: x"
    [25] "Letter: y" "Letter: z"
    
    > str_c(letters, " is for", "...")
     [1] "a is for..." "b is for..." "c is for..." "d is for..." "e is for..." "f is for..." "g is for..."
     [8] "h is for..." "i is for..." "j is for..." "k is for..." "l is for..." "m is for..." "n is for..."
    [15] "o is for..." "p is for..." "q is for..." "r is for..." "s is for..." "t is for..." "u is for..."
    [22] "v is for..." "w is for..." "x is for..." "y is for..." "z is for..."
    
    > str_c(letters[-26], " comes before ", letters[-1])
     [1] "a comes before b" "b comes before c" "c comes before d" "d comes before e" "e comes before f"
     [6] "f comes before g" "g comes before h" "h comes before i" "i comes before j" "j comes before k"
    [11] "k comes before l" "l comes before m" "m comes before n" "n comes before o" "o comes before p"
    [16] "p comes before q" "q comes before r" "r comes before s" "s comes before t" "t comes before u"
    [21] "u comes before v" "v comes before w" "w comes before x" "x comes before y" "y comes before z"
    > str_c(letters, collapse = "")
    [1] "abcdefghijklmnopqrstuvwxyz"
    
    > str_c(letters, collapse = ", ")
    [1] "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z"
    
    > str_c(c("a", NA, "b"), "-d")
    [1] "a-d"  "NA-d" "b-d"
    
    

    2 、字符串计数:str_count()

    str_count(string, pattern = "")
    
    

    string:字符串
    pattern:要计数的匹配模式,默认按正则表达式解析;使用fixed()可按字面量匹配

    示例

    > fruit <- c("apple", "banana", "pear", "pineapple")
    > str_count(fruit, "a")
    [1] 1 3 1 1
    > str_count(fruit, "p")
    [1] 2 0 1 3
    > str_count(fruit, "e")
    [1] 1 0 1 2
    > str_count(fruit, c("a", "b", "p", "p"))
    [1] 1 1 1 3
    > str_count(c("a.", "...", ".a.a"), ".")
    [1] 2 3 4
    > str_count(c("a.", "...", ".a.a"), fixed("."))
    [1] 1 3 2
    
    

    3 、字符串逻辑判断:str_detect()

    str_detect(string, pattern = "")
    
    

    string:字符串
    pattern:要检测的匹配模式,默认按正则表达式解析

    示例

    > fruit <- c("apple", "banana", "pear", "pineapple")
    > str_detect(fruit, "a")
    [1] TRUE TRUE TRUE TRUE
    > str_detect(fruit, "^a")
    [1]  TRUE FALSE FALSE FALSE
    > str_detect(fruit, "a$")
    [1] FALSE  TRUE FALSE FALSE
    > str_detect(fruit, "b")
    [1] FALSE  TRUE FALSE FALSE
    > str_detect(fruit, "[aeiou]")
    [1] TRUE TRUE TRUE TRUE
    > # Also vectorised over pattern
    > str_detect("aecfg", letters)
     [1]  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
    [17] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
    
    

    4 、复制字符串:str_dup()

    str_dup(string, times)
    
    

    string:字符串
    times:复制的次数

    示例

    > fruit <- c("apple", "pear", "banana")
    > str_dup(fruit, 2)
    [1] "appleapple"   "pearpear"     "bananabanana"
    > str_dup(fruit, 1:3)
    [1] "apple"              "pearpear"           "bananabananabanana"
    > str_c("ba", str_dup("na", 0:5))
    [1] "ba"           "bana"         "banana"       "bananana"     "banananana"   "bananananana"
    

    5 、从字符串中提取匹配字符:str_extract()

    str_extract(string, pattern)
    str_extract_all(string, pattern, simplify = FALSE)
    

    string:字符串
    pattern:要匹配的模式,默认按正则表达式解析,也可用fixed()指定按字面量匹配

    示例

    
    > shopping_list <- c("apples x4", "bag of flour", "bag of sugar", "milk x2")
    > str_extract(shopping_list, "\\d")
    [1] "4" NA  NA  "2"
    > str_extract(shopping_list, "[a-z]+")
    [1] "apples" "bag"    "bag"    "milk"  
    > str_extract(shopping_list, "[a-z]{1,4}")
    [1] "appl" "bag"  "bag"  "milk"
    > str_extract(shopping_list, "\\b[a-z]{1,4}\\b")
    [1] NA     "bag"  "bag"  "milk"
    > str_extract_all(shopping_list, "[a-z]+")
    [[1]]
    [1] "apples" "x"     
    
    [[2]]
    [1] "bag"   "of"    "flour"
    
    [[3]]
    [1] "bag"   "of"    "sugar"
    
    [[4]]
    [1] "milk" "x"   
    
    > str_extract_all(shopping_list, "\\b[a-z]+\\b")
    [[1]]
    [1] "apples"
    
    [[2]]
    [1] "bag"   "of"    "flour"
    
    [[3]]
    [1] "bag"   "of"    "sugar"
    
    [[4]]
    [1] "milk"
    
    > str_extract_all(shopping_list, "\\d")
    [[1]]
    [1] "4"
    
    [[2]]
    character(0)
    
    [[3]]
    character(0)
    
    [[4]]
    [1] "2"
    
    

    6 、字符串的长度:str_length()

    str_length(string)
    

    string:字符串

    示例

    
    > str_length(letters)
     [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
    > str_length(NA)
    [1] NA
    > str_length(factor("abc"))
    [1] 3
    > str_length(c("i", "like", "programming", NA))
    [1]  1  4 11 NA
    > # Two ways of representing a u with an umlaut
    > u1 <- "\u00fc"
    > u2 <- stringi::stri_trans_nfd(u1)
    > # They print the same:
    > u1
    [1] "ü"
    > u2
    [1] "ü"
    > # But have a different length
    > str_length(u1)
    [1] 1
    > str_length(u2)
    [1] 2
    
    

    7 、找到匹配的字符串的位置:str_locate()

    str_locate(string, pattern)
    str_locate_all(string, pattern)
    

    string:字符串
    pattern:要匹配的模式,默认按正则表达式解析,也可用fixed()指定按字面量匹配

    示例

    > fruit <- c("apple", "banana", "pear", "pineapple")
    > str_locate(fruit, "$")
         start end
    [1,]     6   5
    [2,]     7   6
    [3,]     5   4
    [4,]    10   9
    > str_locate(fruit, "a")
         start end
    [1,]     1   1
    [2,]     2   2
    [3,]     3   3
    [4,]     5   5
    > str_locate(fruit, "e")
         start end
    [1,]     5   5
    [2,]    NA  NA
    [3,]     2   2
    [4,]     4   4
    > str_locate(fruit, c("a", "b", "p", "p"))
         start end
    [1,]     1   1
    [2,]     1   1
    [3,]     1   1
    [4,]     1   1
    > str_locate_all(fruit, "a")
    [[1]]
         start end
    [1,]     1   1
    
    [[2]]
         start end
    [1,]     2   2
    [2,]     4   4
    [3,]     6   6
    
    [[3]]
         start end
    [1,]     3   3
    
    [[4]]
         start end
    [1,]     5   5
    
    > str_locate_all(fruit, "e")
    [[1]]
         start end
    [1,]     5   5
    
    [[2]]
         start end
    
    [[3]]
         start end
    [1,]     2   2
    
    [[4]]
         start end
    [1,]     4   4
    [2,]     9   9
    
    > str_locate_all(fruit, "")
    [[1]]
         start end
    [1,]     1   0
    [2,]     2   1
    [3,]     3   2
    [4,]     4   3
    [5,]     5   4
    
    [[2]]
         start end
    [1,]     1   0
    [2,]     2   1
    [3,]     3   2
    [4,]     4   3
    [5,]     5   4
    [6,]     6   5
    
    [[3]]
         start end
    [1,]     1   0
    [2,]     2   1
    [3,]     3   2
    [4,]     4   3
    
    [[4]]
          start end
     [1,]     1   0
     [2,]     2   1
     [3,]     3   2
     [4,]     4   3
     [5,]     5   4
     [6,]     6   5
     [7,]     7   6
     [8,]     8   7
     [9,]     9   8
    
    

    8 、从字符串中提取匹配组:str_match()

    str_match(string, pattern)
    str_match_all(string, pattern)
    

    string:字符串
    pattern:要匹配的模式,默认按正则表达式解析,也可用fixed()指定按字面量匹配

    示例

    
    > strings <- c(" 219 733 8965", "329-293-8753 ", "banana", "595 794 7569",
    + "387 287 6718", "apple", "233.398.9187 ", "482 952 3315",
    + "239 923 8115 and 842 566 4692", "Work: 579-499-7527", "$1000",
    + "Home: 543.355.3679")
    > phone <- "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})"
    > str_extract(strings, phone)
     [1] "219 733 8965" "329-293-8753" NA             "595 794 7569" "387 287 6718" NA            
     [7] "233.398.9187" "482 952 3315" "239 923 8115" "579-499-7527" NA             "543.355.3679"
    > str_match(strings, phone)
          [,1]           [,2]  [,3]  [,4]  
     [1,] "219 733 8965" "219" "733" "8965"
     [2,] "329-293-8753" "329" "293" "8753"
     [3,] NA             NA    NA    NA    
     [4,] "595 794 7569" "595" "794" "7569"
     [5,] "387 287 6718" "387" "287" "6718"
     [6,] NA             NA    NA    NA    
     [7,] "233.398.9187" "233" "398" "9187"
     [8,] "482 952 3315" "482" "952" "3315"
     [9,] "239 923 8115" "239" "923" "8115"
    [10,] "579-499-7527" "579" "499" "7527"
    [11,] NA             NA    NA    NA    
    [12,] "543.355.3679" "543" "355" "3679"
    > str_extract_all(strings, phone)
    [[1]]
    [1] "219 733 8965"
    
    [[2]]
    [1] "329-293-8753"
    
    [[3]]
    character(0)
    
    [[4]]
    [1] "595 794 7569"
    
    [[5]]
    [1] "387 287 6718"
    
    [[6]]
    character(0)
    
    [[7]]
    [1] "233.398.9187"
    
    [[8]]
    [1] "482 952 3315"
    
    [[9]]
    [1] "239 923 8115" "842 566 4692"
    
    [[10]]
    [1] "579-499-7527"
    
    [[11]]
    character(0)
    
    [[12]]
    [1] "543.355.3679"
    
    > str_match_all(strings, phone)
    [[1]]
         [,1]           [,2]  [,3]  [,4]  
    [1,] "219 733 8965" "219" "733" "8965"
    
    [[2]]
         [,1]           [,2]  [,3]  [,4]  
    [1,] "329-293-8753" "329" "293" "8753"
    
    [[3]]
    character(0)
    
    [[4]]
         [,1]           [,2]  [,3]  [,4]  
    [1,] "595 794 7569" "595" "794" "7569"
    
    [[5]]
         [,1]           [,2]  [,3]  [,4]  
    [1,] "387 287 6718" "387" "287" "6718"
    
    [[6]]
    character(0)
    
    [[7]]
         [,1]           [,2]  [,3]  [,4]  
    [1,] "233.398.9187" "233" "398" "9187"
    
    [[8]]
         [,1]           [,2]  [,3]  [,4]  
    [1,] "482 952 3315" "482" "952" "3315"
    
    [[9]]
         [,1]           [,2]  [,3]  [,4]  
    [1,] "239 923 8115" "239" "923" "8115"
    [2,] "842 566 4692" "842" "566" "4692"
    
    [[10]]
         [,1]           [,2]  [,3]  [,4]  
    [1,] "579-499-7527" "579" "499" "7527"
    
    [[11]]
    character(0)
    
    [[12]]
         [,1]           [,2]  [,3]  [,4]  
    [1,] "543.355.3679" "543" "355" "3679"
    
    > x <- c("<a> <b>", "<a> <>", "<a>", "", NA)
    > str_match(x, "<(.*?)> <(.*?)>")
         [,1]      [,2] [,3]
    [1,] "<a> <b>" "a"  "b" 
    [2,] "<a> <>"  "a"  ""  
    [3,] NA        NA   NA  
    [4,] NA        NA   NA  
    [5,] NA        NA   NA  
    > str_match_all(x, "<(.*?)>")
    [[1]]
         [,1]  [,2]
    [1,] "<a>" "a" 
    [2,] "<b>" "b" 
    
    [[2]]
         [,1]  [,2]
    [1,] "<a>" "a" 
    [2,] "<>"  ""  
    
    [[3]]
         [,1]  [,2]
    [1,] "<a>" "a" 
    
    [[4]]
    character(0)
    
    [[5]]
    character(0)
    
    > str_extract(x, "<.*?>")
    [1] "<a>" "<a>" "<a>" NA    NA   
    > str_extract_all(x, "<.*?>")
    [[1]]
    [1] "<a>" "<b>"
    
    [[2]]
    [1] "<a>" "<>" 
    
    [[3]]
    [1] "<a>"
    
    [[4]]
    character(0)
    
    [[5]]
    character(0)
    

    9 、字符串增加空字符:str_pad()

    str_pad(string, width, side = c("left", "right", "both"), pad = " ")
    

    string:字符向量
    width:填充后字符串的最小宽度
    side:在左边,右边,还是两边进行填充
    pad:用于填充的字符,默认为空格

    示例

    > str_pad("conan", 20, "left")
    [1] "               conan"
    > # 从右边补充空格,直到字符串长度为20
    > str_pad("conan", 20, "right")
    [1] "conan               "
    > # 从左右两边各补充空格,直到字符串长度为20
    > str_pad("conan", 20, "both")
    [1] "       conan        "
    > # 从左右两边各补充x字符,直到字符串长度为20
    > str_pad("conan", 20, "both",'x')
    [1] "xxxxxxxconanxxxxxxxx"
    
    

    10 、替换字符串:str_replace()

    str_replace(string, pattern, replacement)
    str_replace_all(string, pattern, replacement)
    

    string:字符向量
    pattern:要匹配的模式,默认按正则表达式解析,也可用fixed()指定按字面量匹配

    示例

    > fruits <- c("one apple", "two pears", "three bananas")
    > str_replace(fruits, "[aeiou]", "-")
    [1] "-ne apple"     "tw- pears"     "thr-e bananas"
    > str_replace_all(fruits, "[aeiou]", "-")
    [1] "-n- -ppl-"     "tw- p--rs"     "thr-- b-n-n-s"
    > str_replace(fruits, "([aeiou])", "")
    [1] "ne apple"     "tw pears"     "thre bananas"
    > str_replace(fruits, "([aeiou])", "\\1\\1")
    [1] "oone apple"     "twoo pears"     "threee bananas"
    > str_replace(fruits, "[aeiou]", c("1", "2", "3"))
    [1] "1ne apple"     "tw2 pears"     "thr3e bananas"
    > str_replace(fruits, c("a", "e", "i"), "-")
    [1] "one -pple"     "two p-ars"     "three bananas"
    > fruits <- c("one apple", "two pears", "three bananas")
    > str_replace(fruits, "[aeiou]", "-")
    [1] "-ne apple"     "tw- pears"     "thr-e bananas"
    > str_replace_all(fruits, "[aeiou]", "-")
    [1] "-n- -ppl-"     "tw- p--rs"     "thr-- b-n-n-s"
    > str_replace_all(fruits, "([aeiou])", "")
    [1] "n ppl"    "tw prs"   "thr bnns"
    > str_replace_all(fruits, "([aeiou])", "\\1\\1")
    [1] "oonee aapplee"      "twoo peeaars"       "threeee baanaanaas"
    > str_replace_all(fruits, "[aeiou]", c("1", "2", "3"))
    [1] "1n1 1ppl1"     "tw2 p22rs"     "thr33 b3n3n3s"
    > str_replace_all(fruits, c("a", "e", "i"), "-")
    [1] "one -pple"     "two p-ars"     "three bananas"
    
    

    11 、分割字符串:str_split()

    str_split(string, pattern, n = Inf, simplify = FALSE)
    str_split_fixed(string, pattern, n)
    

    string:字符向量
    pattern:要匹配的模式,默认按正则表达式解析,也可用fixed()指定按字面量匹配
    simplify: 如果值为FALSE,返回由字符向量组成的列表;如果值为TRUE,返回字符矩阵

    示例

    > fruits <- c(
    + "apples and oranges and pears and bananas",
    + "pineapples and mangos and guavas"
    + )
    > str_split(fruits, " and ")
    [[1]]
    [1] "apples"  "oranges" "pears"   "bananas"
    
    [[2]]
    [1] "pineapples" "mangos"     "guavas"
    
    > str_split(fruits, " and ", n = 3)
    [[1]]
    [1] "apples"            "oranges"           "pears and bananas"
    
    [[2]]
    [1] "pineapples" "mangos"     "guavas"    
    
    > str_split(fruits, " and ", n = 2)
    [[1]]
    [1] "apples"                        "oranges and pears and bananas"
    
    [[2]]
    [1] "pineapples"        "mangos and guavas"
    
    > # If n greater than number of pieces, no padding occurs
    > str_split(fruits, " and ", n = 5)
    [[1]]
    [1] "apples"  "oranges" "pears"   "bananas"
    
    [[2]]
    [1] "pineapples" "mangos"     "guavas"    
    
    > # Use fixed to return a character matrix
    > str_split_fixed(fruits, " and ", 3)
         [,1]         [,2]      [,3]               
    [1,] "apples"     "oranges" "pears and bananas"
    [2,] "pineapples" "mangos"  "guavas"           
    > str_split_fixed(fruits, " and ", 4)
         [,1]         [,2]      [,3]     [,4]     
    [1,] "apples"     "oranges" "pears"  "bananas"
    [2,] "pineapples" "mangos"  "guavas" ""       
    

    12 、截取字符串:str_sub()

    str_sub(string, start = 1L, end = -1L)
    str_sub(string, start = 1L, end = -1L) <- value
    

    string:字符向量
    start:开始位置
    end: 结束位置

    示例

    > hw <- "Hadley Wickham"
    > str_sub(hw, 1, 6)
    [1] "Hadley"
    
    > str_sub(hw, end = 6)
    [1] "Hadley"
    
    > str_sub(hw, 8, 14)
    [1] "Wickham"
    
    > str_sub(hw, 8)
    [1] "Wickham"
    
    > str_sub(hw, c(1, 8), c(6, 14))
    [1] "Hadley"  "Wickham"
    
    > str_sub(hw, -1)
    [1] "m"
    
    > str_sub(hw, -7)
    [1] "Wickham"
    
    > str_sub(hw, end = -7)
    [1] "Hadley W"
    
    > # Alternatively, you can pass in a two column matrix, as in the
    > # output from str_locate_all
    > pos <- str_locate_all(hw, "[aeio]")[[1]]
    > str_sub(hw, pos)
    [1] "adley Wickham" "ey Wickham"    "ickham"        "am"            "adley Wickham" "ey Wickham"   
    [7] "ickham"        "am"       
    
    > str_sub(hw, pos[, 1], pos[, 2])
    [1] "a" "e" "i" "a"
    > # Vectorisation
    
    > str_sub(hw, seq_len(str_length(hw)))
     [1] "Hadley Wickham" "adley Wickham"  "dley Wickham"   "ley Wickham"    "ey Wickham"    
     [6] "y Wickham"      " Wickham"       "Wickham"        "ickham"         "ckham"         
    [11] "kham"           "ham"            "am"             "m"       
    
    > str_sub(hw, end = seq_len(str_length(hw)))
     [1] "H"              "Ha"             "Had"            "Hadl"           "Hadle"         
     [6] "Hadley"         "Hadley "        "Hadley W"       "Hadley Wi"      "Hadley Wic"    
    [11] "Hadley Wick"    "Hadley Wickh"   "Hadley Wickha"  "Hadley Wickham"
    
    > # Replacement form
    > x <- "BBCDEF"
    > str_sub(x, 1, 1) <- "A"; x
    [1] "ABCDEF"
    
    > str_sub(x, -1, -1) <- "K"; x
    [1] "ABCDEK"
    
    > str_sub(x, -2, -2) <- "GHIJ"; x
    [1] "ABCDGHIJK"
    
    > str_sub(x, 2, -2) <- ""; x
    [1] "AK"
    

    13 、删除字符串两端的空白字符:str_trim()

    str_trim(string, side = c("both", "left", "right"))
    
    

    string:字符向量
    side:删除左边,右边,还是两边的空白字符(空格,制表符,换行符等)

    示例

    > str_trim(" String with trailing and leading white space\t")
    [1] "String with trailing and leading white space"
    
    > str_trim("\n\nString with trailing and leading white space\n\n")
    [1] "String with trailing and leading white space"
    
    

    相关文章

      网友评论

          本文标题:R字符串处理2

          本文链接:https://www.haomeiwen.com/subject/gvahettx.html