美文网首页
R 包学习 - stringr()

R 包学习 - stringr()

作者: Thinkando | 来源:发表于2020-05-18 22:30 被阅读0次

    stringr: R 语言字符串处理包

    • 字符串拼接函数
      str_c: 字符串拼接。
      str_join: 字符串拼接,同str_c(已废弃,请改用str_c)。
      str_trim: 去掉字符串首尾的空白字符(空格、TAB(\t)、换行符(\n)等)
      str_pad: 补充字符串的长度
      str_dup: 复制字符串
      str_wrap: 控制字符串输出格式
      str_sub: 截取字符串

    • 字符串计算函数

    str_count: 字符串计数
    str_length: 字符串长度
    str_sort: 字符串值排序
    str_order: 字符串索引排序,规则同str_sort

    • 字符串匹配函数

    str_split: 字符串分割
    str_split_fixed: 字符串分割,规则同str_split,但返回固定列数的矩阵(matrix)
    str_subset: 返回匹配的字符串
    word: 从文本中提取单词
    str_detect: 检查字符串是否匹配指定的模式,返回逻辑值
    str_match: 从字符串中提取匹配组。
    str_match_all: 从字符串中提取所有匹配组,规则同str_match,返回列表
    str_replace: 字符串替换
    str_replace_all: 替换所有匹配的字符串,规则同str_replace(str_replace只替换第一个匹配)
    str_replace_na: 把缺失值NA替换为字符串"NA"
    str_locate: 找到匹配的字符串的位置。
    str_locate_all: 找到所有匹配的字符串的位置,规则同str_locate,返回列表
    str_extract: 从字符串中提取匹配字符
    str_extract_all: 从字符串中提取所有匹配字符,规则同str_extract,返回列表

    • 字符串变换函数

    str_conv: 字符编码转换
    str_to_upper: 字符串转成大写
    str_to_lower: 字符串转成小写,规则同str_to_upper
    str_to_title: 字符串转成首字母大写,规则同str_to_upper

    • 参数控制函数,仅用于构造功能的参数,不能独立使用。

    boundary: 定义使用边界
    coll: 定义字符串标准排序规则。
    fixed: 定义用于匹配的字符,包括正则表达式中的转义符
    regex: 定义正则表达式

    > str_c('a','b')
    [1] "ab"
    > str_c('a','b',sep='-')
    [1] "a-b"
    > str_c(c('a','a1'),c('b','b1'),sep='-')
    [1] "a-b"   "a1-b1"
    
    > str_c(head(letters), collapse = "")
    [1] "abcdef"
    > str_c(head(letters), collapse = ", ")
    [1] "a, b, c, d, e, f"
    
    > str_c(c('a','a1'),c('b','b1'),collapse='-')
    [1] "ab-a1b1"
    
    #只过滤左边的空格
    > str_trim("  left space\t\n",side='left') 
    [1] "left space\t\n"
    
    #只过滤右边的空格
    > str_trim("  left space\t\n",side='right')
    [1] "  left space"
    
    #过滤两边的空格
    > str_trim("  left space\t\n",side='both')
    [1] "left space"
    
    #不指定side参数时,默认过滤两边的空白字符(包括\n和\t)
    > str_trim("\nno space\n\t")
    [1] "no space"
    
    # 从左边补充空格,直到字符串长度为20
    > str_pad("conan", 20, "left")
    [1] "               conan"
    
    # 从右边补充空格,直到字符串长度为20
    > str_pad("conan", 20, "right")
    [1] "conan               "
    
    # 从左右两边各补充空格,直到字符串长度为20
    > str_pad("conan", 20, "both")
    [1] "       conan        "
    
    # 从左右两边各补充x字符,直到字符串长度为20
    > str_pad("conan", 20, "both",'x')
    [1] "xxxxxxxconanxxxxxxxx"
    
    > val <- c("abca4", 123, "cba2")
    
    # 复制2次
    > str_dup(val, 2)
    [1] "abca4abca4" "123123"     "cba2cba2"  
    
    # 按位置复制
    > str_dup(val, 1:3)
    [1] "abca4"        "123123"       "cba2cba2cba2"
    
    > txt <- "I am Conan."
    
    # 截取1-4的索引位置的字符串
    > str_sub(txt, 1, 4)
    [1] "I am"
    
    # 截取1-6的索引位置的字符串
    > str_sub(txt, end=6)
    [1] "I am C"
    
    # 截取6到结束的索引位置的字符串
    > str_sub(txt, 6)
    [1] "Conan."
    
    # 分2段截取字符串
    > str_sub(txt, c(1, 4), c(6, 8))
    [1] "I am C" "m Con" 
    
    # 通过负坐标截取字符串
    > str_sub(txt, -3)
    [1] "an."
    > str_sub(txt, end = -3)
    [1] "I am Cona"
    
    > x <- "AAABBBCCC"
    
    # 把字符串第1个位置的字符替换为1
    > str_sub(x, 1, 1) <- 1; x
    [1] "1AABBBCCC"
    
    # 在字符串从2到-2的位置赋值为2345
    > str_sub(x, 2, -2) <- "2345"; x
    [1] "12345C"
    
    > str_count('aaa444sssddd', "a")
    [1] 3
    
    
    > fruit <- c("apple", "banana", "pear", "pineapple")
    > str_count(fruit, "a")
    [1] 1 3 1 1
    > str_count(fruit, "p")
    [1] 2 0 1 3
    
    # 用fixed匹配字符
    > str_count(c("a.", ".", ".a.",NA), fixed("."))
    [1]  1  1  2 NA
    
    # 用\\.(转义后的点号)匹配字符"."
    > str_count(c("a.", ".", ".a.",NA), "\\.")
    [1]  1  1  2 NA
    
    > str_length(c("I", "am", "张丹", NA))
    [1]  1  2  2 NA
    
    > val <- "abc,123,234,iuuu"
    
    # 以,进行分割
    > s1<-str_split(val, ",");s1
    [[1]]
    [1] "abc"  "123"  "234"  "iuuu"
    
    # 以,进行分割,保留2块
    > s2<-str_split(val, ",",2);s2
    [[1]]
    [1] "abc"          "123,234,iuuu"
    
    # 查看str_split()函数操作的结果类型list
    > class(s1)
    [1] "list"
    
    # 用str_split_fixed()函数分割,结果类型是matrix
    > s3<-str_split_fixed(val, ",",2);s3
         [,1]  [,2]          
    [1,] "abc" "123,234,iuuu"
    
    > class(s3)
    [1] "matrix"
    
    > val <- c("abc", 123, "cba")
    
    # 全文匹配
    > str_subset(val, "a")
    [1] "abc" "cba"
    
    # 开头匹配
    > str_subset(val, "^a")
    [1] "abc"
    
    # 结尾匹配
    > str_subset(val, "a$")
    [1] "cba"
    
    > val <- c("I am Conan.", "http://fens.me, ok")
    
    # 默认以空格分割,取第一个位置的字符串
    > word(val, 1)
    [1] "I"               "http://fens.me,"
    > word(val, -1)
    [1] "Conan." "ok"    
    > word(val, 2, -1)
    [1] "am Conan." "ok"       
    
    # 以,分割,取第一个位置的字符串 
    > val<-'111,222,333,444'
    > word(val, 1, sep = fixed(','))
    [1] "111"
    > word(val, 3, sep = fixed(','))
    [1] "333"
    
    > val <- c("abca4", 123, "cba2")
    
    # 检查字符串向量,是否包括a
    > str_detect(val, "a")
    [1]  TRUE FALSE  TRUE
    
    # 检查字符串向量,是否以a为开头
    > str_detect(val, "^a")
    [1]  TRUE FALSE FALSE
    
    # 检查字符串向量,是否以a为结尾
    > str_detect(val, "a$")
    [1] FALSE FALSE FALSE
    
    > val <- c("abc", 123, "cba")
    
    # 匹配字符a,并返回对应的字符
    > str_match(val, "a")
         [,1]
    [1,] "a" 
    [2,] NA  
    [3,] "a" 
    
    # 匹配字符0-9,限1个,并返回对应的字符
    > str_match(val, "[0-9]")
         [,1]
    [1,] NA  
    [2,] "1" 
    [3,] NA  
    
    # 匹配字符0-9,不限数量,并返回对应的字符
    > str_match(val, "[0-9]*")
         [,1] 
    [1,] ""   
    [2,] "123"
    [3,] ""  
    
    • str_replace
    > val <- c("abc", 123, "cba")
    
    # 把目标字符串第一个出现的a或b,替换为-
    > str_replace(val, "[ab]", "-")
    [1] "-bc" "123" "c-a"
    
    # 把目标字符串所有出现的a或b,替换为-
    > str_replace_all(val, "[ab]", "-")
    [1] "--c" "123" "c--"
    
    # 把目标字符串所有出现的a,替换为两个\001字符
    # 注意:"\1"在R字符串中是八进制转义,表示字符\001,并不是对捕获组的引用;若要引用捕获组,应写成"\\1"
    > str_replace_all(val, "[a]", "\1\1")
    [1] "\001\001bc" "123"        "cb\001\001"
    

    参考:http://blog.fens.me/r-stringr/

    相关文章

      网友评论

          本文标题:R 包学习 - stringr()

          本文链接:https://www.haomeiwen.com/subject/crkmohtx.html