数据准备
# Install stringr only when it is missing, then attach it.
# requireNamespace() checks availability without attaching the package, so a
# failed install surfaces as an error from library() rather than a silent
# FALSE. The workspace is deliberately not wiped with rm(list = ls()):
# sourcing a tutorial should not delete the reader's existing objects.
if (!requireNamespace("stringr", quietly = TRUE)) {
  install.packages("stringr")
}
library(stringr)

# x is a character vector holding a single sentence.
x <- "The birch canoe slid on the smooth planks."
1.检测字符串长度
length(x) # number of elements in the vector: x holds one string, so 1
str_length(x) # number of characters in the string
2.字符串拆分与组合
# Split the sentence at every space; the result is a list.
str_split(x, pattern = " ")
[[1]] #第一个元素
[1] "The" "birch" "canoe" "slid" "on" "the" "smooth"
[8] "planks."
赋值
# str_split() returns a list with one element per input string; x holds a
# single sentence, so [[1]] pulls out its vector of words.
x2 <- str_split(x, pattern = " ")[[1]]
# collapse = " " glues the words back into one string, restoring the sentence.
str_c(x2, collapse = " ")
[1] "The birch canoe slid on the smooth planks."
# sep joins the arguments element-wise (1234 is recycled for every word),
# so the result is still one string per word.
str_c(x2, 1234, sep = "+")
[1] "The+1234" "birch+1234" "canoe+1234" "slid+1234" "on+1234"
[6] "the+1234" "smooth+1234" "planks.+1234"
3.提取字符串的一部分
# Extract characters 5 through 9 (both ends inclusive).
str_sub(x, start = 5, end = 9)
[1] "birch"
4.大小写转换
str_to_upper(x2) # every letter upper case
str_to_lower(x2) # every letter lower case
str_to_title(x2) # first letter of each word upper case
5.字符串排序
str_sort(x2) # sort the words alphabetically (compares whole strings, not only the first letter)
6.字符检测,返回逻辑值向量
str_detect(x2, pattern = "h") # TRUE where the word contains "h"
str_starts(x2, pattern = "T") # TRUE where the word begins with "T"
str_ends(x2, pattern = "e") # TRUE where the word ends with "e"
与sum和mean连用,可以统计匹配的个数和比例
# TRUE counts as 1, so the sum is the number of words containing "h".
sum(str_detect(x2, pattern = "h"))
[1] 4
# The mean of a logical vector is the proportion of words containing "h".
mean(str_detect(x2, pattern = "h"))
[1] 0.5
7.提取匹配到的字符串
# Keep only the words that contain "h".
str_subset(x2, pattern = "h")
# Logical indexing with str_detect() gives the same words.
x2[str_detect(x2, pattern = "h")]
8.字符计数
str_count(x, pattern = " ") # number of spaces in the sentence
str_count(x2, pattern = "o") # number of "o" in each word
9.字符串替换
# Replace only the first "o" in each word.
str_replace(x2, pattern = "o", replacement = "A")
# Replace every "o" in each word.
str_replace_all(x2, pattern = "o", replacement = "A")
结合正则表达式更加强大
练习6-2
Bioinformatics is a new subject of genetic data collection,analysis and dissemination to the research community.
1.将上面这句话作为一个长字符串,赋值给tmp
2.拆分为一个由单词组成的向量,赋值给tmp2(注意标点符号)
# tmp is never assigned in the steps above, so define it here from the
# exercise sentence before splitting it.
tmp <- "Bioinformatics is a new subject of genetic data collection,analysis and dissemination to the research community."
# Naive split on spaces only: "collection,analysis" stays glued together
# because the comma has no space after it, and "community." keeps its period.
tmp2 <- str_split(tmp, " ")[[1]]
tmp2
[1] "Bioinformatics" "is" "a" "new"
[5] "subject" "of" "genetic" "data"
[9] "collection,analysis" "and" "dissemination" "to"
[13] "the" "research" "community."
#会出现“,”隔开的两单词连接在一起
# Replace every comma with a space so "collection,analysis" can be split.
str_replace_all(tmp, ",", " ")
[1] "Bioinformatics is a new subject of genetic data collection analysis and dissemination to the research community."
#用空格替代“,”。需注意:“.”在正则表达式中匹配任意字符,指代句号时不能直接写在模式里,需写成“[.]”或“\\.”这两种格式
# Clean the sentence step by step, then split it into words:
# commas become spaces, the period is dropped, and the text is split on spaces.
tmp2 <- tmp %>%
  str_replace_all(",", " ") %>%
  str_remove("[.]") %>%
  str_split(" ")
# str_split() returned a list; keep the word vector of the single sentence.
tmp2 <- tmp2[[1]]
[[1]]
[1] "Bioinformatics" "is" "a" "new" "subject"
[6] "of" "genetic" "data" "collection" "analysis"
[11] "and" "dissemination" "to" "the" "research"
[16] "community"
3.用函数返回这句话中有多少个单词。
# tmp2 holds one word per element, so its length is the word count.
length(tmp2)
[1] 16
4.用函数返回这句话中每个单词由多少个字母组成。
str_length(tmp2) # number of characters in each word, one value per element
[1] 14 2 1 3 7 2 7 4 10 8 3 13 2 3 8 9
网友评论