Reference: 《文本数据挖掘——基于R语言》 (Text Data Mining: Based on R)
library(pacman)
p_load(dplyr, stringr, purrr)
1. Read the data
Any text will do as a substitute; the data should have two columns, one for the document name or ID and one for the text content.
storagebottles <- read.csv("dataset/ali/storagebottles0905.csv",
                           header = F) %>%
  set_names(c("sku_name", "sku_price", "sku_sale_volume", "sku_score",
              "sku_ship", "sku_isNewin", "sku_isPromotion",
              "sku_isTopselling", "shop_name", "sku_link", "category4")) %>%
  distinct(.keep_all = T)
storagebottles <- storagebottles %>%
  filter(!is.na(sku_name)) %>%
  filter(str_detect(sku_price, "^US")) %>%
  filter(str_detect(sku_link, "aliexpress")) %>%
  filter(str_detect(sku_sale_volume, "sold")) %>%
  mutate(category = "home",
         category2 = "Home Storage",
         category3 = "Storage Bottles & Jars") %>%
  mutate(sku_id = str_extract(sku_link, "\\d{16}"),
         sku_link = paste0("http:", sku_link)) %>%
  mutate(sku_id = as.character(sku_id)) %>%
  arrange(sku_sale_volume) %>%
  group_by(sku_id, .drop = T) %>%
  slice_tail(n = 1) %>%
  ungroup()
df <- select(storagebottles, sku_id, sku_name)
2. Spell checking
p_load(hunspell)
# Check whether the text contains misspellings
hunspell_check(df$sku_name[1])
## [1] FALSE
# Identify the misspelled words
bad <- hunspell(df$sku_name[1])
print(bad[[1]])
## [1] "pcs"
# Correction suggestions
hunspell_suggest(bad[[1]])
## [[1]]
## [1] "cps" "cs" "pecs" "pics" "pis" "pas" "pct" "pus" "p cs" "PCs"
3. Tokenization
p_load(tokenizers, tidytext)
txt <- paste0(df$sku_name[1:2], collapse = "。")
txt
## [1] "1000pcs 8*32mm 0.5ml Plastic Centrifuge Tube Test Tubing Vial Clear Plastic Container Home Garden Storage Bottles。1000pcs 6*22mm 0.2ml Plastic Bottles Gardening Storage Container Transparent Plastic Vials PCR Centrifuge Tube"
# Paragraph tokenization
# Set the paragraph break marker
tokenize_paragraphs(txt, paragraph_break = "。")
## [[1]]
## [1] "1000pcs 8*32mm 0.5ml Plastic Centrifuge Tube Test Tubing Vial Clear Plastic Container Home Garden Storage Bottles"
## [2] "1000pcs 6*22mm 0.2ml Plastic Bottles Gardening Storage Container Transparent Plastic Vials PCR Centrifuge Tube"
# Sentence tokenization
tokenize_sentences(txt)
## [[1]]
## [1] "1000pcs 8*32mm 0.5ml Plastic Centrifuge Tube Test Tubing Vial Clear Plastic Container Home Garden Storage Bottles。"
## [2] "1000pcs 6*22mm 0.2ml Plastic Bottles Gardening Storage Container Transparent Plastic Vials PCR Centrifuge Tube"
# Word tokenization; punctuation is removed and text is lowercased
tokenize_words(txt)
## [[1]]
## [1] "1000pcs" "8" "32mm" "0.5ml" "plastic"
## [6] "centrifuge" "tube" "test" "tubing" "vial"
## [11] "clear" "plastic" "container" "home" "garden"
## [16] "storage" "bottles" "1000pcs" "6" "22mm"
## [21] "0.2ml" "plastic" "bottles" "gardening" "storage"
## [26] "container" "transparent" "plastic" "vials" "pcr"
## [31] "centrifuge" "tube"
# Punctuation can also be kept and numbers dropped
tokenize_words(txt, strip_punct = F, strip_numeric = T, simplify = T)
## [1] "1000pcs" "*" "32mm" "0.5ml" "plastic"
## [6] "centrifuge" "tube" "test" "tubing" "vial"
## [11] "clear" "plastic" "container" "home" "garden"
## [16] "storage" "bottles" "。" "1000pcs" "*"
## [21] "22mm" "0.2ml" "plastic" "bottles" "gardening"
## [26] "storage" "container" "transparent" "plastic" "vials"
## [31] "pcr" "centrifuge" "tube"
# n-gram tokenization; simplify = T returns a vector instead of a list
tokenize_ngrams(txt, n = 2, simplify = T)
## [1] "1000pcs 8" "8 32mm" "32mm 0.5ml"
## [4] "0.5ml plastic" "plastic centrifuge" "centrifuge tube"
## [7] "tube test" "test tubing" "tubing vial"
## [10] "vial clear" "clear plastic" "plastic container"
## [13] "container home" "home garden" "garden storage"
## [16] "storage bottles" "bottles 1000pcs" "1000pcs 6"
## [19] "6 22mm" "22mm 0.2ml" "0.2ml plastic"
## [22] "plastic bottles" "bottles gardening" "gardening storage"
## [25] "storage container" "container transparent" "transparent plastic"
## [28] "plastic vials" "vials pcr" "pcr centrifuge"
## [31] "centrifuge tube"
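tokenize_ngrams() also accepts an n_min argument, so a single call can return unigrams and bigrams together:
# Unigrams and bigrams in one pass
tokenize_ngrams(txt, n = 2, n_min = 1, simplify = T) %>% head()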
4. Character tokenization
tokenize_characters(txt, simplify = T)
## [1] "1" "0" "0" "0" "p" "c" "s" "8" "3" "2" "m" "m" "0" "5" "m" "l" "p" "l"
## [19] "a" "s" "t" "i" "c" "c" "e" "n" "t" "r" "i" "f" "u" "g" "e" "t" "u" "b"
## [37] "e" "t" "e" "s" "t" "t" "u" "b" "i" "n" "g" "v" "i" "a" "l" "c" "l" "e"
## [55] "a" "r" "p" "l" "a" "s" "t" "i" "c" "c" "o" "n" "t" "a" "i" "n" "e" "r"
## [73] "h" "o" "m" "e" "g" "a" "r" "d" "e" "n" "s" "t" "o" "r" "a" "g" "e" "b"
## [91] "o" "t" "t" "l" "e" "s" "1" "0" "0" "0" "p" "c" "s" "6" "2" "2" "m" "m"
## [109] "0" "2" "m" "l" "p" "l" "a" "s" "t" "i" "c" "b" "o" "t" "t" "l" "e" "s"
## [127] "g" "a" "r" "d" "e" "n" "i" "n" "g" "s" "t" "o" "r" "a" "g" "e" "c" "o"
## [145] "n" "t" "a" "i" "n" "e" "r" "t" "r" "a" "n" "s" "p" "a" "r" "e" "n" "t"
## [163] "p" "l" "a" "s" "t" "i" "c" "v" "i" "a" "l" "s" "p" "c" "r" "c" "e" "n"
## [181] "t" "r" "i" "f" "u" "g" "e" "t" "u" "b" "e"
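The tokenizers package also offers character shingles (character-level n-grams), which can help with fuzzy matching of near-duplicate titles; a quick sketch:
# Character 3-grams ("shingles"); non-alphanumeric characters are stripped by default
tokenize_character_shingles(txt, n = 3, simplify = T) %>% head()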
5. Expanding abbreviations
p_load(qdap)
# Replace contractions
replace_contraction(c("MR. Jones isn't going."))
## [1] "MR. Jones is not going."
# Replace abbreviations
replace_abbreviation(c("MR. Jones isn't going."))
## [1] "Mister Jones isn't going."
# Replace numbers
replace_number(c(1))
## [1] "one"
# Replace ordinals
replace_ordinal(c("3rd"))
## [1] "third"
# Replace symbols
replace_symbol(c("&"))
## [1] "and"
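Each function returns a character vector, so the replacements chain into a single cleaning step. A sketch on a made-up sentence (not from the dataset):
# Chain several qdap replacements over one string
c("Dr. Smith's 3rd jar isn't full & costs 5 dollars") %>%
  replace_abbreviation() %>%   # Dr. -> Doctor
  replace_contraction() %>%    # isn't -> is not
  replace_ordinal() %>%        # 3rd -> third
  replace_number() %>%         # 5 -> five
  replace_symbol()             # & -> and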
6. Stemming
stem <- tokenize_word_stems(df$sku_name[1], simplify = T)
stem
## [1] "1000pcs" "8" "32mm" "0.5ml" "plastic" "centrifug"
## [7] "tube" "test" "tube" "vial" "clear" "plastic"
## [13] "contain" "home" "garden" "storag" "bottl"
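tokenize_word_stems() uses the Snowball stemmer from the SnowballC package, which can also be called directly on single words:
p_load(SnowballC)
# Stem individual words with the Snowball stemmer
wordStem(c("centrifuge", "tubing", "storage", "bottles"), language = "english")
## expected, matching the stems above: "centrifug" "tube" "storag" "bottl"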
7. Lemmatization
Model download links:
English: https://raw.githubusercontent.com/jwijffels/udpipe.models.ud.2.5/master/inst/udpipe-ud-2.5-191206/english-ewt-ud-2.5-191206.udpipe
Chinese: https://raw.githubusercontent.com/jwijffels/udpipe.models.ud.2.5/master/inst/udpipe-ud-2.5-191206/chinese-gsd-ud-2.5-191206.udpipe
p_load(udpipe)
# Download the English lemmatization model from GitHub (a Chinese model is also available)
# The download may fail due to network problems
udmodel <- udpipe_download_model(language = "english",
                                 model_dir = "model/")
# Load the model
en_model <- udpipe_load_model(udmodel$file_model)
# Lemmatization
udpipe_annotate(en_model, stem) %>%
  as_tibble() %>%
  # token: tokenized word; lemma: lemmatized form
  select(token, lemma)
## token lemma
## <chr> <chr>
## 1000pcs 1000pcs
## 8 8
## 32mm 32mm
## 0.5ml 0.5ml
## plastic plastic
## centrifug centrifug
## tube tube
## test test
## tube tube
## vial vial
## clear clear
## plastic plastic
## contain contain
## home home
## garden garden
## storag storag
## bottl bottl
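Because the input above was the stemmed tokens, token and lemma match on every row; lemmatizing the raw title instead usually gives more informative results (output omitted):
# Lemmatize the original, unstemmed title
udpipe_annotate(en_model, df$sku_name[1]) %>%
  as_tibble() %>%
  select(token, lemma)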
udmodel <- udpipe_download_model(language = "chinese",
                                 model_dir = "model/")
# Load the model
cn_model <- udpipe_load_model(udmodel$file_model)
# Chinese lemmatization; convert the input to UTF-8 first
udpipe_annotate(cn_model, iconv(c("别人笑我忒疯癫"), to = "UTF-8")) %>%
  as_tibble() %>%
  # token: tokenized word; lemma: lemmatized form
  select(token, lemma)
## token lemma
## <chr> <chr>
## 别 别
## 人笑 人笑
## 我忒 我忒
## 疯 疯
## 癫 癫
8. Part-of-speech tagging
udpipe_annotate(en_model, df$sku_name[1]) %>%
  as_tibble() %>%
  select(token, upos)
## token upos
## <chr> <chr>
## 1000 NUM
## pcs NOUN
## 8 NUM
## * PUNCT
## 32 NUM
## mm NOUN
## 0.5 NUM
## ml NOUN
## Plastic PROPN
## Centrifuge PROPN
... (too many rows to list them all)
PROPN is a proper noun, AUX an auxiliary verb, ADJ an adjective, DET a determiner, NOUN a noun, and PUNCT punctuation. The full UPOS tag set is listed below; a filtering sketch follows the list.
ADJ: adjective
ADP: adposition
ADV: adverb
AUX: auxiliary
CCONJ: coordinating conjunction
DET: determiner
INTJ: interjection
NOUN: noun
NUM: numeral
PART: particle
PRON: pronoun
PROPN: proper noun
PUNCT: punctuation
SCONJ: subordinating conjunction
SYM: symbol
VERB: verb
X: other
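For example, keeping only nouns and proper nouns yields a quick set of candidate keywords for a title (a sketch built on the annotation call above):
# Keep only nouns and proper nouns as candidate keywords
udpipe_annotate(en_model, df$sku_name[1]) %>%
  as_tibble() %>%
  filter(upos %in% c("NOUN", "PROPN")) %>%
  pull(token)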
9. Batch text preprocessing
p_load(tidytext)
df <- unnest_tokens(tbl = df,
                    # name of the output column
                    output = stem,
                    input = sku_name,
                    # use the stemming tokenizer
                    token = tokenize_word_stems)
df
## # A tibble: 21,765 × 2
## sku_id stem
## <chr> <chr>
## 1 2251801564728378 1000pcs
## 2 2251801564728378 8
## 3 2251801564728378 32mm
## 4 2251801564728378 0.5ml
## 5 2251801564728378 plastic
## 6 2251801564728378 centrifug
## 7 2251801564728378 tube
## 8 2251801564728378 test
## 9 2251801564728378 tube
## 10 2251801564728378 vial
## # … with 21,755 more rows
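With one stem per row, ordinary dplyr verbs give quick corpus summaries, e.g. the most frequent stems across all titles (output not shown):
# Most frequent stems across the whole catalogue
df %>%
  count(stem, sort = TRUE) %>%
  head(10)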