1、酒店评论数据
library(pacman)
p_load(dplyr)
# Load the hotel reviews, keep the review id, date and text, and derive a
# binary sentiment label from the average rating: 0 when the rating is
# below 3, otherwise 1. The rating column itself is dropped afterwards.
df <- read.csv(
  "./data_set/review.csv",
  header = TRUE,
  stringsAsFactors = FALSE
) %>%
  # as_tibble() replaces the deprecated tbl_df(); the resulting class is the same.
  as_tibble() %>%
  select(
    id = ReviewID,
    date = RatingDate,
    review = ReviewText,
    rate = Obs_Avg_Rating
  ) %>%
  mutate(
    date = as.Date(date),
    sentiment = ifelse(rate < 3, 0, 1)
  ) %>%
  select(-rate)
nrow(df)
## [1] 11005
# Split the reviews into 5 files.
N_files <- 5
# Rows per file, rounded up so that a row count which is not a multiple of
# N_files still yields one chunk label per row. With a fractional value the
# label vector came up short and split() recycled it (with a warning),
# putting the trailing rows into the wrong chunk.
chunk_len <- ceiling(nrow(df) / N_files)
# One temporary file path per chunk.
files <- vapply(seq_len(N_files), function(i) tempfile(), character(1))
# Contiguous chunk labels, truncated to exactly nrow(df) entries. The factor
# levels keep one (possibly empty) chunk per file even when there are fewer
# rows than chunk_len * N_files.
chunk_ids <- rep(seq_len(N_files), each = chunk_len, length.out = nrow(df))
chunks <- split(df, factor(chunk_ids, levels = seq_len(N_files)))
# Write every chunk to its file as a "|"-separated table with a header row.
for (i in seq_len(N_files)) {
  write.table(
    chunks[[i]],
    files[[i]],
    quote = TRUE,
    row.names = FALSE,
    col.names = TRUE,
    sep = "|"
  )
}
str(df, strict.width = "cut")
## Classes 'tbl_df', 'tbl' and 'data.frame': 11005 obs. of 4 variables:
## $ id : int 116455519 116885145 117395588 126717496 132233722..
## $ date : Date, format: "2011-08-08" ...
## $ review : chr "I gave this place 2 because the staff was very "..
## $ sentiment: num 0 0 0 0 0 1 0 1 1 1 ...
2、构造text2vec读取文件函数
p_load(data.table,text2vec)
# File reader handed to text2vec's ifiles(): loads one "|"-separated chunk
# file (with a header row) and returns its `review` column as a vector whose
# names are the values of the `id` column. `...` is accepted but not used.
reader <- function(x, ...) {
  parsed <- data.table::fread(x, header = TRUE, sep = "|")
  # The names carry the document ids alongside the texts.
  setNames(parsed$review, parsed$id)
}
# Iterator over the chunk files; each file is loaded through `reader`.
it_files <- ifiles(files, reader = reader)
# Token iterator on top of the file iterator: every document is lower-cased
# and then split with word_tokenizer. The progress bar is switched off.
it_tokens <- itoken(
  it_files,
  preprocessor = tolower,
  tokenizer = word_tokenizer,
  progressbar = FALSE
)
# Vocabulary built from the token iterator.
vocab <- create_vocabulary(it_tokens)
3、使用构造的函数创建DTM
请注意，DTM 带有文档 id：它们继承自我们在 reader 函数中通过 names() 分配的文档名称。在处理文件时，这是一种分配文档 id 的方便方法。
# Turn the vocabulary into a vectorizer and use it to build the
# document-term matrix from the file-backed token iterator.
dtm <- vocab %>%
  vocab_vectorizer() %>%
  create_dtm(it_tokens, vectorizer = .)
# Inspect the structure, showing at most 5 list components.
str(dtm, list.len = 5)
## Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
## ..@ i : int [1:734787] 9012 558 10744 2736 1950 10921 9345 983 1874 758 ...
## ..@ p : int [1:26935] 0 1 2 3 4 5 6 7 8 9 ...
## ..@ Dim : int [1:2] 11005 26934
## ..@ Dimnames:List of 2
## .. ..$ : chr [1:11005] "116455519" "116885145" "117395588" "126717496" ...
## .. ..$ : chr [1:26934] "contar" "reacting" "veriy" "cranny's" ...
## ..@ x : num [1:734787] 1 1 1 1 1 1 1 1 1 1 ...
## .. [list output truncated]
4、使用多核读取内存中的数据
library(doParallel)
# Number of cores reported by the system (NA when it cannot be determined).
N_WORKERS <- detectCores()
# Create the cluster, leaving one core free but always starting at least one
# worker: N_WORKERS - 1 would be 0 on a single-core machine and NA when the
# core count is unknown, and either would make makeCluster() fail.
cl <- makeCluster(max(1L, N_WORKERS - 1L, na.rm = TRUE))
# Register the parallel backend.
registerDoParallel(cl)
tryCatch(
  {
    # Parallel token iterator over the in-memory reviews, named by review id.
    it_token_par <- itoken_parallel(
      df$review,
      preprocessor = tolower,
      tokenizer = word_tokenizer,
      ids = df$id,
      # n_chunks can be tuned by the caller.
      n_chunks = 8
    )
    vocab <- create_vocabulary(it_token_par)
    v_vectorizer <- vocab_vectorizer(vocab)
    dtm <- create_dtm(it_token_par, v_vectorizer)
  },
  # Stop the cluster even when one of the steps above fails, so that the
  # worker processes are not left running.
  finally = stopCluster(cl)
)
str(dtm)
## Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
## ..@ i : int [1:734787] 558 1146 1481 2583 2588 2651 2741 2796 2991 3457 ...
## ..@ p : int [1:26935] 0 1 53 55 56 58 59 60 68 69 ...
## ..@ Dim : int [1:2] 11005 26934
## ..@ Dimnames:List of 2
## .. ..$ : chr [1:11005] "116455519" "116885145" "117395588" "126717496" ...
## .. ..$ : chr [1:26934] "reacting" "downside" "impacted" "janitorial" ...
## ..@ x : num [1:734787] 1 1 1 1 1 1 1 1 1 1 ...
## ..@ factors : list()
5、读取磁盘中的数据
library(doParallel)
# Number of cores reported by the system (NA when it cannot be determined).
N_WORKERS <- detectCores()
# Create the cluster, leaving one core free but always starting at least one
# worker (N_WORKERS - 1 is 0 on a single-core machine and NA when unknown).
cl <- makeCluster(max(1L, N_WORKERS - 1L, na.rm = TRUE))
# Register the parallel backend.
registerDoParallel(cl)
# Read the chunk files through the same `reader` used for ifiles(), so that
# only the review text is tokenized and documents are named by review id.
# Without `reader` the files are read as plain lines: the 5 header lines and
# the id/date/sentiment columns are tokenized as well. The console output
# printed in this section was produced that way, which is why it shows 11010
# documents, file-based document names and ids such as "193827036" in the
# vocabulary; it predates this fix.
it_files_par <- ifiles_parallel(file_paths = files, reader = reader)
it_token_par <- itoken_parallel(
  it_files_par,
  preprocessor = tolower,
  tokenizer = word_tokenizer
)
vocab <- create_vocabulary(it_token_par)
# Vocabulary-based DTM.
v_vectorizer <- vocab_vectorizer(vocab)
dtm_v <- create_dtm(it_token_par, v_vectorizer)
str(dtm_v)
## Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
## ..@ i : int [1:788756] 559 3900 5157 2473 2738 935 1951 984 2086 6899 ...
## ..@ p : int [1:37943] 0 1 3 4 5 6 7 8 10 11 ...
## ..@ Dim : int [1:2] 11010 37942
## ..@ Dimnames:List of 2
## .. ..$ : chr [1:11010] "filec082b5c5c_1" "filec082b5c5c_2" "filec082b5c5c_3" "filec082b5c5c_4" ...
## .. ..$ : chr [1:37942] "reacting" "infact" "193827036" "cranny's" ...
## ..@ x : num [1:788756] 1 1 1 1 1 1 1 1 1 1 ...
## ..@ factors : list()
# Hash-based DTM: terms are hashed into a fixed number of columns instead of
# being looked up in a vocabulary. 2^18 is the vectorizer's default size,
# written out here; it matches the 262144 columns in the output below.
h_vectorizer <- hash_vectorizer(hash_size = 2^18)
dtm_h <- create_dtm(it = it_token_par, vectorizer = h_vectorizer)
str(dtm_h)
## Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
## ..@ i : int [1:788604] 6402 1093 8357 559 630 815 894 957 1347 2872 ...
## ..@ p : int [1:262145] 0 0 0 0 0 0 0 0 0 0 ...
## ..@ Dim : int [1:2] 11010 262144
## ..@ Dimnames:List of 2
## .. ..$ : chr [1:11010] "filec089266d1a_1" "filec089266d1a_2" "filec089266d1a_3" "filec089266d1a_4" ...
## .. ..$ : NULL
## ..@ x : num [1:788604] 1 1 1 1 1 1 1 1 1 1 ...
## ..@ factors : list()
# Term co-occurrence matrix over the same token iterator, using the
# vocabulary vectorizer and a skip-gram window of 5.
tcm <- create_tcm(
  it = it_token_par,
  vectorizer = v_vectorizer,
  skip_grams_window = 5
)
str(tcm)
## Formal class 'dgTMatrix' [package "Matrix"] with 6 slots
## ..@ i : int [1:1107832] 216 285 193 292 494 422 235 180 416 361 ...
## ..@ j : int [1:1107832] 216 349 406 416 494 526 538 585 603 614 ...
## ..@ Dim : int [1:2] 37942 37942
## ..@ Dimnames:List of 2
## .. ..$ : chr [1:37942] "reacting" "infact" "193827036" "cranny's" ...
## .. ..$ : chr [1:37942] "reacting" "infact" "193827036" "cranny's" ...
## ..@ x : num [1:1107832] 0.2 1 0.25 0.25 0.5 ...
## ..@ factors : list()
# Stop the cluster created at the start of this section.
stopCluster(cl)
网友评论