美文网首页 > R语言与统计分析 > R语言作业 > 数据科学与R语言
36-text2vec包学习:利用多核机器的优势处理文件

36-text2vec包学习:利用多核机器的优势处理文件

作者: wonphen | 来源:发表于2020-02-14 16:54 被阅读0次

    1、酒店评论数据

    # Load dplyr through pacman's p_load().
    library(pacman)
    p_load(dplyr)

    # Read the hotel reviews, keep the four columns used below under shorter
    # names, and derive a binary sentiment label from the average rating:
    # ratings below 3 become 0, everything else becomes 1.
    df <- read.csv("./data_set/review.csv", header = TRUE,
                   stringsAsFactors = FALSE) %>%
      as_tibble() %>%  # replaces the deprecated tbl_df()
      select(id = ReviewID, date = RatingDate, review = ReviewText,
             rate = Obs_Avg_Rating) %>%
      mutate(sentiment = ifelse(rate < 3, 0, 1)) %>%
      select(-rate)  # the raw rating is not needed once the label exists

    # Convert the date column to Date objects.
    df$date <- as.Date(df$date)
    nrow(df)
    
    ## [1] 11005
    
    # Number of files to split the data into
    N_files <- 5
    # Maximum number of rows per file; rounded up so that row counts that are
    # not a multiple of N_files are still fully covered
    chunk_len <- ceiling(nrow(df) / N_files)

    # One temporary file path per chunk
    files <- vapply(seq_len(N_files), function(i) tempfile(), character(1))

    # Assign every row to a chunk. rep_len() always yields exactly nrow(df)
    # labels (the previous rep(..., each = chunk_len) only did so when the row
    # count was an exact multiple of N_files); sort() keeps chunks contiguous.
    # Fixing the factor levels guarantees N_files list elements.
    chunk_id <- factor(sort(rep_len(seq_len(N_files), nrow(df))),
                       levels = seq_len(N_files))
    chunks <- split(df, chunk_id)

    # Write each chunk to its file as "|"-separated text with a header row
    for (i in seq_len(N_files)) {
      write.table(chunks[[i]], files[[i]], quote = TRUE, row.names = FALSE,
                  col.names = TRUE, sep = "|")
    }

    str(df, strict.width = "cut")
    
    ## Classes 'tbl_df', 'tbl' and 'data.frame':    11005 obs. of  4 variables:
    ##  $ id       : int  116455519 116885145 117395588 126717496 132233722..
    ##  $ date     : Date, format: "2011-08-08" ...
    ##  $ review   : chr  "I gave this place 2 because the staff was very "..
    ##  $ sentiment: num  0 0 0 0 0 1 0 1 1 1 ...
    

    2、构造text2vec读取文件函数

    p_load(data.table, text2vec)

    # Reader handed to ifiles(): parses one "|"-separated chunk file and
    # returns its review texts as a character vector named by review id.
    #   x   : path of the file to read
    #   ... : accepted but not used
    reader <- function(x, ...) {
      # Only the id and review columns are used, so only those are read.
      chunk <- data.table::fread(x, header = TRUE, sep = "|",
                                 select = c("id", "review"))
      # The names end up as the document ids (row names) of the DTM.
      setNames(chunk$review, chunk$id)
    }

    # Iterator over the chunk files, each read with the custom reader
    it_files <- ifiles(files, reader = reader)
    # Token iterator: lower-case every document, then split it into words
    it_tokens <- itoken(it_files,
                        preprocessor = tolower,
                        tokenizer = word_tokenizer,
                        progressbar = FALSE)

    vocab <- create_vocabulary(it_tokens)
    

    3、使用构造的函数创建DTM

    请注意,DTM的行名就是文档id:它们继承自reader函数返回向量的名称(即 names(res) <- chunk$id)。在处理文件时,这是一种分配文档id的便捷方式。

    # Build the document-term matrix from the file-based token iterator, using
    # a vectorizer derived from the vocabulary, then show its first five slots.
    dtm <- create_dtm(
      it = it_tokens,
      vectorizer = vocab_vectorizer(vocabulary = vocab)
    )
    str(object = dtm, list.len = 5)
    
    ## Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
    ##   ..@ i       : int [1:734787] 9012 558 10744 2736 1950 10921 9345 983 1874 758 ...
    ##   ..@ p       : int [1:26935] 0 1 2 3 4 5 6 7 8 9 ...
    ##   ..@ Dim     : int [1:2] 11005 26934
    ##   ..@ Dimnames:List of 2
    ##   .. ..$ : chr [1:11005] "116455519" "116885145" "117395588" "126717496" ...
    ##   .. ..$ : chr [1:26934] "contar" "reacting" "veriy" "cranny's" ...
    ##   ..@ x       : num [1:734787] 1 1 1 1 1 1 1 1 1 1 ...
    ##   .. [list output truncated]
    

    4、使用多核读取内存中的数据

    library(doParallel)

    # Number of cores reported by the system (may be NA when unknown)
    N_WORKERS <- detectCores()

    # Create the cluster, leaving one core free but always starting at least
    # one worker: N_WORKERS - 1 would be 0 on a single-core machine and NA when
    # the core count is unknown, and makeCluster() fails on either.
    cl <- makeCluster(max(1L, N_WORKERS - 1L, na.rm = TRUE))

    # Register the cluster as the parallel backend
    registerDoParallel(cl)

    # Parallel token iterator over the in-memory reviews; ids become the
    # document ids of the DTM
    it_token_par <- itoken_parallel(df$review,
                                    preprocessor = tolower,
                                    tokenizer = word_tokenizer,
                                    ids = df$id,
                                    # n_chunks can be tuned by the caller
                                    n_chunks = 8)
    vocab <- create_vocabulary(it_token_par)
    v_vectorizer <- vocab_vectorizer(vocab)
    dtm <- create_dtm(it_token_par, v_vectorizer)

    # Shut the cluster down
    stopCluster(cl)

    str(dtm)
    
    ## Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
    ##   ..@ i       : int [1:734787] 558 1146 1481 2583 2588 2651 2741 2796 2991 3457 ...
    ##   ..@ p       : int [1:26935] 0 1 53 55 56 58 59 60 68 69 ...
    ##   ..@ Dim     : int [1:2] 11005 26934
    ##   ..@ Dimnames:List of 2
    ##   .. ..$ : chr [1:11005] "116455519" "116885145" "117395588" "126717496" ...
    ##   .. ..$ : chr [1:26934] "reacting" "downside" "impacted" "janitorial" ...
    ##   ..@ x       : num [1:734787] 1 1 1 1 1 1 1 1 1 1 ...
    ##   ..@ factors : list()
    

    5、使用多核读取磁盘中的数据

    library(doParallel)

    # Number of cores reported by the system (may be NA when unknown)
    N_WORKERS <- detectCores()

    # Create the cluster, leaving one core free but always starting at least
    # one worker: N_WORKERS - 1 would be 0 on a single-core machine and NA when
    # the core count is unknown, and makeCluster() fails on either.
    cl <- makeCluster(max(1L, N_WORKERS - 1L, na.rm = TRUE))

    # Register the cluster as the parallel backend
    registerDoParallel(cl)

    # Pass the custom reader so that only the review column is tokenized and
    # the review ids become the document ids. Without it the files are read
    # line by line, so the header line of each file counts as a document and
    # the id/date fields end up in the vocabulary.
    # NOTE: the printed results that follow in this post were produced without
    # the custom reader, which is why they show 11010 rows (11005 reviews plus
    # 5 header lines) and document names such as "filec082b5c5c_1".
    it_files_par <- ifiles_parallel(file_paths = files, reader = reader)

    it_token_par <- itoken_parallel(it_files_par,
                                    preprocessor = tolower,
                                    tokenizer = word_tokenizer)
    vocab <- create_vocabulary(it_token_par)

    # Vocabulary-based DTM
    v_vectorizer <- vocab_vectorizer(vocab)
    dtm_v <- create_dtm(it_token_par, v_vectorizer)
    str(dtm_v)
    
    ## Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
    ##   ..@ i       : int [1:788756] 559 3900 5157 2473 2738 935 1951 984 2086 6899 ...
    ##   ..@ p       : int [1:37943] 0 1 3 4 5 6 7 8 10 11 ...
    ##   ..@ Dim     : int [1:2] 11010 37942
    ##   ..@ Dimnames:List of 2
    ##   .. ..$ : chr [1:11010] "filec082b5c5c_1" "filec082b5c5c_2" "filec082b5c5c_3" "filec082b5c5c_4" ...
    ##   .. ..$ : chr [1:37942] "reacting" "infact" "193827036" "cranny's" ...
    ##   ..@ x       : num [1:788756] 1 1 1 1 1 1 1 1 1 1 ...
    ##   ..@ factors : list()
    
    # DTM built with the hashing vectorizer (no vocabulary needed)
    h_vectorizer <- hash_vectorizer()
    dtm_h <- create_dtm(it = it_token_par, vectorizer = h_vectorizer)
    str(object = dtm_h)
    
    ## Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
    ##   ..@ i       : int [1:788604] 6402 1093 8357 559 630 815 894 957 1347 2872 ...
    ##   ..@ p       : int [1:262145] 0 0 0 0 0 0 0 0 0 0 ...
    ##   ..@ Dim     : int [1:2] 11010 262144
    ##   ..@ Dimnames:List of 2
    ##   .. ..$ : chr [1:11010] "filec089266d1a_1" "filec089266d1a_2" "filec089266d1a_3" "filec089266d1a_4" ...
    ##   .. ..$ : NULL
    ##   ..@ x       : num [1:788604] 1 1 1 1 1 1 1 1 1 1 ...
    ##   ..@ factors : list()
    
    # Term co-occurrence matrix over the vocabulary, with a context window of
    # 5 tokens
    tcm <- create_tcm(
      it = it_token_par,
      vectorizer = v_vectorizer,
      skip_grams_window = 5
    )
    str(object = tcm)
    
    ## Formal class 'dgTMatrix' [package "Matrix"] with 6 slots
    ##   ..@ i       : int [1:1107832] 216 285 193 292 494 422 235 180 416 361 ...
    ##   ..@ j       : int [1:1107832] 216 349 406 416 494 526 538 585 603 614 ...
    ##   ..@ Dim     : int [1:2] 37942 37942
    ##   ..@ Dimnames:List of 2
    ##   .. ..$ : chr [1:37942] "reacting" "infact" "193827036" "cranny's" ...
    ##   .. ..$ : chr [1:37942] "reacting" "infact" "193827036" "cranny's" ...
    ##   ..@ x       : num [1:1107832] 0.2 1 0.25 0.25 0.5 ...
    ##   ..@ factors : list()
    
    # Shut the cluster down now that all parallel work is finished
    stopCluster(cl = cl)
    

    相关文章

      网友评论

        本文标题:36-text2vec包学习:利用多核机器的优势处理文件

        本文链接:https://www.haomeiwen.com/subject/mmtjfhtx.html