美文网首页
【文本挖掘】class 2

【文本挖掘】class 2

作者: caokai001 | 来源:发表于2019-03-07 22:53 被阅读7次

    词云_R

    课程链接
    http://xiajingbo.weebly.com/uploads/1/3/3/0/13306375/2._r_package_and_word_cloud.leaflet_8pages_.pdf
    
    # Load the corpus.
    # Note: tm needs R 3.5.2 installed, and installing the package pulls in
    # several dependency packages along the way.
    library(NLP)
    library(tm)
    # Folder holding the raw text files, one document per file.
    cname <- "C:/Users/16926/Desktop/研究生/【研究生】/研究生课程/文本挖掘/class2"
    # Build a corpus from every file found in that folder.
    docs <- Corpus(DirSource(directory = cname))
    summary(docs)
    # ---- Pre-processing ----
    # Plain character functions (gsub, tolower) are wrapped in
    # content_transformer(), tm's documented way of applying them through
    # tm_map(). The corpus therefore stays valid throughout and can be passed
    # straight to DocumentTermMatrix(); no PlainTextDocument conversion pass is
    # needed afterwards.

    # Turn separator characters into spaces BEFORE stripping punctuation.
    # removePunctuation deletes "/", "@" and "|" outright, so running it first
    # would glue neighbouring words together (e.g. "and/or" -> "andor") and
    # leave nothing for this replacement to match.
    docs <- tm_map(
      docs,
      content_transformer(function(x) gsub("[/@|]", " ", x))
    )
    # Remove the remaining punctuation.
    docs <- tm_map(docs, removePunctuation)
    # Remove numbers.
    docs <- tm_map(docs, removeNumbers)
    # Convert to lowercase so the lowercase stop-word lists below match.
    docs <- tm_map(docs, content_transformer(tolower))
    # Remove English stop words (common words with no analytic value).
    docs <- tm_map(docs, removeWords, stopwords("english"))
    # Remove corpus-specific boilerplate words.
    docs <- tm_map(
      docs,
      removeWords,
      c(
        "department", "email", "doi", "center", "sciences", "pubmed",
        "nature", "university", "pmid", "author", "school", "research"
      )
    )
    # ---- Term matrices and word frequencies ----
    # Document-term matrix built from the pre-processed corpus.
    dtm <- DocumentTermMatrix(docs)
    # The transposed (term-document) form of the same matrix.
    tdm <- TermDocumentMatrix(docs)

    # Dense copy of the document-term matrix. It is built once and reused for
    # both the frequency totals and the CSV export, because the conversion is
    # the expensive step on a large vocabulary.
    m <- as.matrix(dtm)

    # Total count of every term across all documents (column sums).
    freq <- colSums(m)
    freq
    names(freq)
    # Term positions ordered from least to most frequent.
    ord <- order(freq)

    # Export the matrix for use in Excel (written to the working directory).
    write.csv(m, file = "dtm.csv")

    # There are lots of terms; just inspect the 10 least frequent and the
    # 50 most frequent ones.
    freq[head(ord, 10)]
    freq[tail(ord, 50)]

    # Word/frequency table used for plotting.
    wf <- data.frame(word = names(freq), freq = freq)
    head(wf)
    
    
    # Bar chart of the words that occur more than 50 times (strictly > 50).
    library(ggplot2)
    p <- ggplot(subset(wf, freq > 50), aes(x = word, y = freq)) +
      # Bar heights come straight from the freq column.
      geom_bar(stat = "identity") +
      # Tilt the x-axis labels so neighbouring words do not overlap.
      theme(axis.text.x = element_text(angle = 45, hjust = 1))
    p
    
    词频率分布
    # Word cloud of every term that occurs at least 25 times.
    # The seed is fixed so the word layout is reproducible between runs.
    set.seed(142)
    library(wordcloud)
    wordcloud(words = names(freq), freq = freq, min.freq = 25)
    
    词云

    相关文章

      网友评论

          本文标题:【文本挖掘】class 2

          本文链接:https://www.haomeiwen.com/subject/cbukpqtx.html