Go 实现词频统计

作者: Yohann丶blog | 来源:发表于2020-05-27 15:28 被阅读0次

Go 实现词频统计
Python实现词频统计
MapReduce实现词频统计
python词频统计实例
es实战-使用IK分词器进行词频统计
用Py做文本分析3：制作词云图
词频统计
词频统计
Hadoop MapReduce 的基本helloworld程序
Hadoop入门-本地实现词频统计

a1ec08fa513d2697d52da0805cfbb2fb4216d875.jpeg

功能

统计多个文件中英文单词出现的次数
按照词频从多到少排序输出
支持并发

实现

创建 file.txt 内容如下：

You were the shadow to my light，Did you feel us?
Another start，You fade away
Afraid our aim is out of sight，Wanna see us，Alive
Where are you now? Where are you now?Where are you now?
Was it all in my fantasy?Where are you now?Were you only imaginary?
Where are you now? Atlantis Under the sea Under the sea
Where are you now? Another dream，The monster's running wild inside of me
I'm faded I'm faded  So lost, I'm faded
I'm faded  So lost, I'm faded
These shallow waters never met what I needed I'm letting go a deeper dive
Eternal silence of the sea. I'm breathing alive
Where are you now? Where are you now?
Under the bright but faded lights You've set my heart on fire
Where are you now? Where are you now?
Where are you now? Atlantis
Under the sea Under the sea
Where are you now? Another dream The monster's running wild inside of me
I'm faded I'm faded So lost, I'm faded
I'm faded So lost, I'm faded

创建 wordcount.go 内容如下：

package main

import (
    "bufio"
    "fmt"
    "io"
    "log"
    "os"
    "sort"
    "strings"
    "unicode"
    "unicode/utf8"
    "path/filepath"
)

type Pair struct {
    Key   string
    Value int
}

// PariList实现了sort接口，可以使用sort.Sort对其排序

type PairList []Pair

func (p PairList) Swap(i, j int)      { p[i], p[j] = p[j], p[i] }
func (p PairList) Len() int           { return len(p) }
func (p PairList) Less(i, j int) bool { return p[j].Value < p[i].Value } // 逆序

// 提取单词
func SplitOnNonLetters(s string) []string {
    notALetter := func(char rune) bool { return !unicode.IsLetter(char) }
    return strings.FieldsFunc(s, notALetter)
}

/*
   基于map实现了类型WordCount, 并对期实现了Merge(), Report(), SortReport(), UpdateFreq(),
   WordFreqCounter() 方法
*/

type WordCount map[string]int

// 用于合并两个WordCount
func (source WordCount) Merge(wordcount WordCount) WordCount {
    for k, v := range wordcount {
        source[k] += v
    }

    return source
}

// 打印词频统计情况
func (wordcount WordCount) Report() {
    words := make([]string, 0, len(wordcount))
    wordWidth, frequencyWidth := 0, 0
    for word, frequency := range wordcount {
        words = append(words, word)
        if width := utf8.RuneCountInString(word); width > wordWidth {
            wordWidth = width
        }
        if width := len(fmt.Sprint(frequency)); width > frequencyWidth {
            frequencyWidth = width
        }
    }
    sort.Strings(words)
    gap := wordWidth + frequencyWidth - len("Word") - len("Frequency")
    fmt.Printf("Word %*s%s\n", gap, " ", "Frequency")
    for _, word := range words {
        fmt.Printf("%-*s %*d\n", wordWidth, word, frequencyWidth,
            wordcount[word])
    }
}

// 从多到少打印词频
func (wordcount WordCount) SortReport() {
    p := make(PairList, len(wordcount))
    i := 0
    for k, v := range wordcount { // 将wordcount map转换成PairList
        p[i] = Pair{k, v}
        i++
    }

    sort.Sort(p) // 因为PairList实现了排序接口，所以可以使用sort.Sort()对其排序

    wordWidth, frequencyWidth := 0, 0
    for _, pair := range p {
        word, frequency := pair.Key, pair.Value
        if width := utf8.RuneCountInString(word); width > wordWidth {
            wordWidth = width
        }
        if width := len(fmt.Sprint(frequency)); width > frequencyWidth {
            frequencyWidth = width
        }
    }
    gap := wordWidth + frequencyWidth - len("Word") - len("Frequency")
    fmt.Printf("Word %*s%s\n", gap, " ", "Frequency")

    for _, pair := range p {
        fmt.Printf("%-*s %*d\n", wordWidth, pair.Key, frequencyWidth,
            pair.Value)
    }

}

// 从文件中读取单词，并更新其出现的次数
func (wordcount WordCount) UpdateFreq(filename string) {
    var file *os.File
    var err error

    if file, err = os.Open(filename); err != nil {
        log.Println("failed to open the file: ", err)
        return
    }
    defer file.Close() // 本函数退出之前时，关闭文件

    reader := bufio.NewReader(file)
    for {
        line, err := reader.ReadString('\n')
        for _, word := range SplitOnNonLetters(strings.TrimSpace(line)) {
            if len(word) > utf8.UTFMax ||
                utf8.RuneCountInString(word) > 1 {
                wordcount[strings.ToLower(word)] += 1
            }
        }
        if err != nil {
            if err != io.EOF {
                log.Println("failed to finish reading the file: ", err)
            }
            break
        }
    }
}

// 并发统计单词频次
func (wordcount WordCount) WordFreqCounter(files []string) {

    results := make(chan Pair, len(files))  // goroutine 将结果发送到该channel
    done := make(chan struct{}, len(files)) // 每个goroutine工作完成后，发送一个空结构体到该channel，表示工作完成

    for i := 0; i < len(files); { // 有多少个文件就开启多少个goroutine, 使用匿名函数的方式
        go func(done chan<- struct{}, results chan<- Pair, filename string) {
            wordcount := make(WordCount)
            wordcount.UpdateFreq(filename)
            for k, v := range wordcount {
                pair := Pair{k, v}
                results <- pair
            }
            done <- struct{}{}
        }(done, results, files[i])

        i++
    }

    for working := len(files); working > 0; { // 监听通道，直到所有的工作goroutine完成任务时才退出
        select {
        case pair := <-results: // 接收发送到通道中的统计结果
            wordcount[pair.Key] += pair.Value

        case <-done: // 判断工作goroutine是否全部完成
            working--

        }
    }

DONE: // 再次启动for循环处理通道中还未处理完的值
    for {
        select {
        case pair := <-results:
            wordcount[pair.Key] += pair.Value
        default:
            break DONE
        }
    }

    close(results)
    close(done)

}


func main() {
    if len(os.Args) == 1 || os.Args[1] == "-h" || os.Args[1] == "--help" {
        fmt.Printf("usage: %s <file1> [<file2> [... <fileN>]]\n",
            filepath.Base(os.Args[0]))
        os.Exit(1)
    }

    wordcounter := make(WordCount)

    wordcounter.WordFreqCounter(os.Args[1:])

    wordcounter.SortReport()
}

编译

$ go build wordcount.go
$ ls
wordcount  wordcount.go

执行go build wordcount.go对 wordcount.go 进行编译。执行后，查看当前目录已经有了一个可执行文件 wordcount

执行

$ ./wordcount file.txt |head -n 6
Word   Frequency
you       17
where     12
are       12
now       12
faded     11

为了方便输出，这里只打印了排名前 5 的单词

网友评论

本文标题：Go 实现词频统计

本文链接：https://www.haomeiwen.com/subject/pjiwahtx.html

延伸阅读

深度阅读

您也可以注册成为美文阅读网的作者，发表您的原创作品、分享您的心情！

Go 实现词频统计

功能

实现

相关文章

Go 实现词频统计

Python实现词频统计

MapReduce实现词频统计

python词频统计实例

es实战-使用IK分词器进行词频统计

用Py做文本分析3：制作词云图

词频统计

词频统计

Hadoop MapReduce 的基本helloworld程序

Hadoop入门-本地实现词频统计

网友评论

延伸阅读

深度阅读

栏目导航

热点阅读