美文网首页
人民日报语料库的汉字词频统计 with JAVA

人民日报语料库的汉字词频统计 with JAVA

作者: Lairai | 来源:发表于2020-04-19 19:29 被阅读0次

    实在用不惯 Perl……就用 Java 写了(我先用 Notepad 把语料文件转码成了 UTF-8)

    package homework3;
    
    import java.io.*;
    import java.util.*;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    /**
     * Counts how often each Chinese word occurs in a whitespace-separated,
     * POS-tagged corpus (tokens shaped like {@code word/tag}, optionally
     * prefixed with {@code [}) and writes the counts to a text file.
     *
     * <p>Typical use: call {@link #getFrequency(String)} once per corpus file,
     * then {@link #printResult()} or {@link #printResult(String)}.
     */
    public class ChineseWordFrequency {
        /** Output location used by {@link #printResult()}. */
        private static final String DEFAULT_RESULT_PATH =
                "C:\\Coding\\javaCoding\\NLP\\src\\homework3\\1.txt";

        /** Charset used for both reading the corpus and writing the result. */
        private static final String CHARSET_NAME = "UTF-8";

        /** Byte-order mark that some editors prepend to UTF-8 files. */
        private static final char BOM = '\uFEFF';

        /** Word -> occurrence count; ordered by descending count after {@link #getFrequency(String)}. */
        Map<String, Integer> freqDic = new HashMap<String, Integer>();

        /** Regex a whole token must match to be counted: optional '[', Chinese characters, '/', a lowercase tag. */
        String markPattern = "\\[?[\\u4e00-\\u9fa5]+/[a-z].*";

        /** Regex that extracts the Chinese word itself from a matching token. */
        String wordPattern = "[\\u4e00-\\u9fa5]+";

        /**
         * Reads the corpus file as UTF-8, adds the count of every tagged Chinese
         * word to {@link #freqDic}, and re-orders {@link #freqDic} by descending
         * count. Calling it again accumulates on top of the existing counts.
         *
         * @param corpus path of the corpus file
         * @throws IOException if the file cannot be opened or read
         */
        public void getFrequency(String corpus) throws IOException {
            // Compile once per call instead of once per token; the patterns are
            // still taken from the fields so callers that changed them are honoured.
            Pattern tokenShape = Pattern.compile(markPattern);
            Pattern wordShape = Pattern.compile(wordPattern);

            // Decode explicitly as UTF-8: FileReader would use the platform
            // default charset and garble the text on non-UTF-8 systems.
            try (BufferedReader corpusReader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(corpus), CHARSET_NAME))) {
                boolean firstLine = true;
                String line;
                while ((line = corpusReader.readLine()) != null) {
                    if (firstLine) {
                        firstLine = false;
                        // A leading BOM would glue itself to the first token
                        // and prevent it from matching.
                        if (!line.isEmpty() && line.charAt(0) == BOM) {
                            line = line.substring(1);
                        }
                    }
                    for (String token : line.split("\\s+")) {
                        if (!tokenShape.matcher(token).matches()) {
                            continue;
                        }
                        Matcher matcher = wordShape.matcher(token);
                        if (matcher.find()) {
                            String word = matcher.group(0);
                            Integer seen = freqDic.get(word);
                            freqDic.put(word, seen == null ? 1 : seen + 1);
                        }
                    }
                }
            }

            // Most frequent words first.
            freqDic = sortByValueDescending(freqDic);
        }

        /**
         * Writes the result to the default, hard-coded output path.
         *
         * @throws Exception if the file cannot be written
         * @see #printResult(String)
         */
        public void printResult() throws Exception {
            printResult(DEFAULT_RESULT_PATH);
        }

        /**
         * Writes the dictionary size followed by one {@code word  count} line per
         * entry, in the current iteration order of {@link #freqDic}, as UTF-8.
         * An existing file at that path is overwritten.
         *
         * @param resultPath path of the file to write
         * @throws IOException if the file cannot be created or written
         */
        public void printResult(String resultPath) throws IOException {
            try (BufferedWriter writer = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(resultPath), CHARSET_NAME))) {
                writer.append("Size: " + freqDic.size() + '\n');
                for (Map.Entry<String, Integer> entry : freqDic.entrySet()) {
                    writer.append(entry.getKey() + "  " + entry.getValue() + '\n');
                }
            }
        }

        /**
         * Returns a new insertion-ordered map holding the entries of {@code map}
         * sorted by descending value. The sort is stable, so entries with equal
         * values keep the relative order they had in {@code map}.
         *
         * @param map the map to sort; it is not modified
         * @return a {@link LinkedHashMap} ordered from largest to smallest value
         */
        private static <K, V extends Comparable<? super V>> Map<K, V> sortByValueDescending(Map<K, V> map) {
            List<Map.Entry<K, V>> entries = new ArrayList<Map.Entry<K, V>>(map.entrySet());
            Collections.sort(entries, new Comparator<Map.Entry<K, V>>() {
                @Override
                public int compare(Map.Entry<K, V> left, Map.Entry<K, V> right) {
                    // Swapped operands give descending order.
                    return right.getValue().compareTo(left.getValue());
                }
            });

            Map<K, V> sorted = new LinkedHashMap<K, V>();
            for (Map.Entry<K, V> entry : entries) {
                sorted.put(entry.getKey(), entry.getValue());
            }
            return sorted;
        }
    }
    

    相关文章

      网友评论

          本文标题:人民日报语料库的汉字词频统计 with JAVA

          本文链接:https://www.haomeiwen.com/subject/jubbbhtx.html