美文网首页
人民日报语料库 - 词性分类 by JAVA

人民日报语料库 - 词性分类 by JAVA

作者: Lairai | 来源:发表于2020-04-19 22:59 被阅读0次
package homework3;

import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Counts how often each (word, part-of-speech initial) pair occurs in a
 * tagged corpus and writes the counts to a text file.
 *
 * <p>The corpus is read line by line; each line is split on whitespace into
 * "marks". A mark is counted when it has the shape {@code [optional '[']
 * <CJK characters>/<lowercase letter>...}, e.g. a CJK word followed by
 * {@code /n}. Only the first lowercase letter of the tag is kept as the
 * word class.
 *
 * <p>Counts accumulate across calls to {@link #getFrequency(String)}, so
 * several corpus files can be merged into one dictionary. The result list is
 * rebuilt from the dictionary on every call.
 */
public class Task2 {
    /** Output file used by {@link #printResult()} when no path is given. */
    private static final String DEFAULT_RESULT_PATH =
            "C:\\Coding\\javaCoding\\NLP\\src\\homework3\\2.txt";

    /** Accumulated frequency dictionary: (word, word class) -> occurrences. */
    Map<WordAndClass, Integer> freqDic = new HashMap<>();
    /** Sorted rows derived from {@link #freqDic}; this is what gets printed. */
    List<Triple> resList = new ArrayList<>();

    /** Shape of a whole mark that should be counted. */
    String markPattern = "\\[?[\\u4e00-\\u9fa5]+/[a-z].*";
    /** A run of CJK characters; the first run in a mark is the word. */
    String wordPattern = "[\\u4e00-\\u9fa5]+";
    /** A lowercase letter; the first one in a mark is the word-class initial. */
    String wordClassPattern = "[a-z]";

    /**
     * Reads {@code corpus} with the platform default charset and updates the
     * frequency dictionary and the sorted result list.
     *
     * @param corpus path of the tagged corpus file
     * @throws IOException if the file cannot be opened or read
     */
    public void getFrequency(String corpus) throws IOException {
        getFrequency(corpus, null);
    }

    /**
     * Reads {@code corpus} with the given charset and updates the frequency
     * dictionary and the sorted result list.
     *
     * @param corpus      path of the tagged corpus file
     * @param charsetName name of the charset used to decode the file, or
     *                    {@code null} to use the platform default
     * @throws IOException if the file cannot be opened or read, or if
     *                     {@code charsetName} is not a supported charset
     */
    public void getFrequency(String corpus, String charsetName) throws IOException {
        // Compile once per call instead of once per mark; the pattern strings
        // are instance fields, so they are re-read on every call.
        Pattern markRegex = Pattern.compile(markPattern);
        Pattern wordRegex = Pattern.compile(wordPattern);
        Pattern wordClassRegex = Pattern.compile(wordClassPattern);

        // try-with-resources closes the file even when reading fails.
        try (InputStream in = new FileInputStream(corpus);
             BufferedReader corpusReader = new BufferedReader(
                     charsetName == null
                             ? new InputStreamReader(in)
                             : new InputStreamReader(in, charsetName))) {
            String line;
            while ((line = corpusReader.readLine()) != null) {
                for (String mark : line.split("\\s+")) {
                    countMark(mark, markRegex, wordRegex, wordClassRegex);
                }
            }
        }

        rebuildResultList();
    }

    /** Adds one occurrence to the dictionary if {@code mark} is a tagged CJK word. */
    private void countMark(String mark, Pattern markRegex, Pattern wordRegex, Pattern wordClassRegex) {
        if (!markRegex.matcher(mark).matches()) {
            return;
        }
        Matcher wordMatcher = wordRegex.matcher(mark);
        Matcher wordClassMatcher = wordClassRegex.matcher(mark);
        if (wordMatcher.find() && wordClassMatcher.find()) {
            WordAndClass key = new WordAndClass(wordMatcher.group(), wordClassMatcher.group());
            freqDic.merge(key, 1, Integer::sum);
        }
    }

    /** Replaces the contents of {@link #resList} with the sorted dictionary entries. */
    private void rebuildResultList() {
        // Start from scratch: appending to the previous rows would duplicate
        // every entry when getFrequency is called more than once.
        resList.clear();
        for (Map.Entry<WordAndClass, Integer> entry : freqDic.entrySet()) {
            WordAndClass key = entry.getKey();
            resList.add(new Triple(key.word, key.wordClass, entry.getValue()));
        }
        resList.sort((a, b) -> {
            int byWord = a.word.compareTo(b.word);
            if (byWord != 0) {
                return byWord;
            }
            // Several word classes for one word: highest frequency first.
            int byFrequency = Integer.compare(b.frequency, a.frequency);
            if (byFrequency != 0) {
                return byFrequency;
            }
            // Final tie-break so the output does not depend on HashMap order.
            return a.wordClass.compareTo(b.wordClass);
        });
    }

    /**
     * Writes the result list to the default output file as UTF-8.
     *
     * @throws Exception if the file cannot be created or written
     */
    public void printResult() throws Exception {
        printResult(DEFAULT_RESULT_PATH);
    }

    /**
     * Writes the result list to {@code outputPath} as UTF-8. The first line is
     * {@code Size: <row count>}; each following line is
     * {@code <word> <word class> <frequency>}.
     *
     * @param outputPath path of the file to create or overwrite
     * @throws IOException if the file cannot be created or written
     */
    public void printResult(String outputPath) throws IOException {
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(outputPath), "UTF-8"))) {
            writer.append("Size: " + resList.size() + '\n');
            for (Triple t : resList) {
                writer.append(t.word + " " + t.wordClass + " " + t.frequency + '\n');
            }
        }
    }

    /** Dictionary key: a word together with the initial letter of its tag. */
    private static final class WordAndClass {
        final String word;        // the word itself
        final String wordClass;   // first letter of the part-of-speech tag

        WordAndClass(String word, String wordClass) {
            this.word = word;
            this.wordClass = wordClass;
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) {
                return true;
            }
            if (o == null || getClass() != o.getClass()) {
                return false;
            }
            WordAndClass that = (WordAndClass) o;
            return Objects.equals(word, that.word) && Objects.equals(wordClass, that.wordClass);
        }

        @Override
        public int hashCode() {
            return Objects.hash(word, wordClass);
        }
    }

    /** One output row: word, word class and how often the pair occurred. */
    private static final class Triple {
        final String word;
        final String wordClass;
        final int frequency;

        Triple(String word, String wordClass, int frequency) {
            this.word = word;
            this.wordClass = wordClass;
            this.frequency = frequency;
        }
    }
}

相关文章

网友评论

      本文标题:人民日报语料库 - 词性分类 by JAVA

      本文链接:https://www.haomeiwen.com/subject/ouawbhtx.html