package homework3;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Builds a frequency dictionary of (word, part-of-speech initial) pairs from a
 * whitespace-separated, slash-tagged corpus (tokens such as {@code word/tag}) and
 * writes the sorted result to a text file.
 *
 * <p>Only tokens that consist of an optional leading {@code [}, a run of Chinese
 * characters (U+4E00 to U+9FA5), a slash and a lowercase tag are counted. The word
 * class recorded for a token is the first lowercase letter of its tag.
 */
public class Task2 {
    /** Output location used by the no-argument {@link #printResult()}. */
    private static final String DEFAULT_RESULT_PATH =
            "C:\\Coding\\javaCoding\\NLP\\src\\homework3\\2.txt";

    /** Occurrence count per (word, word class) pair; accumulates across calls to getFrequency. */
    Map<WordAndClass, Integer> freqDic = new HashMap<>();
    /** Sorted rows produced by the most recent getFrequency call; consumed by printResult. */
    List<Triple> resList = new ArrayList<>();
    /** Regex a whole token must match to be counted. */
    String markPattern = "\\[?[\\u4e00-\\u9fa5]+/[a-z].*";
    /** Regex locating the Chinese word inside a token. */
    String wordPattern = "[\\u4e00-\\u9fa5]+";
    /** Regex locating the first letter of the part-of-speech tag inside a token. */
    String wordClassPattern = "[a-z]";

    /**
     * Reads the corpus file, adds every tagged token to {@link #freqDic} and rebuilds
     * {@link #resList} sorted by word (ascending) and then by frequency (descending).
     *
     * <p>The file is decoded as UTF-8, the same encoding used for the output file.
     * Counts accumulate if this method is called more than once; the result list is
     * rebuilt from scratch each time so it never contains duplicate rows.
     *
     * @param corpus path of the tagged corpus file
     * @throws IOException if the corpus cannot be opened or read
     */
    public void getFrequency(String corpus) throws IOException {
        // Compile once per call instead of once per token.
        Pattern tokenShape = Pattern.compile(markPattern);
        Pattern wordShape = Pattern.compile(wordPattern);
        Pattern classShape = Pattern.compile(wordClassPattern);

        try (BufferedReader corpusReader = new BufferedReader(
                new InputStreamReader(new FileInputStream(corpus), StandardCharsets.UTF_8))) {
            String line;
            while ((line = corpusReader.readLine()) != null) {
                for (String mark : line.split("\\s+")) {
                    countMark(mark, tokenShape, wordShape, classShape);
                }
            }
        }

        // Rebuild the result list so repeated calls do not append duplicate rows.
        resList.clear();
        for (Map.Entry<WordAndClass, Integer> entry : freqDic.entrySet()) {
            WordAndClass key = entry.getKey();
            resList.add(new Triple(key.word, key.wordClass, entry.getValue()));
        }

        resList.sort((o1, o2) -> {
            int byWord = o1.word.compareTo(o2.word);
            if (byWord != 0) {
                return byWord;
            }
            // Several word classes for the same word: most frequent first.
            return Integer.compare(o2.frequency, o1.frequency);
        });
    }

    /** Increments the count for one token if it is a well-formed tagged Chinese word. */
    private void countMark(String mark, Pattern tokenShape, Pattern wordShape, Pattern classShape) {
        if (!tokenShape.matcher(mark).matches()) {
            return;
        }
        Matcher wordMatcher = wordShape.matcher(mark);
        Matcher wordClassMatcher = classShape.matcher(mark);
        if (wordMatcher.find() && wordClassMatcher.find()) {
            WordAndClass key = new WordAndClass(wordMatcher.group(0), wordClassMatcher.group(0));
            freqDic.merge(key, 1, Integer::sum);
        }
    }

    /**
     * Writes the result list to the default, hard-coded output file.
     *
     * @throws Exception if the file cannot be created or written
     */
    public void printResult() throws Exception {
        printResult(DEFAULT_RESULT_PATH);
    }

    /**
     * Writes the result list to the given file as UTF-8: a {@code Size: N} header line
     * followed by one {@code word wordClass frequency} line per entry.
     *
     * @param path destination file; created if missing, overwritten otherwise
     * @throws IOException if the file cannot be created or written
     */
    public void printResult(String path) throws IOException {
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(path), StandardCharsets.UTF_8))) {
            writer.append("Size: " + resList.size() + '\n');
            for (Triple t : resList) {
                writer.append(t.word + " " + t.wordClass + " " + t.frequency + '\n');
            }
        }
    }

    /** Map key pairing a word with the first letter of its part-of-speech tag. */
    private static class WordAndClass {
        final String word; // the word itself
        final String wordClass; // first letter of the part-of-speech tag

        public WordAndClass(String word, String wordClass) {
            this.word = word;
            this.wordClass = wordClass;
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) {
                return true;
            }
            if (o == null || getClass() != o.getClass()) {
                return false;
            }
            WordAndClass that = (WordAndClass) o;
            return Objects.equals(word, that.word) && Objects.equals(wordClass, that.wordClass);
        }

        @Override
        public int hashCode() {
            // Same formula as before so HashMap iteration order (and thus the order of
            // rows that tie in the sort) is unchanged.
            int result = word != null ? word.hashCode() : 0;
            result = 31 * result + (wordClass != null ? wordClass.hashCode() : 0);
            return result;
        }
    }

    /** One output row: a word, its word class and how often the pair occurred. */
    private static class Triple {
        final String word;
        final String wordClass;
        final int frequency;

        public Triple(String word, String wordClass, int frequency) {
            this.word = word;
            this.wordClass = wordClass;
            this.frequency = frequency;
        }
    }
}
// User comments (trailing web-page text; not part of the program)