package homework3;
import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Builds a family-name frequency table from a word-segmented, POS-tagged corpus.
 *
 * <p>Tokens tagged {@code /nr} are read as alternating family name / personal name
 * pairs. Each pair is counted under its family name, and the per-family totals are
 * exposed through {@link #resList} and written out by {@link #printResult()}.
 */
public class Task3 {
    /** Scanner state: the next {@code /nr} token is taken as a family name. */
    static final int FAMILY_NAME = 0;
    /** Scanner state: the next {@code /nr} token is taken as a personal name. */
    static final int PERSONAL_NAME = 1;

    /** Output location used by the no-argument {@link #printResult()}. */
    private static final String DEFAULT_RESULT_PATH = "C:\\Coding\\javaCoding\\NLP\\src\\homework3\\3.txt";
    /** Maximum number of full names listed per family name in the report. */
    private static final int MAX_NAMES_PER_FAMILY = 5;

    /** Family name -> (full name -> occurrence count). Accumulates across getFrequency calls. */
    Map<String, Map<Name, Integer>> freqDicFamilyName = new HashMap<>();
    /** Report rows, one per family name, ordered by total frequency (highest first). */
    List<Element> resList = new ArrayList<>();
    /** Regex a whole token must match to count as a tagged family or personal name. */
    String markPattern = "\\[?[\\u4e00-\\u9fa5]+/nr.*";
    /** Regex that extracts the name characters from a matching token. */
    String namePattern = "[\\u4e00-\\u9fa5]+";

    /**
     * Scans a corpus file, adds every (family name, personal name) pair found to the
     * frequency dictionary, and rebuilds {@link #resList} from the whole dictionary.
     *
     * <p>The file is read with the platform default charset (that is what
     * {@code FileReader(String)} uses). The scanner assumes a family name token is always
     * followed by its personal name token; the state is kept across unrelated tokens and
     * across lines.
     * NOTE(review): a lone {@code /nr} token (a name without a separate family part) would
     * shift the pairing for the names that follow - confirm the corpus never contains one.
     *
     * @param corpus path of the tagged corpus file
     * @throws IOException if the file cannot be opened or read
     */
    public void getFrequency(String corpus) throws IOException {
        // Compile once per call instead of once per token; the pattern fields stay
        // plain strings so code that reassigns them keeps working.
        final Pattern taggedName = Pattern.compile(markPattern);
        final Pattern nameChars = Pattern.compile(namePattern);

        String lastFamilyName = null; // most recently seen family name
        int flag = FAMILY_NAME;       // family name is assumed to come first

        // try-with-resources: the reader is closed even when reading fails.
        try (BufferedReader corpusReader = new BufferedReader(new FileReader(corpus))) {
            String line;
            while ((line = corpusReader.readLine()) != null) {
                for (String mark : line.split("\\s+")) {
                    if (!taggedName.matcher(mark).matches()) {
                        continue; // not a person-name token
                    }
                    Matcher nameMatcher = nameChars.matcher(mark);
                    if (!nameMatcher.find()) {
                        continue;
                    }
                    String name = nameMatcher.group(0);
                    if (flag == FAMILY_NAME) {
                        lastFamilyName = name;
                        flag = PERSONAL_NAME;
                    } else {
                        // Personal name: count the full name under its family name.
                        Name fullName = new Name(lastFamilyName, name);
                        freqDicFamilyName
                                .computeIfAbsent(lastFamilyName, key -> new HashMap<>())
                                .merge(fullName, 1, Integer::sum);
                        flag = FAMILY_NAME;
                    }
                }
            }
        }
        rebuildResults();
    }

    /**
     * Recomputes {@link #resList} from the dictionary. The list is cleared first so a
     * repeated {@link #getFrequency(String)} call does not leave duplicate rows behind.
     */
    private void rebuildResults() {
        resList.clear();
        for (Map.Entry<String, Map<Name, Integer>> entry : freqDicFamilyName.entrySet()) {
            int sumFreq = 0;
            for (int count : entry.getValue().values()) {
                sumFreq += count;
            }
            // The sorted copy goes into the row; the dictionary keeps its own map.
            resList.add(new Element(entry.getKey(), sumFreq, sortByValueDescending(entry.getValue())));
        }
        // Highest total frequency first.
        resList.sort((o1, o2) -> Integer.compare(o2.frequencySum, o1.frequencySum));
    }

    /**
     * Writes the report to the default, hard-coded location.
     *
     * @throws Exception if the file cannot be created or written
     */
    public void printResult() throws Exception {
        printResult(DEFAULT_RESULT_PATH);
    }

    /**
     * Writes the report as UTF-8 text: a {@code Size:} header line, then one line per
     * family name holding the family name, its total frequency, and up to five
     * full-name/frequency pairs in the order stored in the row.
     *
     * @param resultPath path of the file to create or overwrite
     * @throws IOException if the file cannot be created or written
     */
    public void printResult(String resultPath) throws IOException {
        // try-with-resources: the writer is flushed and closed even if an append fails.
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(resultPath), "UTF-8"))) {
            writer.append("Size: " + resList.size() + '\n');
            for (Element e : resList) {
                writer.append(e.commonName + " " + e.frequencySum + " ");
                int count = 0;
                for (Map.Entry<Name, Integer> entry : e.nameMap.entrySet()) {
                    writer.append(entry.getKey().toString() + " " + entry.getValue() + " ");
                    if (++count >= MAX_NAMES_PER_FAMILY) {
                        break;
                    }
                }
                writer.append('\n');
            }
        }
    }

    /** A full name (family name + personal name); used as a hash-map key. */
    private static class Name {
        final String familyName;
        final String personalName;

        Name(String familyName, String personalName) {
            this.familyName = familyName;
            this.personalName = personalName;
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) {
                return true;
            }
            if (o == null || getClass() != o.getClass()) {
                return false;
            }
            Name other = (Name) o;
            return Objects.equals(familyName, other.familyName)
                    && Objects.equals(personalName, other.personalName);
        }

        @Override
        public int hashCode() {
            // Same formula as before (null hashes to 0) so map iteration order is unchanged.
            return 31 * Objects.hashCode(familyName) + Objects.hashCode(personalName);
        }

        @Override
        public String toString() {
            return familyName + personalName;
        }
    }

    /** One report row: a family name, its total count, and its full names by count. */
    private static class Element {
        /** The shared family name. */
        final String commonName;
        /** Sum of the counts of all full names with this family name. */
        final int frequencySum;
        /** Full name -> count, iterated from highest count to lowest. */
        final Map<Name, Integer> nameMap;

        Element(String commonName, int frequencySum, Map<Name, Integer> nameMap) {
            this.commonName = commonName;
            this.frequencySum = frequencySum;
            this.nameMap = nameMap;
        }
    }

    /**
     * Returns a new insertion-ordered map holding the entries of {@code map} sorted by
     * value, largest first. The sort is stable, so equal values keep the iteration order
     * of the input map. The input map is not modified.
     */
    private <K, V extends Comparable<? super V>> Map<K, V> sortByValueDescending(Map<K, V> map) {
        List<Map.Entry<K, V>> entries = new ArrayList<>(map.entrySet());
        entries.sort((a, b) -> b.getValue().compareTo(a.getValue()));
        Map<K, V> result = new LinkedHashMap<>();
        for (Map.Entry<K, V> entry : entries) {
            result.put(entry.getKey(), entry.getValue());
        }
        return result;
    }
}
// Web-page residue ("reader comments" heading) - not part of the program.