美文网首页
人民日报语料库 - 百家姓 with JAVA

人民日报语料库 - 百家姓 with JAVA

作者: Lairai | 来源:发表于2020-04-20 13:18 被阅读0次
    package homework3;
    
    import java.io.*;
    import java.util.*;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    /**
     * Collects family-name statistics from a whitespace-tokenised, tagged corpus.
     *
     * <p>A token is treated as part of a person's name when it matches
     * {@link #markPattern} (a run of CJK characters followed by an {@code /nr} tag).
     * A person is recognised as two such tokens in direct succession on the same
     * line: the first is taken as the family name, the second as the personal name.
     *
     * <p>Typical use: call {@link #getFrequency(String)} one or more times, then
     * {@link #printResult()} or {@link #printResult(String)}.
     */
    public class Task3 {
        /** Scanner state: the next name token is read as a family name. */
        static final int FAMILY_NAME = 0;
        /** Scanner state: a family name was just read; the next name token is the personal name. */
        static final int PERSONAL_NAME = 1;

        /** Family name -> (full name -> number of occurrences). */
        Map<String, Map<Name, Integer>> freqDicFamilyName = new HashMap<>();
        /** One element per family name; ordered by total frequency, highest first, after getFrequency. */
        List<Element> resList = new ArrayList<>();

        /** Regex a whole token must match to count as a tagged family or personal name. */
        String markPattern = "\\[?[\\u4e00-\\u9fa5]+/nr.*";
        /** Regex that extracts the bare name characters from a matching token. */
        String namePattern = "[\\u4e00-\\u9fa5]+";

        /**
         * Scans a corpus file and adds its names to the statistics, then rebuilds
         * {@link #resList}. The file is decoded with the platform default charset
         * (that is what {@link FileReader} does); use
         * {@link #getFrequency(String, String)} to name the charset explicitly.
         *
         * <p>Counts accumulate across calls, so several corpus files can be merged.
         *
         * @param corpus path of the corpus file
         * @throws IOException if the file cannot be opened or read
         */
        public void getFrequency(String corpus) throws IOException {
            try (BufferedReader corpusReader = new BufferedReader(new FileReader(corpus))) {
                countNames(corpusReader);
            }
            rebuildResults();
        }

        /**
         * Same as {@link #getFrequency(String)} but decodes the file with the given charset.
         *
         * @param corpus      path of the corpus file
         * @param charsetName name of the charset used to decode the file, e.g. {@code "UTF-8"}
         * @throws IOException if the file cannot be opened or read, or the charset is not supported
         */
        public void getFrequency(String corpus, String charsetName) throws IOException {
            try (BufferedReader corpusReader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(corpus), charsetName))) {
                countNames(corpusReader);
            }
            rebuildResults();
        }

        /** Reads the corpus line by line and counts every (family name, personal name) pair. */
        private void countNames(BufferedReader corpusReader) throws IOException {
            // Compile once per scan instead of once per token.
            Pattern markRegex = Pattern.compile(markPattern);
            Pattern nameRegex = Pattern.compile(namePattern);

            String line;
            while ((line = corpusReader.readLine()) != null) {
                // The state is per line: a family name never pairs with a token on a later line.
                int flag = FAMILY_NAME;
                String lastFamilyName = null;
                for (String mark : line.split("\\s+")) {
                    Matcher nameMatcher = nameRegex.matcher(mark);
                    if (!markRegex.matcher(mark).matches() || !nameMatcher.find()) {
                        // Not a name token: a pending family name had no personal name right
                        // after it, so drop it instead of pairing it with some later,
                        // unrelated name token.
                        flag = FAMILY_NAME;
                        continue;
                    }
                    String name = nameMatcher.group(0);
                    if (flag == FAMILY_NAME) {
                        lastFamilyName = name;
                        flag = PERSONAL_NAME;
                    } else {
                        Name fullName = new Name(lastFamilyName, name);
                        freqDicFamilyName
                                .computeIfAbsent(lastFamilyName, k -> new HashMap<>())
                                .merge(fullName, 1, Integer::sum);
                        flag = FAMILY_NAME;
                    }
                }
            }
        }

        /** Rebuilds resList from the accumulated counts, most frequent family name first. */
        private void rebuildResults() {
            // Start from scratch so repeated getFrequency calls do not duplicate family names.
            resList.clear();
            for (Map.Entry<String, Map<Name, Integer>> entry : freqDicFamilyName.entrySet()) {
                Map<Name, Integer> nameMap = entry.getValue();
                int sumFreq = 0;
                for (int count : nameMap.values()) {
                    sumFreq += count;
                }
                resList.add(new Element(entry.getKey(), sumFreq, sortByValueDescending(nameMap)));
            }
            resList.sort((o1, o2) -> Integer.compare(o2.frequencySum, o1.frequencySum));
        }

        /**
         * Writes the result to the fixed default location used by the original program.
         *
         * @throws Exception if the file cannot be written
         */
        public void printResult() throws Exception {
            printResult("C:\\Coding\\javaCoding\\NLP\\src\\homework3\\3.txt");
        }

        /**
         * Writes the result as UTF-8 text: a {@code Size: n} line, then one line per
         * family name holding the family name, its total frequency and up to five of
         * its full names with their counts (most frequent first).
         *
         * @param outputPath file to create or overwrite
         * @throws IOException if the file cannot be written
         */
        public void printResult(String outputPath) throws IOException {
            try (BufferedWriter writer = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(outputPath), "UTF-8"))) {
                writer.append("Size: " + resList.size() + "\n");
                for (Element e : resList) {
                    writer.append(e.commonName + " " + e.frequencySum + "   ");
                    // Only the first five names of each family are listed.
                    int count = 0;
                    for (Map.Entry<Name, Integer> entry : e.nameMap.entrySet()) {
                        writer.append(entry.getKey().toString() + " " + entry.getValue() + " ");
                        ++count;
                        if (count >= 5) {
                            break;
                        }
                    }
                    writer.append('\n');
                }
            }
        }

        /** A full name: family name plus personal name. Used as a hash key. */
        private static class Name {
            final String familyName;
            final String personalName;

            public Name(String familyName, String personalName) {
                this.familyName = familyName;
                this.personalName = personalName;
            }

            @Override
            public boolean equals(Object o) {
                if (this == o) {
                    return true;
                }
                if (o == null || getClass() != o.getClass()) {
                    return false;
                }
                Name other = (Name) o;
                return Objects.equals(familyName, other.familyName)
                        && Objects.equals(personalName, other.personalName);
            }

            @Override
            public int hashCode() {
                // Same formula as before, so HashMap iteration order (and therefore the
                // order of equally frequent names in the output) does not change.
                return 31 * Objects.hashCode(familyName) + Objects.hashCode(personalName);
            }

            @Override
            public String toString() {
                return familyName + personalName;
            }
        }

        /** Result record for one family name. */
        private static class Element {
            /** The shared family name. */
            final String commonName;
            /** Total number of occurrences of all names with this family name. */
            final int frequencySum;
            /** Full name -> count, iterated from most to least frequent. */
            final Map<Name, Integer> nameMap;

            public Element(String commonName, int frequencySum, Map<Name, Integer> nameMap) {
                this.commonName = commonName;
                this.frequencySum = frequencySum;
                this.nameMap = nameMap;
            }
        }

        /**
         * Returns a copy of the map whose iteration order is by value, largest first.
         * The sort is stable, so entries with equal values keep the order in which
         * the input map iterates them.
         */
        private static <K, V extends Comparable<? super V>> Map<K, V> sortByValueDescending(Map<K, V> map) {
            List<Map.Entry<K, V>> entries = new ArrayList<>(map.entrySet());
            entries.sort((a, b) -> b.getValue().compareTo(a.getValue()));

            Map<K, V> result = new LinkedHashMap<>();
            for (Map.Entry<K, V> entry : entries) {
                result.put(entry.getKey(), entry.getValue());
            }
            return result;
        }
    }
    

    相关文章

      网友评论

          本文标题:人民日报语料库 - 百家姓 with JAVA

          本文链接:https://www.haomeiwen.com/subject/ypwrihtx.html