美文网首页Java 杂谈
Java 超大文件排序

Java 超大文件排序

作者: 叫我宫城大人 | 来源:发表于2019-07-12 10:45 被阅读13次

    思想

    1. 超大文件无法一次性全部加载到内存中;
    2. 可以将超大文件切分为若干分片,每个分片在内存中排序后写入临时文件,再对这些有序分片做多路归并,将结果输出至指定文件;

    编码

    创建超大文件

    /**
     * Generates the unsorted input file.
     *
     * <p>Writes {@code LINE_COUNT} lines to {@code BIG_FILE_NAME}; each line holds one
     * random int drawn from {@code [0, Integer.MAX_VALUE)} followed by
     * {@code LINE_SEPARATOR}. An {@link IOException} is printed to stderr and generation
     * stops at that point.
     */
    private static void createBigFile() {
        final Random generator = new Random();
        try (FileWriter out = new FileWriter(BIG_FILE_NAME)) {
            int written = 0;
            while (written < LINE_COUNT) {
                // One value per line so the file can later be consumed with readLine().
                out.write(generator.nextInt(Integer.MAX_VALUE) + LINE_SEPARATOR);
                written++;
            }
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
    }
    

    超大文件分片排序

    /**
     * Splits the big input file into sorted chunk files.
     *
     * <p>Reads {@code BIG_FILE_NAME} line by line, parsing each line as an int. Every
     * {@code BATCH_SIZE} values are sorted ascending and written to
     * {@code BIG_FILE_NAME + ".tmp." + index}. Values left over after the last full batch
     * are flushed to one final, smaller chunk so that no input line is dropped.
     *
     * <p>An {@link IOException} is printed to stderr and ends the split early; chunks that
     * were completed before the failure are still returned.
     *
     * @return names of the chunk files written, in creation order
     * @throws NumberFormatException if a line cannot be parsed as an int
     */
    private static List<String> separateFile() {
        List<String> fileNameList = new ArrayList<>();
        try (BufferedReader reader = new BufferedReader(new FileReader(BIG_FILE_NAME))) {
            int index = 0;
            List<Integer> batchLineList = new ArrayList<>(BATCH_SIZE);
            String line;
            while ((line = reader.readLine()) != null) {
                batchLineList.add(Integer.valueOf(line));
                if (batchLineList.size() == BATCH_SIZE) {
                    fileNameList.add(writeSortedBatch(batchLineList, index++));
                    batchLineList.clear();
                }
            }
            // The line count is rarely an exact multiple of BATCH_SIZE: flush the remainder,
            // otherwise the trailing lines would be missing from the sorted output.
            if (!batchLineList.isEmpty()) {
                fileNameList.add(writeSortedBatch(batchLineList, index));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return fileNameList;
    }

    /**
     * Sorts one batch ascending in place and writes it, one value per line, to the chunk
     * file {@code BIG_FILE_NAME + ".tmp." + index}.
     *
     * @param batch values to sort and write; reordered by this call
     * @param index sequence number used in the chunk file name
     * @return the name of the chunk file written
     * @throws IOException if the chunk file cannot be created or written
     */
    private static String writeSortedBatch(List<Integer> batch, int index) throws IOException {
        batch.sort(Comparator.naturalOrder());
        String fileName = BIG_FILE_NAME + ".tmp." + index;
        try (FileWriter tmpWriter = new FileWriter(fileName)) {
            for (Integer val : batch) {
                tmpWriter.write(val + LINE_SEPARATOR);
            }
        }
        return fileName;
    }
    

    分片合并输出

    /**
     * Merges the sorted chunk files into {@code SORT_FILE_NAME}.
     *
     * <p>Keeps one open reader per chunk together with that chunk's current line. On each
     * round the smallest current line (compared as an int) is written to the output and the
     * owning reader advances by one line. The merge ends when every reader is exhausted.
     *
     * <p>Any exception raised while merging is printed to stderr; all chunk readers that
     * were opened are closed in every case.
     *
     * @param fileNameList names of the chunk files to merge; each is expected to hold one
     *                     int per line in ascending order
     */
    private static void mergeFile(List<String> fileNameList) {
        // Reader -> its current, not yet written line; a null value marks an exhausted reader.
        Map<BufferedReader, String> map = new HashMap<>();
        try (FileWriter writer = new FileWriter(SORT_FILE_NAME)) {
            for (String fileName : fileNameList) {
                BufferedReader tmpReader = new BufferedReader(new FileReader(fileName));
                // Register the reader before the first read so the finally block closes it
                // even when readLine() throws.
                map.put(tmpReader, null);
                map.put(tmpReader, tmpReader.readLine());
            }
            while (true) {
                Map.Entry<BufferedReader, String> minEntry = null;
                for (Map.Entry<BufferedReader, String> entry : map.entrySet()) {
                    String value = entry.getValue();
                    if (value == null) {
                        continue;
                    }
                    // Track the entry whose current line is the smallest.
                    if ((minEntry == null)
                            || (Integer.parseInt(value) < Integer.parseInt(minEntry.getValue()))) {
                        minEntry = entry;
                    }
                }
                // No entry selected means every reader is exhausted: merge finished.
                if (minEntry == null) {
                    break;
                }
                writer.write(minEntry.getValue() + LINE_SEPARATOR);
                minEntry.setValue(minEntry.getKey().readLine());
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Chunk readers are not managed by try-with-resources, so close them here.
            for (BufferedReader reader : map.keySet()) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
    

    相关文章

      网友评论

        本文标题:Java 超大文件排序

        本文链接:https://www.haomeiwen.com/subject/ditekctx.html