在《使用Flink批处理完成数据比对(对账)一》中,我们只是简单地实现了F000/F113/F114的情况。如果需求场景还需要实现F115的情况,该怎么办呢?
编写代码
在上一篇文章的基础上完成代码如下:
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.operators.MapOperator;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.java.BatchTableEnvironment;
import java.util.List;
/**
 * <strong>Reconciliation flow</strong> between two data sets A and B.
 *
 * <p>Result codes, as used by the sample data below:
 * <ul>
 * <li>F000 - the key exists on both sides and the compare field matches</li>
 * <li>F113 - the key exists only in A</li>
 * <li>F114 - the key exists only in B</li>
 * <li>F115 - the key exists on both sides but the compare field differs</li>
 * </ul>
 *
 * <ol>
 * <li>Each side is loaded into two tables:
 * <ul>
 * <li>table1: the unique key only (e.g. OrderNO)</li>
 * <li>table2: the unique key plus the compare field (e.g. OrderNO + OrderMoney)</li>
 * </ul>
 * </li>
 * <li>Comparison:
 * <ul>
 * <li>the difference of the two table1 tables yields F113 and F114</li>
 * <li>the intersection of the two table1 tables yields F000 + F115</li>
 * <li>the difference of the two table2 tables yields F113 + F115</li>
 * <li>F113 + F115 is stripped of the compare field so that only the key remains</li>
 * <li>removing F113 from F113 + F115 yields F115</li>
 * <li>removing F115 from F000 + F115 yields F000</li>
 * </ul>
 * </li>
 * </ol>
 *
 * <p>All differences use set semantics ({@code minus}), matching {@code intersect}, which
 * also removes duplicates. With the multiset variant ({@code minusAll}) a key occurring
 * more often in A than in B would be reported as F113 although it exists on both sides.
 */
public class BatchJob2 {

    /** Separator between the unique key and the compare field in a "compare" record. */
    private static final char FIELD_SEPARATOR = ':';

    /**
     * Builds the sample data sets, runs the reconciliation and prints one list per result code.
     *
     * @param args command line arguments (unused)
     * @throws Exception if executing the Flink program fails
     */
    public static void main(String[] args) throws Exception {
        // set up the batch execution environment
        final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        // Table Environment
        BatchTableEnvironment tableEnvironment = BatchTableEnvironment.getTableEnvironment(env);

        /*
         * Build the two sample data sets; in production read them from whatever source is needed.
         */
        // sources that only carry the unique (join) key
        DataSource<String> uniqueA = env.fromElements("orderId_1_f113", "orderId_2_f000", "orderId_3_f115");
        DataSource<String> uniqueB = env.fromElements("orderId_2_f000", "orderId_3_f115", "orderId_4_f114");
        // sources that carry the unique key plus the compare field
        DataSource<String> compareA = env.fromElements("orderId_1_f113:payment_1", "orderId_2_f000:payment_2", "orderId_3_f115:payment_33");
        DataSource<String> compareB = env.fromElements("orderId_2_f000:payment_2", "orderId_3_f115:payment_333", "orderId_4_f114:payment_4");

        // convert to tables
        Table uniqueTableA = tableEnvironment.fromDataSet(uniqueA);
        Table uniqueTableB = tableEnvironment.fromDataSet(uniqueB);
        Table compareTableA = tableEnvironment.fromDataSet(compareA);
        Table compareTableB = tableEnvironment.fromDataSet(compareB);

        /*
         * Core reconciliation logic. minus (not minusAll) keeps every step on set semantics,
         * so duplicated input records cannot leak into F113/F114/F115.
         */
        Table f113Table = uniqueTableA.minus(uniqueTableB);
        Table f114Table = uniqueTableB.minus(uniqueTableA);
        Table f000AndF115Table = uniqueTableA.intersect(uniqueTableB);
        Table f113AndF115CompareTable = compareTableA.minus(compareTableB);
        // strip the compare field, keep only the unique key
        Table f113AndF115Table = convert(tableEnvironment, f113AndF115CompareTable);
        Table f115Table = f113AndF115Table.minus(f113Table);
        Table f000Table = f000AndF115Table.minus(f115Table);

        DataSet<String> f000 = tableEnvironment.toDataSet(f000Table, String.class);
        DataSet<String> f113 = tableEnvironment.toDataSet(f113Table, String.class);
        DataSet<String> f114 = tableEnvironment.toDataSet(f114Table, String.class);
        DataSet<String> f115 = tableEnvironment.toDataSet(f115Table, String.class);

        /*
         * Output; in production write to whatever sink is needed.
         */
        List<String> f000List = f000.collect();
        List<String> f113List = f113.collect();
        List<String> f114List = f114.collect();
        List<String> f115List = f115.collect();
        System.out.println("==============================");
        System.out.println("f000 ->" + f000List);
        System.out.println("==============================");
        System.out.println("f113 ->" + f113List);
        System.out.println("==============================");
        System.out.println("f114 ->" + f114List);
        System.out.println("==============================");
        System.out.println("f115 ->" + f115List);
    }

    /**
     * Reduces a table of "key:compareField" records to a table that holds only the keys.
     *
     * @param tableEnvironment table environment used to convert between Table and DataSet
     * @param inputTable single-column table of "key:compareField" strings
     * @return single-column table with the key part of every input record
     */
    private static Table convert(BatchTableEnvironment tableEnvironment, Table inputTable) {
        DataSet<String> compareRecords = tableEnvironment.toDataSet(inputTable, String.class);
        MapOperator<String, String> keys = compareRecords.map(record -> extractKey(record));
        return tableEnvironment.fromDataSet(keys);
    }

    /**
     * Returns the part of {@code record} before the first separator, or the whole record
     * when it contains no separator.
     *
     * <p>Unlike {@code record.split(":")[0]} this does not throw for a record that consists
     * only of separators (where {@code split} returns an empty array) and does not compile
     * a regular expression per record.
     *
     * @param record a "key:compareField" record
     * @return the key part of the record
     */
    private static String extractKey(String record) {
        int separatorIndex = record.indexOf(FIELD_SEPARATOR);
        return separatorIndex < 0 ? record : record.substring(0, separatorIndex);
    }
}
中间的处理逻辑已在代码的注释中说明清楚了。
源码
总结
如果需要识别两边都有数据(订单号相同)但存在差异的情况,处理的步骤会多一些。
如果你有更好的想法,欢迎留言,多多指教。
转载请注明出处
网友评论