倒排索引案例（多Job串联）

作者: bullion | 来源:发表于2019-03-09 16:03 被阅读0次

倒排索引案例（多Job串联）
IMI 倒排多索引
Elasticsearch（一）：概念与基本API
ElasticSearch（基础）
ElasticSearch 倒排索引简析
搜索引擎索引-倒排索引
ElasticSearch知识库
Elasticsearch学习笔记(06) - 倒排索引简介
MapReduce 案例之倒排索引
MapReduce 案例之倒排索引

需求

需求分析

第一次处理

OneIndexMapper

/**
 * First-pass mapper of the inverted-index job chain.
 *
 * <p>For every non-empty, space-separated token of an input line it emits the
 * pair {@code ("<word>--<fileName>", 1)}, where {@code fileName} is the name of
 * the file backing the current input split.
 */
public class OneIndexMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    /** Name of the file behind the current input split; assigned in {@link #setup}. */
    String name;

    /** Reused output key, refilled for every token. */
    Text k = new Text();

    /** Constant output value: each occurrence counts as one. */
    IntWritable v = new IntWritable(1);

    /**
     * Records the name of the file this task is reading.
     *
     * @param context task context; its input split is cast to {@code FileSplit}
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Resolve the file name once per task instead of once per record.
        FileSplit inputSplit = (FileSplit) context.getInputSplit();
        name = inputSplit.getPath().getName();
    }

    /**
     * Emits one {@code ("<word>--<fileName>", 1)} pair per word of the line.
     *
     * @param key     input key supplied by the framework (not used)
     * @param value   one line of input text
     * @param context sink for the emitted pairs
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Read the line.
        String line = value.toString();

        // 2. Split on single spaces.
        String[] fields = line.split(" ");

        // 3. Emit one pair per word.
        for (String word : fields) {
            // Blank lines and repeated/leading spaces produce empty tokens;
            // without this guard they would be indexed as the key "--<fileName>".
            if (word.isEmpty()) {
                continue;
            }
            k.set(word + "--" + name);
            context.write(k, v);
        }
    }
}

OneIndexReducer

/**
 * First-pass reducer: adds up all counts received for one key and writes the
 * key together with the total.
 */
public class OneIndexReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * Sums the values of a single key and emits {@code (key, total)}.
     *
     * @param key     the key being reduced; written back unchanged
     * @param values  the counts collected for {@code key}
     * @param context sink for the emitted pair
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Accumulate every partial count for this key.
        int total = 0;
        for (IntWritable count : values) {
            total += count.get();
        }

        // Emit the key with its aggregated count.
        context.write(key, new IntWritable(total));
    }
}

OneIndexDriver

/**
 * Driver for the first job of the inverted-index chain: wires
 * {@code OneIndexMapper} and {@code OneIndexReducer} into a job and runs it.
 */
public class OneIndexDriver {

    /**
     * Configures and runs the job, then exits with status 0 on success and 1 on failure.
     *
     * @param args {@code args[0]} is the input path and {@code args[1]} the output path;
     *             when fewer than two arguments are given, the sample paths
     *             {@code e:/input/inputoneindex} and {@code e:/output5} are used
     * @throws Exception if job setup or execution fails
     */
    public static void main(String[] args) throws Exception {
        // Only fall back to the sample paths when the caller did not supply both;
        // previously the command-line arguments were always overwritten.
        if (args == null || args.length < 2) {
            args = new String[]{"e:/input/inputoneindex", "e:/output5"};
        }

        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        // Jar, mapper and reducer.
        job.setJarByClass(OneIndexDriver.class);
        job.setMapperClass(OneIndexMapper.class);
        job.setReducerClass(OneIndexReducer.class);

        // Key/value types emitted by the mapper.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Key/value types of the final output.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Input and output locations.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Submit and wait for completion.
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}

第二次处理

TwoIndexMapper

/**
 * Second-pass mapper of the inverted-index job chain.
 *
 * <p>Splits each input line at the {@code "--"} separator and emits the part
 * before it as the key and the part after it as the value. Lines are presumably
 * the first job's output in the form {@code word--fileName<TAB>count} — confirm
 * against the actual input.
 */
public class TwoIndexMapper extends Mapper<LongWritable, Text, Text, Text> {

    /** Separator placed between word and file name by the first-pass mapper. */
    private static final String SEPARATOR = "--";

    /** Reused output key. */
    Text k = new Text();

    /** Reused output value. */
    Text v = new Text();

    /**
     * Emits {@code (textBeforeSeparator, textAfterSeparator)} for one line.
     * Lines that contain no separator are skipped.
     *
     * @param key     input key supplied by the framework (not used)
     * @param value   one line of input text
     * @param context sink for the emitted pair
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Read the line.
        String line = value.toString();

        // 2. Locate the separator. The LAST occurrence is used so that a word which
        //    itself contains "--" is kept whole instead of being truncated.
        int idx = line.lastIndexOf(SEPARATOR);
        if (idx < 0) {
            // Malformed or blank line: indexing fields[1] used to throw
            // ArrayIndexOutOfBoundsException here and fail the task.
            return;
        }

        // 3. Populate the reusable key and value.
        k.set(line.substring(0, idx));
        v.set(line.substring(idx + SEPARATOR.length()));

        // 4. Emit.
        context.write(k, v);
    }
}

TwoIndexReducer

/**
 * Second-pass reducer: joins all values of a key into one tab-separated string.
 */
public class TwoIndexReducer extends Reducer<Text, Text, Text, Text> {

    /** Reused output value. */
    Text v = new Text();

    /**
     * Concatenates every value of {@code key}, replacing each tab inside a value
     * with {@code "-->"} and appending a tab after each value, then emits the
     * result under the same key.
     *
     * @param key     the key being reduced; written back unchanged
     * @param values  the values collected for {@code key}
     * @param context sink for the emitted pair
     */
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // Build the joined string; every entry is followed by a tab.
        StringBuilder joined = new StringBuilder();
        for (Text entry : values) {
            String rewritten = entry.toString().replace("\t", "-->");
            joined.append(rewritten).append("\t");
        }

        // Emit the key with the concatenated entries.
        v.set(joined.toString());
        context.write(key, v);
    }
}

TwoIndexDriver

/**
 * Driver for the second job of the inverted-index chain: wires
 * {@code TwoIndexMapper} and {@code TwoIndexReducer} into a job and runs it.
 */
public class TwoIndexDriver {

    /**
     * Configures and runs the job, then exits with status 0 on success and 1 on failure.
     *
     * @param args {@code args[0]} is the input path and {@code args[1]} the output path;
     *             when fewer than two arguments are given, the sample paths
     *             {@code e:/input/inputtwoindex} and {@code e:/output6} are used
     * @throws Exception if job setup or execution fails
     */
    public static void main(String[] args) throws Exception {
        // Only fall back to the sample paths when the caller did not supply both;
        // previously the command-line arguments were always overwritten.
        if (args == null || args.length < 2) {
            args = new String[]{"e:/input/inputtwoindex", "e:/output6"};
        }

        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        // Jar, mapper and reducer.
        job.setJarByClass(TwoIndexDriver.class);
        job.setMapperClass(TwoIndexMapper.class);
        job.setReducerClass(TwoIndexReducer.class);

        // Key/value types emitted by the mapper.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // Key/value types of the final output.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Input and output locations.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Submit and wait for completion.
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}