Writing a Custom Class for MapReduce
In some scenarios it is convenient to define our own class for the key or the value passed between map and reduce. Such a class must support serialization and deserialization, which means it has to implement the Writable interface.
Suppose we need to process a batch of records with three fields: the user's phone number (phone_num), the upstream traffic (upflow), and the downstream traffic (downflow), and for each phone number we want the total upflow, the total downflow, and their sum (upflow + downflow). A custom class makes this kind of job much easier to write:
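For example, with two made-up tab-separated input records for the same phone number, the job would produce one output line containing the summed upflow, downflow, and total:

13712345678	1116	954
13712345678	300	720

13712345678	1416	1674	3090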
package com.wenhuan.defineclass;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class FlowBean implements Writable {
    private int upflow;
    private int downflow;
    private int sumflow;

    public int getUpflow() {
        return upflow;
    }
    public void setUpflow(int upflow) {
        this.upflow = upflow;
    }
    public int getDownflow() {
        return downflow;
    }
    public void setDownflow(int downflow) {
        this.downflow = downflow;
    }
    public int getSumflow() {
        return sumflow;
    }
    public void setSumflow(int sumflow) {
        this.sumflow = sumflow;
    }

    @Override
    public String toString() {
        return upflow + "\t" + downflow + "\t" + sumflow;
    }

    public FlowBean() {
        super();
    }

    public FlowBean(int upflow, int downflow) {
        super();
        this.upflow = upflow;
        this.downflow = downflow;
        this.sumflow = this.upflow + this.downflow;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        // Serialization: write the fields in a fixed order
        out.writeInt(upflow);
        out.writeInt(downflow);
        out.writeInt(sumflow);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // Deserialization: read the fields back in the same order they were written
        this.upflow = in.readInt();
        this.downflow = in.readInt();
        this.sumflow = in.readInt();
    }
}
Writing the Mapper class
package com.wenhuan.defineclass;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MyMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] datas = value.toString().split("\t");
        String phoneNum = datas[0];
        // upflow is the second-to-last field and downflow the last field of the record
        FlowBean fb = new FlowBean(Integer.parseInt(datas[datas.length - 2]), Integer.parseInt(datas[datas.length - 1]));
        context.write(new Text(phoneNum), fb);
    }
}
Writing the Reducer class
package com.wenhuan.defineclass;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class MyReducer extends Reducer<Text, FlowBean, Text, FlowBean> {
    @Override
    protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {
        int sumupflow = 0;
        int sumdownflow = 0;
        for (FlowBean fb : values) {
            sumupflow += fb.getUpflow();
            sumdownflow += fb.getDownflow();
        }
        FlowBean fb1 = new FlowBean(sumupflow, sumdownflow);
        context.write(key, fb1);
    }
}
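The original does not show the driver for this job. The point worth noting is that FlowBean has to be registered as the map and reduce output value class; the following is a minimal sketch, assuming a class name FlowSumDriver and input/output paths taken from the command line:

package com.wenhuan.defineclass;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class FlowSumDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(FlowSumDriver.class);
        // Wire up the Mapper and Reducer defined above
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        // Both the map output and the final output are (Text, FlowBean),
        // so the custom class must be registered as the value class
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);
        // Hypothetical paths passed as program arguments
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}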
Sorting and Grouping in MapReduce
Between the map tasks and the reduce tasks, the framework by default sorts records by the map-side output key, in the key's dictionary order.
In principle, as long as the map-side KEYOUT type implements the WritableComparable interface, both sorting and grouping needs can be met; more precisely, the compareTo method of WritableComparable determines how records are sorted.
Defining the sort logic with a custom class
We rework the earlier FlowBean class into FlowBeanSort so that it can be sorted:
package com.wenhuan.defineclass;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class FlowBeanSort implements WritableComparable<FlowBeanSort> {
    private String phoneNum;
    private int upflow;
    private int downflow;
    private int sumflow;

    public String getPhoneNum() {
        return phoneNum;
    }
    public void setPhoneNum(String phoneNum) {
        this.phoneNum = phoneNum;
    }
    public int getUpflow() {
        return upflow;
    }
    public void setUpflow(int upflow) {
        this.upflow = upflow;
    }
    public int getDownflow() {
        return downflow;
    }
    public void setDownflow(int downflow) {
        this.downflow = downflow;
    }
    public int getSumflow() {
        return sumflow;
    }
    public void setSumflow(int sumflow) {
        this.sumflow = sumflow;
    }

    public FlowBeanSort() {
        super();
    }

    public FlowBeanSort(String phoneNum, int upflow, int downflow, int sumflow) {
        super();
        this.phoneNum = phoneNum;
        this.upflow = upflow;
        this.downflow = downflow;
        this.sumflow = sumflow;
    }

    @Override
    public String toString() {
        return phoneNum + "\t" + upflow + "\t" + downflow + "\t" + sumflow;
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.phoneNum = in.readUTF();
        this.upflow = in.readInt();
        this.downflow = in.readInt();
        this.sumflow = in.readInt();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(phoneNum);
        out.writeInt(upflow);
        out.writeInt(downflow);
        out.writeInt(sumflow);
    }

    @Override
    public int compareTo(FlowBeanSort o) {
        // Sort by total flow in descending order; Integer.compare avoids overflow issues
        return Integer.compare(o.getSumflow(), this.getSumflow());
    }
}
The corresponding Mapper and Reducer:
package com.wenhuan.defineclass;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class MyDefineSort {
    static class MyMapper extends Mapper<LongWritable, Text, FlowBeanSort, NullWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] infos = value.toString().split("\t");
            FlowBeanSort fb = new FlowBeanSort(infos[0], Integer.parseInt(infos[1].trim()), Integer.parseInt(infos[2].trim()), Integer.parseInt(infos[3].trim()));
            context.write(fb, NullWritable.get());
        }
    }

    static class MyReducer extends Reducer<FlowBeanSort, NullWritable, FlowBeanSort, NullWritable> {
        @Override
        protected void reduce(FlowBeanSort key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
            for (NullWritable nl : values) {
                context.write(key, NullWritable.get());
            }
        }
    }
    ...
}
Re-sorting with a chained MapReduce job
The compare method of the WritableComparator class, in turn, determines how records are grouped.
By default, compare simply delegates to the key's compareTo method, so grouping and sorting depend on the same fields: records for which compareTo returns 0 fall into the same group.
Some requirements, however, need the grouping fields and the sorting fields to be decoupled, that is, grouping and sorting should depend on different fields.
Take the simplest WordCount as an example. By default the final result is sorted by word in dictionary order. If we instead want to sort by count, we have to chain a second MapReduce job after the original WordCount program: in its map phase that job emits count as KEYOUT so the framework sorts on it, and in its reduce phase it puts word back into the key, producing the re-sorted output:
package com.wenhuan.wordcount;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MySort {
    static class MyMapper extends Mapper<LongWritable, Text, IntWritable, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] datas = value.toString().split("\t");
            String word = datas[0];
            int count = Integer.parseInt(datas[1]);
            context.write(new IntWritable(count), new Text(word));
        }
    }

    static class MyReducer extends Reducer<IntWritable, Text, Text, IntWritable> {
        @Override
        protected void reduce(IntWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            for (Text v : values) {
                context.write(v, key);
            }
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Load the configuration
        Configuration conf = new Configuration();
        // Create a Job that wraps the mapper and reducer
        Job job = Job.getInstance(conf);
        // Set the driver class; the program is packaged as a jar when it runs
        job.setJarByClass(MySort.class);
        // Set the Mapper and Reducer classes
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        // Set the mapper output types
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);
        // Set the reducer output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Set the input and output paths
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Submit the job and print progress logs
        job.waitForCompletion(true);
    }
}
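The driver above only runs the second (re-sorting) job, so its input path must point at the output directory of the original WordCount job. If both jobs are to be launched from a single driver, the second job can simply be started after the first one completes. Below is a minimal chaining sketch; ChainDriver is a hypothetical class name, and WordCountMapper/WordCountReducer are placeholder names standing in for the original WordCount classes, which are not shown in this article:

package com.wenhuan.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class ChainDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path input = new Path(args[0]);        // raw text input
        Path wcOutput = new Path(args[1]);     // intermediate "word\tcount" output
        Path sortedOutput = new Path(args[2]); // final output sorted by count

        // Job 1: the original WordCount (mapper/reducer names are placeholders)
        Job wcJob = Job.getInstance(conf, "wordcount");
        wcJob.setJarByClass(ChainDriver.class);
        wcJob.setMapperClass(WordCountMapper.class);
        wcJob.setReducerClass(WordCountReducer.class);
        wcJob.setOutputKeyClass(Text.class);
        wcJob.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(wcJob, input);
        FileOutputFormat.setOutputPath(wcJob, wcOutput);

        // Job 2 starts only if WordCount finished successfully
        if (wcJob.waitForCompletion(true)) {
            Job sortJob = Job.getInstance(conf, "sort-by-count");
            sortJob.setJarByClass(MySort.class);
            sortJob.setMapperClass(MySort.MyMapper.class);
            sortJob.setReducerClass(MySort.MyReducer.class);
            sortJob.setMapOutputKeyClass(IntWritable.class);
            sortJob.setMapOutputValueClass(Text.class);
            sortJob.setOutputKeyClass(Text.class);
            sortJob.setOutputValueClass(IntWritable.class);
            FileInputFormat.addInputPath(sortJob, wcOutput);
            FileOutputFormat.setOutputPath(sortJob, sortedOutput);
            System.exit(sortJob.waitForCompletion(true) ? 0 : 1);
        }
    }
}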
Implementing re-sorting with a custom comparator class
Alternatively, we can define a class that extends WritableComparator and override its compare method (while the key class still defines its own compareTo), which likewise lets the sorting logic and the grouping logic be kept separate.
Suppose we have the following data:
math, huangxiaoming,85,87,86,75,88,94
english, huangdatou, 48,58,98,56,73,75
The fields are, in order: course, student name, and the scores of the individual tests.
For every course we want to output the record of the student with the highest average score. This is a group-by-maximum problem with two steps: grouping and sorting.
However, the records must be grouped by the course field while being sorted by course plus average score, so the grouping logic and the sorting logic have to be separated.
Note that in the shuffle, sorting happens before grouping, so when we need to group first and then sort within each group, the grouping field must also take part in the sort, as the leading sort field. A minimal sketch of this approach is given below.
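The original stops before showing code for this approach, so the following sketch is my own illustration under the assumptions above: the class names ScoreBean and CourseGroupingComparator, the field names, and the use of a precomputed average score are all hypothetical choices, not taken from the source.

package com.wenhuan.groupsort;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

// Composite key: sorting uses course + average score, grouping uses course only.
public class ScoreBean implements WritableComparable<ScoreBean> {
    private String course;
    private String name;
    private double avgScore;

    public ScoreBean() {}

    public ScoreBean(String course, String name, double avgScore) {
        this.course = course;
        this.name = name;
        this.avgScore = avgScore;
    }

    public String getCourse() { return course; }
    public double getAvgScore() { return avgScore; }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(course);
        out.writeUTF(name);
        out.writeDouble(avgScore);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.course = in.readUTF();
        this.name = in.readUTF();
        this.avgScore = in.readDouble();
    }

    @Override
    public int compareTo(ScoreBean o) {
        // Sort logic: the grouping field (course) leads, then average score descending,
        // so records of the same course are adjacent and the best student comes first.
        int byCourse = this.course.compareTo(o.course);
        if (byCourse != 0) {
            return byCourse;
        }
        return Double.compare(o.avgScore, this.avgScore);
    }

    @Override
    public String toString() {
        return course + "\t" + name + "\t" + avgScore;
    }

    // Grouping logic: two keys belong to the same reduce group when their course matches,
    // regardless of the average score used for sorting.
    public static class CourseGroupingComparator extends WritableComparator {
        public CourseGroupingComparator() {
            super(ScoreBean.class, true); // true: create key instances for comparison
        }

        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            ScoreBean x = (ScoreBean) a;
            ScoreBean y = (ScoreBean) b;
            return x.getCourse().compareTo(y.getCourse());
        }
    }
}

In the driver, the comparator would be registered with job.setGroupingComparatorClass(ScoreBean.CourseGroupingComparator.class). The mapper computes the average of the score fields and emits a ScoreBean as the key (with NullWritable as the value); the reducer then only needs to write the first key of each group, which, thanks to the sort order above, is the student with the highest average for that course.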