美文网首页
MapReduce 基础 (三)分区

MapReduce 基础 (三)分区

作者: 做个合格的大厂程序员 | 来源:发表于2020-06-17 11:10 被阅读0次

    概念

    在 MapReduce 中, 通过我们指定分区, 会将同一个分区的数据发送到同一个 Reduce 当中进行处理

    例如: 为了数据的统计, 可以把一批类似的数据发送到同一个 Reduce 当中, 在同一个 Reduce 当中统计相同类型的数据, 就可以实现类似的数据分区和统计等

    其实就是相同类型的数据, 有共性的数据, 送到一起去处理

    Reduce 当中默认的分区只有一个

    image

    案例:
    将表中的数据分成两个分区:一组是中奖结果大于15的数据,另一组是中奖结果小于等于15的数据。

    PartitionReducer

    package Partitioner;
    
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;
    
    import java.io.IOException;
    
    /**
     * Pass-through reducer for the partitioning example.
     *
     * <p>For every reduce call the incoming key is written once, paired with the
     * {@link NullWritable} singleton. The grouped values are never iterated, so the
     * number of values attached to a key has no effect on the output.
     */
    public class PartitionReducer extends Reducer<Text, NullWritable, Text, NullWritable> {

        /**
         * Writes the key unchanged to the output.
         *
         * @param key     the grouped key to emit
         * @param values  values grouped under {@code key}; not read
         * @param context task context used to emit the output record
         * @throws IOException          if writing the record fails
         * @throws InterruptedException if the write is interrupted
         */
        @Override
        protected void reduce(Text key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            final NullWritable emptyValue = NullWritable.get();
            context.write(key, emptyValue);
        }
    }
    

    PartitionMapper

    package Partitioner;
    
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    
    import java.io.IOException;
    
    /**
     * Identity-style mapper for the partitioning example.
     *
     * <p>The input value is promoted to the output key and paired with the
     * {@link NullWritable} singleton; the input key is ignored.
     */
    public class PartitionMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

        /**
         * Emits the input value as the output key.
         *
         * @param key     input key; not read
         * @param value   input record, emitted unchanged as the output key
         * @param context task context used to emit the output record
         * @throws IOException          if writing the record fails
         * @throws InterruptedException if the write is interrupted
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            final NullWritable emptyValue = NullWritable.get();
            context.write(value, emptyValue);
        }
    }
    

    Partition

    package Partitioner;
    
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Partitioner;
    
    /**
     * Custom partitioner that splits records into two partitions according to the
     * integer found in the sixth tab-separated field of the key.
     *
     * <p>Records whose field value is strictly greater than {@link #THRESHOLD} go to
     * partition 1; all other records go to partition 0. Because partition index 1 can be
     * returned, the job must be configured with at least two reduce tasks.
     */
    public class PartitonerOwn extends Partitioner<Text, NullWritable> {

        /** Zero-based index of the tab-separated field that holds the winning-result value. */
        private static final int VALUE_FIELD_INDEX = 5;

        /** Values strictly greater than this go to partition 1; the rest go to partition 0. */
        private static final int THRESHOLD = 15;

        /**
         * Chooses the partition for one record.
         *
         * @param text         the map output key: one tab-separated line of input
         * @param nullWritable the map output value; not read
         * @param i            total number of partitions; not read
         * @return {@code 1} if the parsed field is greater than {@link #THRESHOLD}, otherwise {@code 0}
         * @throws ArrayIndexOutOfBoundsException if the line has fewer than six fields
         * @throws NumberFormatException          if the sixth field is not a valid integer
         */
        @Override
        public int getPartition(Text text, NullWritable nullWritable, int i) {
            // 1: split the line (K2) and pick out the winning-result field
            String[] fields = text.toString().split("\t");
            String numStr = fields[VALUE_FIELD_INDEX];

            // 2: compare the field against the threshold and return the partition number
            return Integer.parseInt(numStr) > THRESHOLD ? 1 : 0;
        }
    }
    

    main

    package Partitioner;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;
    
    import java.net.URI;
    
    /**
     * Driver for the partitioning example job.
     *
     * <p>Wires together {@code PartitionMapper}, {@code PartitonerOwn} and
     * {@code PartitionReducer}, runs the job with two reduce tasks, and writes the result
     * to {@link #OUTPUT_PATH}. Any existing output directory is removed before the job starts.
     */
    public class MainJob extends Configured implements Tool {

        /** HDFS location of the input data. */
        private static final String INPUT_PATH = "hdfs://node1:8020/input";

        /** HDFS location the job writes its results to. */
        private static final String OUTPUT_PATH = "hdfs://node1:8020/out/partition_out";

        /**
         * Configures and runs the job, blocking until it finishes.
         *
         * @param strings command-line arguments remaining after generic option parsing; not read
         * @return {@code 0} if the job succeeded, {@code 1} otherwise
         * @throws Exception if job setup, output cleanup, or execution fails
         */
        @Override
        public int run(String[] strings) throws Exception {

            // 1. Create the job object
            Job job = Job.getInstance(super.getConf(), "partition_mapreduce");

            // Let Hadoop locate the jar containing the job classes when running on a cluster
            job.setJarByClass(MainJob.class);

            // 2. Configure the job (steps numbered as in the standard eight-step outline)

            // 1) Input format and path
            job.setInputFormatClass(TextInputFormat.class);
            TextInputFormat.addInputPath(job, new Path(INPUT_PATH));

            // 2) Mapper class and its output types
            job.setMapperClass(PartitionMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(NullWritable.class);

            // 3) Partitioner class
            job.setPartitionerClass(PartitonerOwn.class);

            // 7) Reducer class and its output types
            job.setReducerClass(PartitionReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(NullWritable.class);

            // Number of reduce tasks; must cover every partition index the partitioner returns (0 and 1)
            job.setNumReduceTasks(2);

            // 8) Output format and path
            Path path = new Path(OUTPUT_PATH);
            job.setOutputFormatClass(TextOutputFormat.class);
            TextOutputFormat.setOutputPath(job, path);

            // Remove a stale output directory so the job does not fail on an existing path.
            // Use the tool's own configuration so options parsed by ToolRunner are honored.
            FileSystem fileSystem = FileSystem.get(new URI(OUTPUT_PATH), super.getConf());
            if (fileSystem.exists(path)) {
                fileSystem.delete(path, true);
            }

            // 3. Wait for the job to finish
            boolean succeeded = job.waitForCompletion(true);
            return succeeded ? 0 : 1;
        }

        /**
         * Entry point: runs the job through {@link ToolRunner} and exits with its status code.
         *
         * @param args command-line arguments passed to {@link ToolRunner}
         * @throws Exception if the job fails to run
         */
        public static void main(String[] args) throws Exception {
            Configuration configuration = new Configuration();

            // Launch the job
            int run = ToolRunner.run(configuration, new MainJob(), args);

            System.exit(run);
        }
    }
    

    相关文章

      网友评论

          本文标题:MapReduce 基础 (三)分区

          本文链接:https://www.haomeiwen.com/subject/krwjxktx.html