[2019-05-08] MapReduce Types and Formats

By BigBigFlower | Published 2019-05-08 14:10

    (1) MapReduce types
    map: (K1, V1) -> list(K2, V2)
    combiner: (K2, list(V2)) -> list(K2, V2)
    reduce: (K2, list(V2)) -> list(K3, V3)

    Type settings in the MapReduce API; the separator properties used by Streaming; the InputFormat class hierarchy. (A minimal sketch of the API type settings follows below.)
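
    The type parameters in these signatures correspond directly to the generic parameters of Mapper and Reducer and to the job's type settings. Below is a minimal, hypothetical sketch (MyMapper, MyReducer, and conf are placeholders, not classes or variables from this article) showing where each of K1/V1, K2/V2, and K3/V3 is declared:

    Job job = Job.getInstance(conf, "type-demo");
    job.setMapperClass(MyMapper.class);             // Mapper<K1, V1, K2, V2>
    job.setCombinerClass(MyReducer.class);          // combiner runs as a Reducer<K2, V2, K2, V2>
    job.setReducerClass(MyReducer.class);           // Reducer<K2, V2, K3, V3>
    job.setMapOutputKeyClass(Text.class);           // K2 (only needed if it differs from K3)
    job.setMapOutputValueClass(IntWritable.class);  // V2 (only needed if it differs from V3)
    job.setOutputKeyClass(Text.class);              // K3
    job.setOutputValueClass(IntWritable.class);     // V3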

    Some applications do not want their input files to be split; instead, each input file should be processed in its entirety by a single mapper. To prevent splitting, extend a concrete subclass of FileInputFormat and override isSplitable to return false.

    // == NonSplittableTextInputFormat
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapreduce.JobContext;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    
    public class NonSplittableTextInputFormat extends TextInputFormat {
      @Override
      protected boolean isSplitable(JobContext context, Path file) {
        return false;
      }
    }
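
    To use it, a job sets it as the input format in place of TextInputFormat (a one-line sketch, assuming job is an already configured Job instance):

    // Each input file now goes, unsplit, to exactly one mapper.
    job.setInputFormatClass(NonSplittableTextInputFormat.class);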
    
    

    Processing a whole file as a single record

    // cc WholeFileInputFormat An InputFormat for reading a whole file as a record
    import java.io.IOException;
    import org.apache.hadoop.fs.*;
    import org.apache.hadoop.io.*;
    import org.apache.hadoop.mapreduce.InputSplit;
    import org.apache.hadoop.mapreduce.JobContext;
    import org.apache.hadoop.mapreduce.RecordReader;
    import org.apache.hadoop.mapreduce.TaskAttemptContext;
    import org.apache.hadoop.mapreduce.lib.input.*;
    
    //vv WholeFileInputFormat
    public class WholeFileInputFormat
        extends FileInputFormat<NullWritable, BytesWritable> {
      
      @Override
      protected boolean isSplitable(JobContext context, Path file) {
        return false;
      }
    
      @Override
      public RecordReader<NullWritable, BytesWritable> createRecordReader(
          InputSplit split, TaskAttemptContext context) throws IOException,
          InterruptedException {
        WholeFileRecordReader reader = new WholeFileRecordReader();
        reader.initialize(split, context);
        return reader;
      }
    }
    //^^ WholeFileInputFormat
    
    

    A RecordReader that treats the whole file as a single record

    
    // cc WholeFileRecordReader The RecordReader used by WholeFileInputFormat for reading a whole file as a record
    import java.io.IOException;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FSDataInputStream;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.BytesWritable;
    import org.apache.hadoop.io.IOUtils;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.mapreduce.InputSplit;
    import org.apache.hadoop.mapreduce.RecordReader;
    import org.apache.hadoop.mapreduce.TaskAttemptContext;
    import org.apache.hadoop.mapreduce.lib.input.FileSplit;
    
    //vv WholeFileRecordReader
    class WholeFileRecordReader extends RecordReader<NullWritable, BytesWritable> {
      
      private FileSplit fileSplit;
      private Configuration conf;
      private BytesWritable value = new BytesWritable();
      private boolean processed = false;
    
      @Override
      public void initialize(InputSplit split, TaskAttemptContext context)
          throws IOException, InterruptedException {
        this.fileSplit = (FileSplit) split;
        this.conf = context.getConfiguration();
      }
      
      @Override
      public boolean nextKeyValue() throws IOException, InterruptedException {
        if (!processed) {
          byte[] contents = new byte[(int) fileSplit.getLength()];
          Path file = fileSplit.getPath();
          FileSystem fs = file.getFileSystem(conf);
          FSDataInputStream in = null;
          try {
            in = fs.open(file);
            IOUtils.readFully(in, contents, 0, contents.length);
            value.set(contents, 0, contents.length);
          } finally {
            IOUtils.closeStream(in);
          }
          processed = true;
          return true;
        }
        return false;
      }
      
      @Override
      public NullWritable getCurrentKey() throws IOException, InterruptedException {
        return NullWritable.get();
      }
    
      @Override
      public BytesWritable getCurrentValue() throws IOException,
          InterruptedException {
        return value;
      }
    
      @Override
      public float getProgress() throws IOException {
        return processed ? 1.0f : 0.0f;
      }
    
      @Override
      public void close() throws IOException {
        // do nothing
      }
    }
    //^^ WholeFileRecordReader
    
    

    A MapReduce program that packages a collection of small files into a single SequenceFile

    // cc SmallFilesToSequenceFileConverter A MapReduce program for packaging a collection of small files as a single SequenceFile
    import java.io.IOException;
    
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.BytesWritable;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.InputSplit;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.lib.input.FileSplit;
    import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;
    
    //vv SmallFilesToSequenceFileConverter
    public class SmallFilesToSequenceFileConverter extends Configured
        implements Tool {
      
      static class SequenceFileMapper
          extends Mapper<NullWritable, BytesWritable, Text, BytesWritable> {
        
        private Text filenameKey;
        
        @Override
        protected void setup(Context context) throws IOException,
            InterruptedException {
          InputSplit split = context.getInputSplit();
          Path path = ((FileSplit) split).getPath();
          filenameKey = new Text(path.toString());
        }
        
        @Override
        protected void map(NullWritable key, BytesWritable value, Context context)
            throws IOException, InterruptedException {
          context.write(filenameKey, value);
        }
        
      }
    
      @Override
      public int run(String[] args) throws Exception {
        Job job = JobBuilder.parseInputAndOutput(this, getConf(), args);
        if (job == null) {
          return -1;
        }
        
        job.setInputFormatClass(WholeFileInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);
    
        job.setMapperClass(SequenceFileMapper.class);
    
        return job.waitForCompletion(true) ? 0 : 1;
      }
      
      public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new SmallFilesToSequenceFileConverter(), args);
        System.exit(exitCode);
      }
    }
    // ^^ SmallFilesToSequenceFileConverter
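
    One way to sanity-check the result is to read the packed SequenceFile back and list the original file names (the keys) and sizes (the values). This is a hypothetical helper, not part of the original program; the class name ListPackedFiles and the path "output/part-r-00000" are assumed examples:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.BytesWritable;
    import org.apache.hadoop.io.SequenceFile;
    import org.apache.hadoop.io.Text;

    public class ListPackedFiles {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path seqFile = new Path("output/part-r-00000"); // assumed reducer output file
        try (SequenceFile.Reader reader =
                new SequenceFile.Reader(conf, SequenceFile.Reader.file(seqFile))) {
          Text key = new Text();                     // original file path
          BytesWritable value = new BytesWritable(); // original file contents
          while (reader.next(key, value)) {
            System.out.printf("%s\t%d bytes%n", key, value.getLength());
          }
        }
      }
    }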
    
    The OutputFormat class hierarchy

    Partitioning data
    The MultipleOutputs class lets you write data to multiple files whose names are derived from the output keys and values, or from an arbitrary string.

    // == PartitionByStationYearUsingMultipleOutputs
    // Using MultipleOutputs to partition the data by weather station
    import java.io.IOException;
    
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;
    
    public class PartitionByStationYearUsingMultipleOutputs extends Configured
     implements Tool {
     
     static class StationMapper
       extends Mapper<LongWritable, Text, Text, Text> {
     
       private NcdcRecordParser parser = new NcdcRecordParser();
       
       @Override
       protected void map(LongWritable key, Text value, Context context)
           throws IOException, InterruptedException {
         parser.parse(value);
         context.write(new Text(parser.getStationId()), value);
       }
     }
     
     static class MultipleOutputsReducer
       extends Reducer<Text, Text, NullWritable, Text> {
       
       private MultipleOutputs<NullWritable, Text> multipleOutputs;
       private NcdcRecordParser parser = new NcdcRecordParser();
    
       @Override
       protected void setup(Context context)
           throws IOException, InterruptedException {
         multipleOutputs = new MultipleOutputs<NullWritable, Text>(context);
       }
    
    // vv PartitionByStationYearUsingMultipleOutputs
       @Override
       protected void reduce(Text key, Iterable<Text> values, Context context)
           throws IOException, InterruptedException {
         for (Text value : values) {
           parser.parse(value);
           String basePath = String.format("%s/%s/part",
               parser.getStationId(), parser.getYear());
           multipleOutputs.write(NullWritable.get(), value, basePath);
           // The base path passed to MultipleOutputs.write() is interpreted relative to the
           // job's output directory; because it may contain file path separators (/),
           // subdirectories of arbitrary depth can be created.
         }
       }
    // ^^ PartitionByStationYearUsingMultipleOutputs
       
       @Override
       protected void cleanup(Context context)
           throws IOException, InterruptedException {
         multipleOutputs.close();
       }
     }
    
     @Override
     public int run(String[] args) throws Exception {
       Job job = JobBuilder.parseInputAndOutput(this, getConf(), args);
       if (job == null) {
         return -1;
       }
       
       job.setMapperClass(StationMapper.class);
       job.setMapOutputKeyClass(Text.class);
       job.setReducerClass(MultipleOutputsReducer.class);
       job.setOutputKeyClass(NullWritable.class);
    
       return job.waitForCompletion(true) ? 0 : 1;
     }
     public static void main(String[] args) throws Exception {
       int exitCode = ToolRunner.run(new PartitionByStationYearUsingMultipleOutputs(),
           args);
       System.exit(exitCode);
     }
    }
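
    With a base path like the one built above, the reducer's output files land under the job's output directory in per-station, per-year subdirectories; MultipleOutputs appends the usual part suffix, giving names such as 029070-99999/1901/part-r-00000 (the station ID and year here are only illustrative).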
    
    
