The Running Elephant: Hadoop WordCount Source Code

Author: GarfieldEr007 | Published 2015-11-18 12:18

    This article presents two versions of the WordCount source code: the old version, written against the org.apache.hadoop.mapred API (interfaces with OutputCollector and Reporter, configured via JobConf/JobClient), and the new version, written against the org.apache.hadoop.mapreduce API (abstract classes with a Context object, configured via Job).


    WordCount source code, old version (mapred API)

        //package org.apache.hadoop.examples;
        import java.io.IOException;
        import java.util.Iterator;
        import java.util.StringTokenizer;
        import org.apache.hadoop.fs.Path;
        import org.apache.hadoop.io.IntWritable;
        import org.apache.hadoop.io.LongWritable;
        import org.apache.hadoop.io.Text;
        import org.apache.hadoop.mapred.FileInputFormat;
        import org.apache.hadoop.mapred.FileOutputFormat;
        import org.apache.hadoop.mapred.JobClient;
        import org.apache.hadoop.mapred.JobConf;
        import org.apache.hadoop.mapred.MapReduceBase;
        import org.apache.hadoop.mapred.Mapper;
        import org.apache.hadoop.mapred.OutputCollector;
        import org.apache.hadoop.mapred.Reducer;
        import org.apache.hadoop.mapred.Reporter;
        import org.apache.hadoop.mapred.TextInputFormat;
        import org.apache.hadoop.mapred.TextOutputFormat;
        
        public class WordCount {
            public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
                private final static IntWritable one = new IntWritable(1);
                private Text word = new Text();
                public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
                    String line = value.toString();
                    StringTokenizer tokenizer = new StringTokenizer(line);
                    while (tokenizer.hasMoreTokens()) {
                        word.set(tokenizer.nextToken());
                        output.collect(word, one);
                    } //while
                } //map()
            } //static class Map
        
            public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
                public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
                    int sum = 0;
                    while (values.hasNext()) {
                        sum += values.next().get();
                    }
                    output.collect(key, new IntWritable(sum));
                } //reduce()
            } //static class Reduce
            public static void main(String[] args) throws Exception {
                JobConf conf = new JobConf(WordCount.class);
                conf.setJobName("wordcount");
                conf.setOutputKeyClass(Text.class);
                conf.setOutputValueClass(IntWritable.class);
                conf.setMapperClass(Map.class);
                conf.setCombinerClass(Reduce.class);
                conf.setReducerClass(Reduce.class);
                conf.setInputFormat(TextInputFormat.class);
                conf.setOutputFormat(TextOutputFormat.class);
                FileInputFormat.setInputPaths(conf, new Path(args[0]));
                FileOutputFormat.setOutputPath(conf, new Path(args[1]));
                // Submit the job and block until it finishes.
                JobClient.runJob(conf);
            } //main()
        } //class WordCount
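
    To see concretely what the job computes, consider a hypothetical two-line input file. Each map() call tokenizes one line on whitespace and emits a (word, 1) pair per token; the framework groups the pairs by key, and reduce() sums the counts for each word:

        Input file:
            Hello World
            Hello Hadoop

        Map output (one (word, 1) pair per token):
            (Hello, 1) (World, 1)
            (Hello, 1) (Hadoop, 1)

        Reduce output (counts summed per key, emitted in sorted key order):
            Hadoop  1
            Hello   2
            World   1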

    WordCount source code, new version (mapreduce API)

        //package org.apache.hadoop.examples;
        import java.io.IOException;
        import java.util.StringTokenizer;
        import org.apache.hadoop.conf.Configuration;
        import org.apache.hadoop.fs.Path;
        import org.apache.hadoop.io.IntWritable;
        import org.apache.hadoop.io.Text;
        import org.apache.hadoop.mapreduce.Job;
        import org.apache.hadoop.mapreduce.Mapper;
        import org.apache.hadoop.mapreduce.Reducer;
        import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
        import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
        import org.apache.hadoop.util.GenericOptionsParser;
        public class WordCount {
            public static class TokenizerMapper
                    extends Mapper<Object, Text, Text, IntWritable>{
                    private final static IntWritable one = new IntWritable(1);
                    private Text word = new Text();
                    public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
                        StringTokenizer itr = new StringTokenizer(value.toString());
                        while (itr.hasMoreTokens()) {
                            word.set(itr.nextToken());
                            context.write(word, one);
                        } //while
                    } //map()
            } //static class TokenizerMapper
            public static class IntSumReducer extends Reducer<Text,IntWritable,Text,IntWritable> {
                private IntWritable result = new IntWritable();
                public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
                    int sum = 0;
                    for (IntWritable val : values) {
                        sum += val.get();
                    }
                    result.set(sum);
                    context.write(key, result);
                } //reduce
            } //static class IntSumReducer
            public static void main(String[] args) throws Exception {
                Configuration conf = new Configuration();
                String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
                if (otherArgs.length != 2) {
                    System.err.println("Usage: wordcount <in> <out>");
                    System.exit(2);
                }
                Job job = new Job(conf, "word count");
                job.setJarByClass(WordCount.class);
                job.setMapperClass(TokenizerMapper.class);
                job.setCombinerClass(IntSumReducer.class);
                job.setReducerClass(IntSumReducer.class);
                job.setOutputKeyClass(Text.class);
                job.setOutputValueClass(IntWritable.class);
                FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
                FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
                System.exit(job.waitForCompletion(true) ? 0 : 1);
            } //main()
        } //class WordCount
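
    On a cluster, this class is normally packaged into a jar and submitted with the hadoop jar command, with the input and output paths passed as arguments. For a quick smoke test without a cluster, the sketch below relies on the fact that a Hadoop client with no cluster configuration files on its classpath falls back to the local job runner and the local file system; the class name and directory names here are hypothetical.

        // Minimal local-mode sketch, assuming the Hadoop client libraries are on
        // the classpath and no cluster configuration files are present, so the
        // job runs in-process against the local file system.
        public class WordCountLocalDemo {
            public static void main(String[] args) throws Exception {
                // "input" must exist and contain text files; "output" must not
                // exist yet, or the job fails. Both directory names are
                // hypothetical examples.
                WordCount.main(new String[] { "input", "output" });
            }
        }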
    

    Reference: Apache Hadoop
