Here we again use wordcount as the example; in big data, wordcount is pretty much the hello world (laughs). As before, we'll walk through the code step by step.
Configure the connection.
Configuration conf = new Configuration();
conf.set("hbase.zookeeper.quorum", "node1,node2,node3");
conf.set("fs.defaultFS", "hdfs://node1:8020");//写你active的namenode名称
Create the Job.
Job job = Job.getInstance(conf);
job.setJarByClass(WCRunner.class);
Configure the MapReduce job.
job.setMapperClass(WCMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
// map-side configuration done
TableMapReduceUtil.initTableReducerJob("wc", WCReducer.class, job, null, null, null, null, false);
// 1st arg: the HBase table the results are written to; 2nd: the reducer class (? extends TableReducer); 3rd: the job; the remaining arguments can be passed as null; the last one (addDependencyJars) must be false
FileInputFormat.addInputPath(job, new Path("/usr/wc")); // input path: where the files are read from
// key and value types emitted on the reduce side
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(Put.class); // HBase's Put (org.apache.hadoop.hbase.client.Put)
// The two commented-out calls below control which sink the job writes to and which source it reads from.
// job.setOutputFormatClass(cls);
// job.setInputFormatClass(cls);
job.waitForCompletion(true);
Reference: https://blog.csdn.net/shudaqi2010/article/details/88653797
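For reference, the overload of TableMapReduceUtil.initTableReducerJob used above has the following signature in the HBase 1.x client (check it against the version you actually build with):
public static void initTableReducerJob(String table,
        Class<? extends TableReducer> reducer, Job job,
        Class partitioner, String quorumAddress, String serverClass,
        String serverImpl, boolean addDependencyJars) throws IOException
It sets TableOutputFormat as the job's output format and points it at the given table, which is why WCRunner does not need an explicit setOutputFormatClass call.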
Full WCRunner code:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
public class WCRunner {
    public static void main(String[] args) throws Exception {
        // configuration
        Configuration conf = new Configuration();
        conf.set("hbase.zookeeper.quorum", "node1,node2,node3");
        conf.set("fs.defaultFS", "hdfs://node1:8020");
        Job job = Job.getInstance(conf);
        job.setJarByClass(WCRunner.class);
        job.setMapperClass(WCMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        TableMapReduceUtil.initTableReducerJob("wc", WCReducer.class, job, null, null, null, null, false);
        FileInputFormat.addInputPath(job, new Path("/usr/wc"));
        // key and value types emitted on the reduce side
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Put.class);
        // job.setOutputFormatClass(cls);
        // job.setInputFormatClass(cls);
        job.waitForCompletion(true);
    }
}
Full WCMapper code:
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class WCMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] splits = value.toString().split(" ");
        // new StringTokenizer(value.toString(), " ") would work just as well; see the sketch after this class
        for (String string : splits) {
            context.write(new Text(string), new IntWritable(1));
        }
    }
}
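As the comment in the mapper notes, the line can also be split with java.util.StringTokenizer: add import java.util.StringTokenizer; to the imports and swap in the following map method. This is my own illustration, not from the original post:
@Override
protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
    StringTokenizer tokenizer = new StringTokenizer(value.toString(), " ");
    while (tokenizer.hasMoreTokens()) {
        // emit (word, 1) for every token, same as the split-based version
        context.write(new Text(tokenizer.nextToken()), new IntWritable(1));
    }
}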
Full WCReducer code:
import java.io.IOException;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
public class WCReducer extends TableReducer<Text, IntWritable, ImmutableBytesWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> iter, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable intWritable : iter) {
            sum += intWritable.get();
        }
        Put put = new Put(key.toString().getBytes()); // the word itself is the rowkey
        put.add("cf".getBytes(), "cf".getBytes(), String.valueOf(sum).getBytes()); // column family "cf", qualifier "cf", value = count
        context.write(null, put);
    }
}
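A side note: Put.add(family, qualifier, value) has been deprecated in newer HBase client versions in favour of addColumn. On a recent client the Put would typically be built like this (a sketch assuming org.apache.hadoop.hbase.util.Bytes is imported):
Put put = new Put(Bytes.toBytes(key.toString())); // the word is still the rowkey
put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("cf"), Bytes.toBytes(String.valueOf(sum))); // cf:cf = count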
Create the table. The target table must already exist in HBase before the job is submitted.
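The table name ("wc") and column family ("cf") are the ones used in the code above; the quickest way to create the table is create 'wc', 'cf' in the HBase shell. Below is a small Java sketch doing the same through the Admin API, assuming an HBase 1.x client (the class name CreateWcTable is my own, not from the original post):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
public class CreateWcTable {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "node1,node2,node3");
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Admin admin = connection.getAdmin()) {
            TableName name = TableName.valueOf("wc");
            if (!admin.tableExists(name)) {
                HTableDescriptor table = new HTableDescriptor(name);
                table.addFamily(new HColumnDescriptor("cf")); // single column family "cf"
                admin.createTable(table);
            }
        }
    }
}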
After the run, the word counts end up in the wc table.
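To verify the output, scan the wc table, either with scan 'wc' in the HBase shell or programmatically. A small sketch of the latter (the class name ScanWcTable is my own, not from the original post):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;
public class ScanWcTable {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "node1,node2,node3");
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Table table = connection.getTable(TableName.valueOf("wc"));
             ResultScanner scanner = table.getScanner(new Scan())) {
            for (Result result : scanner) {
                // the reducer stored the count as a String in cf:cf, with the word as the rowkey
                String word = Bytes.toString(result.getRow());
                String count = Bytes.toString(result.getValue(Bytes.toBytes("cf"), Bytes.toBytes("cf")));
                System.out.println(word + "\t" + count);
            }
        }
    }
}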