-
基于IntelliJ IDEA开发工具,简单配置如下:
1.点击File->New->Project,在弹出的对话框中选择Maven,JDK选择你自己安装的版本,点击Next
B115D533-4D48-4B3C-B1B3-5DB8BE627299.png
2.填写Maven的GroupId和ArtifactId
3.打开Intellij的Preference偏好设置,定位到Build, Execution, Deployment->Compiler->Java Compiler,将WordCount的Target bytecode version修改为你的jdk版本(我的是1.8)
01EB98A1-C0BD-4611-B603-8038077A62B9.png
4.配置依赖
找到pom.xml配置文件,配置如下:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>hadoop</groupId>
<artifactId>com.hadoop</artifactId>
<version>1.0-SNAPSHOT</version>
<repositories>
<repository>
<id>apache</id>
<!-- Must be an actual Maven repository URL; http://maven.apache.org is the
     project website and cannot serve artifacts. -->
<url>https://repo.maven.apache.org/maven2</url>
</repository>
</repositories>
<dependencies>
<!-- NOTE: the legacy hadoop-core 1.2.1 dependency was removed. It is the
     Hadoop 1.x artifact and conflicts on the classpath with the 2.7.2
     artifacts below; hadoop-client already provides the MapReduce API
     (org.apache.hadoop.mapreduce.*) used by this project. -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.7.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.7.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.7.2</version>
</dependency>
<!-- NOTE(review): the log4j.properties used by this tutorial is Log4j 1.x
     syntax, which Hadoop's own transitive log4j 1.x handles; log4j-core 2.x
     does not read that file format — confirm this dependency is needed. -->
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>2.8.2</version>
</dependency>
<!-- Pin a concrete version instead of the floating "RELEASE" marker
     (non-reproducible builds); JUnit belongs in the test scope. -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.13.2</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<artifactId>maven-dependency-plugin</artifactId>
<configuration>
<excludeTransitive>false</excludeTransitive>
<stripVersion>true</stripVersion>
<outputDirectory>./lib</outputDirectory>
</configuration>
</plugin>
</plugins>
</build>
</project>
5.在 src/main/resources 目录下,创建一个文件: log4j.properties
# Root logger: INFO level, writing to the "stdout" appender only.
log4j.rootLogger=INFO, stdout
# Console appender with a timestamp/level/category pattern.
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d %p [%c] - %m%n
# File appender writing to target/spring.log.
# NOTE(review): "logfile" is defined but never attached to any logger above,
# so nothing is written to it unless a logger references it — confirm intent.
log4j.appender.logfile=org.apache.log4j.FileAppender
log4j.appender.logfile.File=target/spring.log
log4j.appender.logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.logfile.layout.ConversionPattern=%d %p [%c] - %m%n
6.编写程序代码
6.1> 项目的目录结构如图:
51DB39BC-13B1-49C9-BB61-8E667DC41B09.png
6.2> WordCountMapper
文件代码如下:
package come.hadoop.mr.wordcount;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Map phase of WordCount.
 *
 * Type parameters (KEYIN, VALUEIN, KEYOUT, VALUEOUT):
 *   LongWritable - byte offset of the input line,
 *   Text         - the line itself,
 *   Text         - a single word,
 *   IntWritable  - the count 1 for each occurrence.
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
// Reused across map() calls to avoid allocating a new object per record.
Text k = new Text();
IntWritable v = new IntWritable(1);
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
//1. Get one input line.
String line = value.toString();
//2. Split on runs of whitespace. The original split(" ") produced empty
//   tokens for consecutive/leading spaces, which were then counted as a
//   bogus "" word; \s+ also tolerates tabs.
String[] words = line.split("\\s+");
//3. Emit (word, 1) for each non-empty token.
for (String word: words){
if (word.isEmpty()) {
continue; // a leading separator still yields one empty first token
}
k.set(word);
context.write(k, v);
}
}
}
6.3>WordCountReducer
文件代码如下:
package come.hadoop.mr.wordcount;
//import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.io.Text;
import java.io.IOException;
/**
 * Reduce phase of WordCount: receives one key (a word) together with all the
 * 1-counts the mappers emitted for it, and writes (word, total occurrences).
 */
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
// Reused output value, refreshed for every key to avoid per-call allocation.
private final IntWritable total = new IntWritable();
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
// Accumulate the counts for this word.
int occurrences = 0;
for (IntWritable count : values) {
occurrences += count.get();
}
// Emit the aggregated pair, e.g. ("peter", 3).
total.set(occurrences);
context.write(key, total);
}
}
6.4>WordCountDriver
文件代码如下:
package come.hadoop.mr.wordcount;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * Driver that wires the WordCount mapper and reducer into a MapReduce Job.
 *
 * Usage: WordCountDriver &lt;input path&gt; &lt;output path&gt;
 * The output directory must not exist before the job runs.
 */
public class WordCountDriver {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
// Fail fast with a usage message instead of an ArrayIndexOutOfBoundsException
// when the input/output paths are missing.
if (args.length < 2) {
System.err.println("Usage: WordCountDriver <input path> <output path>");
System.exit(2);
}
//1. Create the Job from a fresh Configuration.
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
//2. Locate the jar to ship to the cluster via this driver class.
job.setJarByClass(WordCountDriver.class);
//3. Attach the Mapper and Reducer implementations.
job.setMapperClass(WordCountMapper.class);
job.setReducerClass(WordCountReducer.class);
//4. Key/value types produced by the map phase.
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
//5. Key/value types of the final (reducer) output.
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
//6. Input and output paths come from the command line.
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
//7. Submit and block until completion; exit code reflects success.
boolean result = job.waitForCompletion(true);
System.exit(result ? 0 : 1);
}
}
7.添加文件内容(/Users/XXX/Desktop/word/wordcount.txt):
wudy peter
sunny timo peter
张三 wudy
奥巴马 习近平
peter jack
8.在IDEA的Application运行配置(Run Configuration)的Program arguments中,依次填入输入文件路径和输出文件路径,这两个参数会作为args传入main方法
注意:输出的文件夹(output)一开始应该是不存在的
9.运行WordCountDriver,输出结果为:
jack 1
peter 3
sunny 1
timo 1
wudy 2
习近平 1
奥巴马 1
张三 1
10.打包成jar文件
View -> Tool Windows -> Maven Projects, 出现以下界面:
11.打包完成后,在我们的项目下面会出现如下图,选择没有依赖的包(重命名为wc.jar)上传到服务器集群
0B1779D3-BB10-4132-AFE1-CE6836D3B109.png A9CF3514-55BB-4385-A1A5-21123C9B6065.png
12.在集群上运行我们测试的jar包
运行命令:hadoop jar wc.jar come.hadoop.mr.wordcount.WordCountDriver /user/wudy/input /user/wudy/output
如下图:
146A7532-3EAE-45B5-A66E-EBC5DF54071A.png
注意:
- come.hadoop.mr.wordcount.WordCountDriver 表示我们 主类的全类名
- /user/wudy/input表示我们hdfs的输入路径,如果不存在该目录,可通过下面命令创建
bin/hdfs dfs -mkdir -p /user/wudy/input
- /user/wudy/output表示我们hdfs的输出路径(如果该路径已经存在,应该先删除)
13.运行完毕,可以在 /user/wudy/output目录看到最终生成的文件
68E05B14-1FB3-4F1D-AC3D-0FF43D73B727.png 14C863AE-B98D-42D0-BD7C-E99AEDEAC1B5.png
点击下载,就能看到我们的计算结果了
网友评论