1.先到GitHub上下载著名的开源UserAgentParser项目,用maven打包
具体命令如下:mvn clean package -DskipTests,之后可以在项目的target目录下看到jar包。但是我们要把它安装到本地的maven仓库中,所以改用 mvn clean install -DskipTests
2.注册到maven中的log日志如下
QQ截图20171016200304.png
3.在本地maven项目的pom.xml中的配置
QQ截图20171016211548.png
4.基于HashMap在本地统计各浏览器数量的代码如下
/**
 * Local, single-JVM browser statistics over an access log, plus a small manual check of
 * the quote-position helper.
 */
public class realWorks {
    /** Parser applied to the user-agent portion of every log line. */
    UserAgentParser userAgentParser = new UserAgentParser();

    /**
     * Reads the access log line by line, parses the tail of each non-blank line (starting at
     * its 5th double quote) as a user agent, and prints one "browser count" line per browser.
     * Lines that contain fewer than five double quotes are skipped.
     *
     * @throws Exception if the log file cannot be opened or read
     */
    @Test
    public void testReadFile() throws Exception {
        // Location of the log file on the local C: drive
        String path = "C:/Users/Administrator/Desktop/Document/data source/access.log.10";
        Map<String, Integer> map = new HashMap<String, Integer>();
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(new File(path))));
        try {
            String line;
            // Stop as soon as readLine() reports end of file instead of looping once more on null.
            while ((line = reader.readLine()) != null) {
                if (!StringUtils.isNotBlank(line)) {
                    continue;
                }
                int start = getCharacterPosition(line, "\"", 5);
                if (start < 0) {
                    // Not enough quotes to locate the user-agent field: skip the line
                    // rather than aborting the whole run.
                    continue;
                }
                String source = line.substring(start);
                UserAgent agent = userAgentParser.parse(source);
                String browser = agent.getBrowser();
                Integer seen = map.get(browser);
                map.put(browser, seen == null ? 1 : seen + 1);
            }
        } finally {
            // Always release the file handle, even when parsing fails part-way through.
            reader.close();
        }
        for (Map.Entry<String, Integer> entry : map.entrySet()) {
            System.out.println(entry.getKey() + " " + entry.getValue());
        }
    }

    /** Prints the position of the 5th double quote in a sample access-log line. */
    @Test
    public void testGetCharacterPosition() {
        String value = "60.247.54.4 - - [18/Sep/2013:07:16:09 +0000] \"GET /wp-content/uploads/2013/05/favicon.ico HTTP/1.1\" 200 1150 \"-\" \"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36\"";
        int index = getCharacterPosition(value, "\"", 5);
        System.out.println(index);
    }

    /**
     * Finds the start offset of the {@code index}-th match of {@code operator} in {@code value}.
     *
     * @param value    text to search
     * @param operator regular expression to look for
     * @param index    1-based occurrence number
     * @return start offset of that occurrence, or {@code -1} when {@code value} contains
     *         fewer than {@code index} matches
     */
    private int getCharacterPosition(String value, String operator, int index) {
        Matcher matcher = Pattern.compile(operator).matcher(value);
        int seen = 0;
        while (matcher.find()) {
            seen++;
            if (seen == index) {
                return matcher.start();
            }
        }
        // Calling start() after a failed find() would throw IllegalStateException.
        return -1;
    }
}
5.基于MapReduce执行大数据统计
5.1 添加maven打包的插件,以后用 mvn assembly:assembly 打包
<build>
<plugins>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<archive>
<manifest>
<mainClass></mainClass>
</manifest>
</archive>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
</plugin>
</plugins>
</build>
5.2 先用 hadoop fs -put 把log文件上传到HDFS,之后执行如下sh脚本
hadoop jar /var/tmp/hadoop_train-1.0-SNAPSHOT-jar-with-dependencies.jar com.lzk.hadoop.LogApp /access.log.10 /browser
5.3 贴上代码
/**
 * MapReduce job that counts how many access-log lines were produced by each browser.
 * Expects two arguments: the input path and the output path.
 */
public class LogApp {

    /**
     * Emits {@code (browser, 1)} for every log line whose user-agent field can be located.
     */
    public static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        private UserAgentParser userAgentParser;
        private final LongWritable one = new LongWritable(1);

        /** Creates the parser once per mapper rather than once per record. */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            userAgentParser = new UserAgentParser();
        }

        /**
         * Parses the tail of the line (starting at its 5th double quote) as a user agent and
         * writes the browser name with a count of one.
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            int start = getCharacterPosition(line, "\"", 5);
            if (start < 0) {
                // Blank or malformed line: count it and move on so one bad record
                // does not fail the whole job.
                context.getCounter("LogApp", "MALFORMED_LINES").increment(1);
                return;
            }
            String source = line.substring(start);
            UserAgent agent = userAgentParser.parse(source);
            String browser = agent.getBrowser();
            context.write(new Text(browser), one);
        }

        /** Drops the parser reference when the mapper finishes. */
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            userAgentParser = null;
        }
    }

    /** Sums the counts received for each browser. */
    public static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            long sum = 0;
            for (LongWritable value : values) {
                sum += value.get();
            }
            context.write(key, new LongWritable(sum));
        }
    }

    /**
     * Configures and runs the job, exiting with 0 on success and 1 on failure.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        if (args.length < 2) {
            // Fail with a readable message instead of ArrayIndexOutOfBoundsException.
            System.err.println("Usage: LogApp <input path> <output path>");
            System.exit(2);
        }
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration, "LogApp");
        job.setJarByClass(LogApp.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // Map-side settings
        job.setMapperClass(LogApp.MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        // Summation is associative and the reducer's input and output types match,
        // so it can also run as a combiner to reduce the data shuffled to reducers.
        job.setCombinerClass(LogApp.MyReducer.class);
        // Reduce-side settings
        job.setReducerClass(LogApp.MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // Output path of the job
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * Finds the start offset of the {@code index}-th match of {@code operator} in {@code value}.
     *
     * @param value    text to search
     * @param operator regular expression to look for
     * @param index    1-based occurrence number
     * @return start offset of that occurrence, or {@code -1} when {@code value} contains
     *         fewer than {@code index} matches
     */
    private static int getCharacterPosition(String value, String operator, int index) {
        Matcher matcher = Pattern.compile(operator).matcher(value);
        int seen = 0;
        while (matcher.find()) {
            seen++;
            if (seen == index) {
                return matcher.start();
            }
        }
        // Calling start() after a failed find() would throw IllegalStateException.
        return -1;
    }
}
网友评论