資源簡介
Java 項目:基于 Hadoop 對網站日志數據進行分析,使用 MapReduce 框架實現,并包含約 150M 的網站日志數據。
代碼片段和文件信息
package?com.zzhao;
import?java.io.IOException;
import?org.apache.hadoop.conf.Configuration;
import?org.apache.hadoop.fs.FileSystem;
import?org.apache.hadoop.fs.Path;
import?org.apache.hadoop.io.LongWritable;
import?org.apache.hadoop.io.Text;
import?org.apache.hadoop.mapreduce.Job;
import?org.apache.hadoop.mapreduce.Mapper;
import?org.apache.hadoop.mapreduce.Reducer;
import?org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import?org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public?class?LogClean?{
private?static?String?FOLDER_INPUT?=?“Resource/input“;
private?static?String?FOLDER_OUTPUT?=?“Resource/output“;
private?static?long?PV?=?0;
private?static?long?register?=?0;
private?static?long?IP?=?0;
private?static?long?jumper?=?0;
public?static?void?main(String[]?args)?throws?IOException?ClassNotFoundException?InterruptedException?{
Configuration?cfg?=?new?Configuration();
Job?job?=?Job.getInstance(cfg);
job.setMapperClass(LogCleanMapper.class);
job.setReducerClass(LogCleanReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job?new?Path(FOLDER_INPUT));
Path?outputDir?=?new?Path(FOLDER_OUTPUT);
FileSystem?fs?=?FileSystem.get(cfg);
if?(fs.exists(outputDir))?{
fs.delete(outputDir?true);
}
FileOutputFormat.setOutputPath(job?outputDir);
boolean?flag?=?job.waitForCompletion(true);
if?(flag)?{
System.out.println(“\tClean?process?success!\n\n\n“);
System.out.println(“\tPV量:\t\t“?+?PV);
System.out.println(“\t注冊用戶數:\t“?+?register);
System.out.println(“\t獨立IP數:\t“?+?IP);
System.out.println(“\t跳出用戶數:\t“?+?jumper);
}?else?{
System.out.println(“Clean?process?failed!“);
}
}
static?class?LogCleanMapper?extends?Mapper?{
LogParser?logParser?=?new?LogParser();
Text?k?=?new?Text();
Text?v?=?new?Text();
protected?void?map(LongWritable?key?Text?value
org.apache.hadoop.mapreduce.Mapper.Context?context)
throws?java.io.IOException?InterruptedException?{
PV?+=?1;
final?String[]?parsed?=?logParser.parse(value.toString());
//?step1.過濾掉靜態資源訪問請求
if?(parsed[2].startsWith(“GET?/static/“)?||?parsed[2].startsWith(“GET?/uc_server“))?{
return;
}
//?step2.過濾掉開頭的指定字符串
if?(parsed[2].startsWith(“GET?/“))?{
parsed[2]?=?parsed[2].substring(“GET?/“.length());
}?else?if?(parsed[2].startsWith(“POST?/“))?{
parsed[2]?=?parsed[2].substring(“POST?/“.length());
}
//?step3.過濾掉結尾的特定字符串
if?(parsed[2].endsWith(“?HTTP/1.1“))?{
parsed[2]?=?parsed[2].substring(0?parsed[2].length()?-?“?HTTP/1.1“.length());
}
//?step4.只寫入前三個記錄類型項
k.set(parsed[0]);
v.set(parsed[1]?+?“\t\t“?+?parsed[2]);
context.write(k?v);
//?判斷是否新用戶
if?(p
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     文件       6132  2019-06-04 11:07  Hadoop\.classpath
     文件        382  2019-06-04 10:27  Hadoop\.project
     文件         57  2019-06-04 14:15  Hadoop\.settings\org.eclipse.core.resources.prefs
     文件       3118  2019-06-11 15:54  Hadoop\bin\com\zzhao\LogClean$LogCleanMapper.class
     文件       2275  2019-06-11 15:54  Hadoop\bin\com\zzhao\LogClean$LogCleanReducer.class
     文件       3499  2019-06-11 15:54  Hadoop\bin\com\zzhao\LogClean.class
     文件       2529  2019-06-11 15:53  Hadoop\bin\com\zzhao\LogParser.class
     文件   61084192  2019-06-04 13:56  Hadoop\bin\input\access_2013_05_30.log
     文件  157069653  2019-06-04 14:32  Hadoop\bin\input\access_2013_05_31.log
     文件        879  2019-06-04 10:49  Hadoop\bin\log4j.properties
     文件        567  2019-06-11 15:53  Hadoop\bin\org\apache\hadoop\io\nativeio\NativeIO$CachedUid.class
     文件        644  2019-06-11 15:53  Hadoop\bin\org\apache\hadoop\io\nativeio\NativeIO$POSIX$CachedName.class
     文件       1545  2019-06-11 15:53  Hadoop\bin\org\apache\hadoop\io\nativeio\NativeIO$POSIX$CacheManipulator.class
     文件       1302  2019-06-11 15:53  Hadoop\bin\org\apache\hadoop\io\nativeio\NativeIO$POSIX$IdCache.class
     文件       1592  2019-06-11 15:53  Hadoop\bin\org\apache\hadoop\io\nativeio\NativeIO$POSIX$NoMlockCacheManipulator.class
     文件       2575  2019-06-11 15:53  Hadoop\bin\org\apache\hadoop\io\nativeio\NativeIO$POSIX$Stat.class
     文件       9842  2019-06-11 15:53  Hadoop\bin\org\apache\hadoop\io\nativeio\NativeIO$POSIX.class
     文件       1595  2019-06-11 15:53  Hadoop\bin\org\apache\hadoop\io\nativeio\NativeIO$Windows$AccessRight.class
     文件       2316  2019-06-11 15:53  Hadoop\bin\org\apache\hadoop\io\nativeio\NativeIO$Windows.class
     文件       9297  2019-06-11 15:53  Hadoop\bin\org\apache\hadoop\io\nativeio\NativeIO.class
     文件      30453  2019-06-11 15:53  Hadoop\bin\org\apache\hadoop\mapred\YARNRunner.class
     文件     428480  2019-06-04 15:48  Hadoop\bin\output\.part-r-00000.crc
     文件          8  2019-06-04 15:48  Hadoop\bin\output\._SUCCESS.crc
     文件   54844160  2019-06-04 15:48  Hadoop\bin\output\part-r-00000
     文件          0  2019-06-04 15:48  Hadoop\bin\output\_SUCCESS
     文件      62983  2016-01-14 08:45  Hadoop\lib\activation-1.1.jar
     文件       4467  2016-01-14 08:45  Hadoop\lib\aopalliance-1.0.jar
     文件      44925  2016-01-14 08:45  Hadoop\lib\apacheds-i18n-2.0.0-M15.jar
     文件     691479  2016-01-14 08:45  Hadoop\lib\apacheds-kerberos-codec-2.0.0-M15.jar
     文件      16560  2016-01-14 08:45  Hadoop\lib\api-asn1-api-1.0.0-M20.jar
............此處省略124個文件信息
評論
共有 條評論