資源簡介
包含兩種平臺上運行的kmeans算法:一種是在Hadoop系統上的并行化kmeans算法,支持讀文件,執行聚類算法,輸出質心文件,將每個數據的聚類信息輸出到控制臺上;另一種是串行的聚類算法,支持讀文件數據,執行kmeans算法,將每個數據的聚類信息輸出到文件中。代碼注釋清晰。

代碼片段和文件信息
package?com.kmeans;
import?java.io.IOException;
import?org.apache.hadoop.conf.Configuration;
import?org.apache.hadoop.fs.FSDataInputStream;
import?org.apache.hadoop.fs.FileStatus;
import?org.apache.hadoop.fs.FileSystem;
import?org.apache.hadoop.fs.Path;
import?org.apache.hadoop.io.DoubleWritable;
import?org.apache.hadoop.io.LongWritable;
import?org.apache.hadoop.io.NullWritable;
import?org.apache.hadoop.io.Text;
import?org.apache.hadoop.io.WritableComparable;
import?org.apache.hadoop.io.WritableComparator;
import?org.apache.hadoop.mapreduce.Counter;
import?org.apache.hadoop.mapreduce.Job;
import?org.apache.hadoop.mapreduce.Mapper;
import?org.apache.hadoop.mapreduce.Reducer;
import?org.apache.hadoop.mapreduce.Reducer.Context;
import?org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import?org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import?org.apache.hadoop.util.LineReader;
class?Center{
protected?static?int?k?=?3; //質心的個數
protected?static?int?dimension?=?2;?//數據的維度
//從初始的質心文件中加載質心,并返回質心文件字符串,質心之間用tab分割
public?String?loadInitCenter(Path?path)?throws?IOException?{
StringBuffer?sb?=?new?StringBuffer();
Configuration?conf?=?new?Configuration();
FileSystem?hdfs?=?FileSystem.get(conf);
FSDataInputStream?dis?=?hdfs.open(path);
LineReader?in?=?new?LineReader(dis?conf);
Text?line?=?new?Text();
while(in.readLine(line)?>?0)?{
sb.append(line.toString().trim());//trim():去掉字符串兩端多余的空格
sb.append(“\t“);
}
return?sb.toString().trim();
}
//從每次迭代的質心文件里讀取質心,并返回字符串
public?String?loadCenter(Path?path)?throws?IOException?{
StringBuffer?sb?=?new?StringBuffer();
Configuration?conf?=?new?Configuration();
FileSystem?hdfs?=?FileSystem.get(conf);
//獲取文件列表
FileStatus[]?files?=?hdfs.listStatus(path);
for(int?i?=?0;?i? Path?filePath?=?files[i].getPath();
if(!filePath.getName().contains(“part“))?continue;
FSDataInputStream?dis?=?hdfs.open(filePath);
LineReader?in?=?new?LineReader(dis?conf);
Text?line?=?new?Text();
while(in.readLine(line)?>?0)?{
sb.append(line.toString().trim());
sb.append(“\t“);
}
}
return?sb.toString().trim();
}
}
public?class?Kmeans?{
private?static?String?FLAG?=?“a“;//用于存聚類中心信息
//計算兩個向量之間的?歐式距離
public?static?double?distance(double[]?a?double[]?b)?{
if(a?==?null?||?b?==?null?||?a.length?!=?b.length)?return?Double.MAX_VALUE;
double?d?=?0;
for(int?i?=?0;?i? d?+=?Math.pow(a[i]?-?b[i]?2);
}
return?Math.sqrt(d);
}
public?static?class?mapper?extends?Mapperject?Text?Text?Text>{
double[][]?centers?=?new?double[Center.k][];//存儲每個簇中心的信息
String[]?centerstrArray?=?null;//用于存儲聚類中心的字符串連接信息
public?void?setup(Context?context)?{
//將放在context中的聚類中心轉換為數組的形式,方便使用
String?kmeansS?=?context.getConfiguration().get(FLAG);
centerstrArray?=?kmeansS.split(“\t“);
for(int?i?=?0;?i? String[]?segs?=?centerstrArray[i].split(““);
centers[i]?=?new?double[segs
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2019-05-27 02:44  hadoop并行化和非并行化的kmeans算法\
     目錄           0  2019-05-27 02:44  hadoop并行化和非并行化的kmeans算法\并行化kmeans算法\
     文件        7808  2019-05-26 11:49  hadoop并行化和非并行化的kmeans算法\并行化kmeans算法\Kmeans.java
     目錄           0  2019-05-27 02:44  hadoop并行化和非并行化的kmeans算法\非并行化kmeans算法\
     文件        4515  2019-05-26 12:27  hadoop并行化和非并行化的kmeans算法\非并行化kmeans算法\Kmeans.java
- 上一篇:opencv_3rdparty中所有ffmpeg庫
- 下一篇:職工工資管理系統
評論
共有 條評論