-
大小: 11KB | 文件類型: .java | 金幣: 1 | 下載: 0 次 | 發布日期: 2021-06-13
- 語言: Java
- 標籤: Hadoop, 大數據, 倒排索引
資源簡介
這是山東大學大數據實驗二,用Hadoop實現文檔的倒排索引
代碼片段和文件信息
package?cn.edu.zucc.mapreduce;
import?java.io.BufferedReader;??
import?java.io.FileReader;??
import?java.io.IOException;??
import?java.net.URI;??
import?java.util.List;??
import?java.util.Set;??
import?java.util.StringTokenizer;??
import?java.util.ArrayList;??
import?java.util.TreeSet;??
??
import?org.apache.hadoop.conf.Configuration;??
import?org.apache.hadoop.filecache.DistributedCache;??
import?org.apache.hadoop.fs.Path;??
import?org.apache.hadoop.io.IntWritable;??
import?org.apache.hadoop.io.Text;??
import?org.apache.hadoop.mapreduce.RecordReader;??
import?org.apache.hadoop.mapreduce.lib.input.LineRecordReader;??
import?org.apache.hadoop.mapreduce.InputSplit;??
import?org.apache.hadoop.mapreduce.lib.input.FileSplit;??
import?org.apache.hadoop.mapreduce.TaskAttemptContext;??
import?org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;??
import?org.apache.hadoop.mapreduce.Job;??
import?org.apache.hadoop.mapreduce.Mapper;??
import?org.apache.hadoop.mapreduce.Reducer;??
import?org.apache.hadoop.mapreduce.lib.input.FileInputFormat;??
import?org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;??
/**
?*?mapper
?*?input:
?*?output:
?*?
?*?partitioner
?*?只根據(jù)word哈希
?*?
?*?Combiner
?*?input:
?*?output:
?*?
?*?reducer
?*?intput:
?*?output:;<..>;...>
?*?
?**/
??
public?class?indexinverted?{??
?????
    /*
     * 因為重寫了RecordReader類,這裡要重寫FileInputFormat類來使用自定義FileNameRecordReader.
     * 這個類的主要作用就是返回一個FileNameRecordReader類的實例。
     */
????public?static?class?FileNameRecordReader?extends?RecordReader?{??
????????String?fileName;??
????????LineRecordReader?lrr?=?new?LineRecordReader();??
??
????????@Override??
????????//返回key
????????public?Text?getCurrentKey()?throws?IOException?InterruptedException?{??
????????????return?new?Text(fileName);??
????????}??
??
????????@Override??
????????//返回value
????????public?Text?getCurrentValue()?throws?IOException?InterruptedException?{??
????????????return?lrr.getCurrentValue();??
????????}??
??
????????@Override??
????????public?void?initialize(InputSplit?arg0?TaskAttemptContext?arg1)?throws?IOException?InterruptedException?{??
????????????lrr.initialize(arg0?arg1);??
????????????fileName?=?((FileSplit)?arg0).getPath().getName();??//獲得文件名
????????}??
??
????????public?void?close()?throws?IOException?{??
????????????lrr.close();??
????????}??
??
????????public?boolean?nextKeyValue()?throws?IOException?InterruptedException?{??
????????????return?lrr.nextKeyValue();??
????????}??
??
????????public?float?getProgress()?throws?IOException?InterruptedException?{??
????????????return?lrr.getProgress();??
????????}??
????}?
????
    /*
     *  FileNameRecordReader類繼承自RecordReader,是RecordReader類的自定義實現.
     *  主要作用是將記錄所在的文件名作為key,而不是記錄行所在文件的偏移,獲取文件名所用的語句為:
     *  fileName = ((FileSplit) arg0).getPath().getName();
     */
????public?static?class?FileName
評論
共有 條評論