91av视频/亚洲h视频/操亚洲美女/外国一级黄色毛片 - 国产三级三级三级三级

資源簡介

山東大學大數據課程的實驗二。基于Hadoop集群系統(也可以在偽分布式系統上運行)使用Java編寫的倒排索引實現,具有使用停詞表功能,使用正則表達式選擇規范的單詞。代碼重構了setup(),map(),combiner(),partition()和reducer()函數,功能是對文檔進行倒排索引,得到一個單詞有序,且單詞的文件列表同樣有序的倒排列表集合。

資源截圖

代碼片段和文件信息

package com.Test4;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

import java.io.FileInputStream;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Scanner;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public?class?InvertedIndex?{
public?static?class?myMap?extends?Mapperject?Text?Text?IntWritable>{
private?final?static?IntWritable?one?=?new?IntWritable(1);
private?URI[]?remoteFiles;?//?存放停用詞txt文檔統(tǒng)一資源標(biāo)識(shí)符?
private?Set?stopwords;?//存放停用詞?
public?void?setup(Context?context)?throws?IOException?InterruptedException{
Configuration?conf?=?context.getConfiguration();
?remoteFiles?=?Job.getInstance(conf).getCacheFiles();//獲取stop_words.txt?
?stopwords?=?new?TreeSet();?//對(duì)于URI列表里的每一個(gè)停用詞表?
?for?(int?i?=?0;?i? ?FileInputStream?in?=new?FileInputStream(new?Path(remoteFiles[i].getPath()).getName().toString());
?//讀取文件的每一行?
?Scanner?sc?=new?Scanner(in);?
?while?(sc.hasNextLine())?{
?String?line?=?sc.nextLine();
?String[]?split?=?line.trim().split(“?“);?//trim():去掉空格?制表符等,split(“?“):依據(jù)文件格式可用可不用‘
?for?(int?j?=?0;?j? ?stopwords.add(split[j]);?
?}
}
sc.close();
?}
???? }

public?void?map(object?key?Text?value?Context?context)?throws?IOException?InterruptedException?{
FileSplit?inputSplit?=?(FileSplit)?context.getInputSplit();?
String?filename=inputSplit.getPath().getName();?//獲取文件名?
//正則表達(dá)式去除特殊字符
String?str?=?“\\w+“;//字母或數(shù)字或下劃線或漢字
Pattern?pattern?=?Pattern.compile(str);
String?line?=?value.toString().toLowerCase();
StringTokenizer?itr?=?new?StringTokenizer(line);
String?temp?=?new?String();
for(;?itr.hasMoreTokens();)?{
temp?=?itr.nextToken();
Matcher?ma?=?pattern.matcher(temp);
while(ma.find())?{
String?word?=?ma.group();
if(!stopwords.contains(word))?{
Text?text?=?new?Text();
text.set(word+“#“+filename);?//key?=?word+#+filename
context.write(text?one);
}
}
}
}
}
public?static?class?myCombiner?extends?Reducer{
public?void?reduce(Text?key?Iterable?values?Context?context)?throws?IOException?InterruptedException?{
int?sum?=?0;
for?(IntWritable?val?:?values)?{

評論

共有 條評論

相關資源