資源簡介
基於 Lucene 實現 Word、PPT、Excel、PDF、TXT 全文檢索
代碼片段和文件信息
???
??
import?java.io.BufferedReader;
import?java.io.File;???
import?java.io.FileInputStream;
import?java.io.FileNotFoundException;
import?java.io.FileReader;???
import?java.io.IOException;???
import?java.io.InputStream;
import?java.io.InputStreamReader;
import?java.io.Reader;
import?java.io.StringReader;
import?java.text.SimpleDateFormat;
import?java.util.Date;???
??
import?org.apache.lucene.analysis.standard.StandardAnalyzer;???
import?org.apache.lucene.document.DateTools;???
import?org.apache.lucene.document.Document;???
import?org.apache.lucene.document.Field;???
import?org.apache.lucene.index.IndexWriter;???
import?org.apache.lucene.store.Directory;???
import?org.apache.lucene.store.SimpleFSDirectory;???
import?org.apache.lucene.util.Version;???
import?org.apache.pdfbox.pdfparser.PDFParser;
import?org.apache.pdfbox.pdmodel.PDDocument;
import?org.apache.pdfbox.util.PDFTextStripper;
import?org.apache.poi.hslf.HSLFSlideShow;
import?org.apache.poi.hslf.model.Slide;
import?org.apache.poi.hslf.model.TextRun;
import?org.apache.poi.hslf.usermodel.RichTextRun;
import?org.apache.poi.hslf.usermodel.SlideShow;
import?org.apache.poi.hssf.usermodel.HSSFCell;
import?org.apache.poi.hssf.usermodel.HSSFDateUtil;
import?org.apache.poi.hssf.usermodel.HSSFRow;
import?org.apache.poi.hssf.usermodel.HSSFSheet;
import?org.apache.poi.hssf.usermodel.HSSFWorkbook;
import?org.apache.poi.hwpf.HWPFDocument;
import?org.apache.poi.hwpf.usermodel.Paragraph;
import?org.apache.poi.hwpf.usermodel.Range;
import?org.apache.poi.poifs.filesystem.DocumentEntry;
import?org.apache.poi.poifs.filesystem.DocumentInputStream;
import?org.apache.poi.poifs.filesystem.POIFSFileSystem;
import?org.apache.poi.util.LittleEndian;
/**
 * 創建索引 Lucene 3.0+
 * @author Administrator
 *
 */
public?class?indexer?{???
????/**??
?????*?@param?args??
?????*?@throws?Exception?
?????*/??
????public?static?void?main(String[]?args)?throws?Exception?{???
????????//保存索引文件的地方???
????????String?indexDir?=?“data\\test\\indexDir“;???
????????//將要搜索TXT文件的地方???
????????String?dateDir?=?“data\\test\\dateDir“;???
????????IndexWriter?indexWriter?=?null;???
????????//創建Directory對象???
????????Directory?dir?=?new?SimpleFSDirectory(new?File(indexDir));???
????????//創建IndexWriter對象
????????//第一個參數是Directory第二個是分詞器
????????//第三個表示是否是創建如果為false為在此基礎上面修改
????????//第四表示表示分詞的最大值,比如說new?MaxFieldLength(2),就表示兩個字一分,
????????//一般用IndexWriter.MaxFieldLength.LIMITED????
????????indexWriter?=?new?IndexWriter(dirnew?StandardAnalyzer(Version.LUCENE_30)true
???????? IndexWriter.MaxFieldLength.UNLIMITED);???
????????File[]?files?=?new?File(dateDir).listFiles();???
????????for?(int?i?=?0;?i????????? Document?doc?=?null;
???????? if(files[i].getName().endsWith(“.txt“)){
????????????doc?=?new?Document();???
????????????//創建Field對象,并放入doc對象中???
????????????doc.add(new?Field(“contents“?new?FileReader(files[i])));????
??????
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     文件        720  2011-12-15 09:09  LuceneFirstDemo\.classpath
     文件        391  2011-12-15 09:03  LuceneFirstDemo\.project
     文件       9932  2011-12-15 15:18  LuceneFirstDemo\bin\indexer.class
     文件       2613  2011-12-15 15:21  LuceneFirstDemo\bin\searcher.class
     文件    3780619  2008-02-11 17:31  LuceneFirstDemo\data\test\dateDir\ArcMap_Tutorial.pdf
     文件       7168  2011-12-15 12:00  LuceneFirstDemo\data\test\dateDir\excelTest1.xls
     文件       7168  2011-12-15 15:23  LuceneFirstDemo\data\test\dateDir\excelTest2.xls
     文件    5053692  2007-03-30 00:00  LuceneFirstDemo\data\test\dateDir\Manual.pdf
     文件       9216  2011-12-14 19:11  LuceneFirstDemo\data\test\dateDir\testDoc1.doc
     文件       9216  2011-12-15 15:23  LuceneFirstDemo\data\test\dateDir\testDoc2.doc
     文件      18432  2011-12-15 15:26  LuceneFirstDemo\data\test\dateDir\testPPT1.ppt
     文件      18432  2011-12-15 15:27  LuceneFirstDemo\data\test\dateDir\testPPT2.ppt
     文件         81  2011-12-14 18:36  LuceneFirstDemo\data\test\dateDir\testTxt1.txt
     文件         71  2011-12-15 15:25  LuceneFirstDemo\data\test\dateDir\testTxt2.txt
     文件         20  2011-12-15 15:27  LuceneFirstDemo\data\test\indexDir\segments.gen
     文件        242  2011-12-15 15:27  LuceneFirstDemo\data\test\indexDir\segments_p
     文件     440888  2011-12-15 15:27  LuceneFirstDemo\data\test\indexDir\_o.fdt
     文件         84  2011-12-15 15:27  LuceneFirstDemo\data\test\indexDir\_o.fdx
     文件         37  2011-12-15 15:27  LuceneFirstDemo\data\test\indexDir\_o.fnm
     文件       6987  2011-12-15 15:27  LuceneFirstDemo\data\test\indexDir\_o.frq
     文件         34  2011-12-15 15:27  LuceneFirstDemo\data\test\indexDir\_o.nrm
     文件      75960  2011-12-15 15:27  LuceneFirstDemo\data\test\indexDir\_o.prx
     文件        476  2011-12-15 15:27  LuceneFirstDemo\data\test\indexDir\_o.tii
     文件      30648  2011-12-15 15:27  LuceneFirstDemo\data\test\indexDir\_o.tis
     文件    1466301  2011-11-22 14:46  LuceneFirstDemo\lib\lucene-core-3.5.0.jar
     文件    3824099  2011-12-14 18:10  LuceneFirstDemo\lib\pdfbox-1.6.0.jar
     文件   10308868  2011-12-14 18:32  LuceneFirstDemo\lib\pdfbox-app-1.6.0.jar
     文件    1675036  2010-10-29 20:40  LuceneFirstDemo\lib\poi-3.7-20101029.jar
     文件     840218  2010-10-29 20:40  LuceneFirstDemo\lib\poi-scratchpad-3.7-20101029.jar
     文件      13220  2011-12-15 15:18  LuceneFirstDemo\src\indexer.java
............此處省略12個文件信息
- 上一篇:佳博標簽打印機 tspl指令集
- 下一篇:阿里巴巴大數據競賽 多個賽事的ppt收集
評論
共有 條評論