資源簡介
對圖片進行去水印處理,利用 Tess4J 識別圖片並提取信息,生成 Excel 表格。裡面包含字庫文件,所以占用空間較大。
說明:https://blog.csdn.net/xlantian/article/details/80789115
代碼片段和文件信息
import?java.awt.Rectangle;
import?java.io.File;
import?java.io.FileOutputStream;
import?java.io.IOException;
import?net.sourceforge.tess4j.*;
import?org.apache.poi.ss.usermodel.HorizontalAlignment;
import?org.apache.poi.xssf.usermodel.*;
public?class?RP?{
private?String?a0=“無法識別“c0=“無法識別“;
public?void?PickUp(String?s)?????????????//提取文字中的公司名和注冊號
{
int?c1=-1c2=-1;
c1=s.lastIndexOf(“號?:“);
c2=s.indexOf(“\n“);
if(c1!=-1&&c2!=-1)
{
c0=s.substring(c1+4?c2);
}
int?a1=-1a2=-1;
a1=s.indexOf(“稱?:“);
a2=s.lastIndexOf(“\n“);
if(a1!=-1&&a2!=-1)
{
a0=s.substring(a1+4?a2);
}
/* else{
int?b1=s.indexOf(“名稱:“);
????int?b2=s.lastIndexOf(“有限公司“);
????if(b1!=-1)
????{
??b0=s.substring(b1+3?b2+2);
}
}*/
}
public?void?toExcel(int?iXSSFWorkbook?wbXSSFSheet?sheet)???????????//將文字信息做成表格
{
if(i==0)????????????????????????????????????????????????????//第一行的話,就設(shè)置列名等屬性
{
XSSFRow?row=sheet.createRow(0);
XSSFCellstyle?style=(XSSFCellstyle)wb.createCellstyle();
style.setAlignment(HorizontalAlignment.CENTER);
XSSFCell?cell=row.createCell(0);
cell.setCellValue(“企業(yè)名稱“);
????????cell.setCellstyle(style);??????????
????????cell?=?row.createCell(1);?
????????cell.setCellValue(“企業(yè)注冊號“);??
????????cell.setCellstyle(style);
}
else???????????????????????????????????//不是第一行就將識別到的信息輸入表格
{
XSSFRow?row?=?sheet.createRow(i);?
row.createCell(0).setCellValue(this.a0);??
row.createCell(1).setCellValue(this.c0);?
}
}
public?static?void?main(String[]?args)?throws?IOException?{
RP?rp=new?RP();
int?num=1;
Rectangle?ret=new?Rectangle(0055080);???????????//設(shè)置一個矩形區(qū)域,作為識別部分,減少運(yùn)行時間提高識別率
File?root?=?new?File(System.getProperty(“user.dir“)?+?“/imgs“);//存放處理后的圖片,imgs文件夾
File?res=new?File(System.getProperty(“user.dir“)?+?“/res“);//源圖片位置,res文件夾下
ITesseract?instance?=?new?Tesseract();
instance.setLanguage(“songti“);??????????????//使用訓(xùn)練好中文字庫識別
XSSFWorkbook?wb=new?XSSFWorkbook();
XSSFSheet?sheet=wb.createSheet(“信息匯總“);
rp.toExcel(0wbsheet);????????//設(shè)置列名
try?{
File[]?ress?=?res.listFiles();
int?i=0;
for(File?file?:?ress){
i++;
WaterMark.Clean(file.getAbsolutePath()“F:\\eclipse-workspace\\ReadPicture\\imgs\\“+i+“.png“);
}????????//去除源圖片水印,處理后的圖片放到img文件夾
File[]?files?=?root.listFiles();
for?(File?file?:?files)?{????????????????????????//對去除水印后的圖片逐個處理
String?result?=?instance.doOCR(fileret);??????????//開始采用doOCR(file)效率很低,因為圖片內(nèi)容太多
System.out.print(result);
rp.a0=“無法識別“;
rp.c0=“無法識別“;
rp.PickUp(result);?????????????//調(diào)用信息提取的函數(shù),提取出企業(yè)名和企業(yè)注冊號
rp.toExcel(numwbsheet);??????//調(diào)用toExcel函數(shù),將提取到的信息寫入
num++;
}
}?catch?(TesseractException?e)?{
System.err.println(e.getMessage());
}
?try?{??
????????????FileOutputStream?fout?=?new?FileOutputStr
?屬性????????????大小?????日期????時間???名稱
-----------?---------??----------?-----??----
?????目錄???????????0??2018-06-15?10:49??ReadPicture\
?????文件????????2180??2018-05-27?13:22??ReadPicture\.classpath
?????文件?????????387??2018-05-21?20:13??ReadPicture\.project
?????目錄???????????0??2018-06-21?13:52??ReadPicture\.settings\
?????文件?????????592??2018-05-21?20:13??ReadPicture\.settings\org.eclipse.jdt.core.prefs
?????目錄???????????0??2018-06-21?13:52??ReadPicture\bin\
?????目錄???????????0??2018-06-16?11:23??ReadPicture\bin\com\
?????目錄???????????0??2018-06-16?11:23??ReadPicture\bin\com\recognition\
?????目錄???????????0??2018-06-16?11:23??ReadPicture\bin\com\recognition\software\
?????目錄???????????0??2018-06-16?11:23??ReadPicture\bin\com\recognition\software\jdeskew\
?????文件?????????676??2018-06-16?11:23??ReadPicture\bin\com\recognition\software\jdeskew\ImageDeskew$HoughLine.class
?????文件????????3616??2018-06-16?11:23??ReadPicture\bin\com\recognition\software\jdeskew\ImageDeskew.class
?????文件????????3316??2018-06-16?11:23??ReadPicture\bin\com\recognition\software\jdeskew\ImageUtil.class
?????目錄???????????0??2018-06-16?11:23??ReadPicture\bin\net\
?????目錄???????????0??2018-06-16?11:23??ReadPicture\bin\net\sourceforge\
?????目錄???????????0??2018-06-16?11:23??ReadPicture\bin\net\sourceforge\tess4j\
?????文件?????????287??2018-06-16?11:23??ReadPicture\bin\net\sourceforge\tess4j\ITessAPI$CANCEL_FUNC.class
?????文件?????????892??2018-06-16?11:23??ReadPicture\bin\net\sourceforge\tess4j\ITessAPI$EANYCODE_CHAR.class
?????文件????????1263??2018-06-16?11:23??ReadPicture\bin\net\sourceforge\tess4j\ITessAPI$ETEXT_DESC.class
?????文件?????????559??2018-06-16?11:23??ReadPicture\bin\net\sourceforge\tess4j\ITessAPI$Tessba
?????文件?????????580??2018-06-16?11:23??ReadPicture\bin\net\sourceforge\tess4j\ITessAPI$TessChoiceIterator.class
?????文件?????????583??2018-06-16?11:23??ReadPicture\bin\net\sourceforge\tess4j\ITessAPI$TessMutableIterator.class
?????文件?????????411??2018-06-16?11:23??ReadPicture\bin\net\sourceforge\tess4j\ITessAPI$TessOcrEngineMode.class
?????文件?????????421??2018-06-16?11:23??ReadPicture\bin\net\sourceforge\tess4j\ITessAPI$TessOrientation.class
?????文件?????????574??2018-06-16?11:23??ReadPicture\bin\net\sourceforge\tess4j\ITessAPI$TessPageIterator.class
?????文件?????????421??2018-06-16?11:23??ReadPicture\bin\net\sourceforge\tess4j\ITessAPI$TessPageIteratorLevel.class
?????文件?????????785??2018-06-16?11:23??ReadPicture\bin\net\sourceforge\tess4j\ITessAPI$TessPageSegMode.class
?????文件?????????438??2018-06-16?11:23??ReadPicture\bin\net\sourceforge\tess4j\ITessAPI$TessParagraphJustification.class
?????文件?????????841??2018-06-16?11:23??ReadPicture\bin\net\sourceforge\tess4j\ITessAPI$TessPolyBlockType.class
?????文件?????????580??2018-06-16?11:23??ReadPicture\bin\net\sourceforge\tess4j\ITessAPI$TessResultIterator.class
?????文件?????????580??2018-06-16?11:23??ReadPicture\bin\net\sourceforge\tess4j\ITessAPI$TessResultRenderer.class
............此處省略153個文件信息
評論
共有 條評論