資源簡介
只需要把 HTML 讀出來，放到方法裡面，就能得到 HTML 的文本。很好的方法，我找了好久，現在發上來。
代碼片段和文件信息
/*
 * File: WebFormatter.java
 * Created on 2005-6-24
 * Author: Liao Xuefeng asklxf@163.com
 * Copyright (C) 2005 Liao Xuefeng.
 */
import?java.util.*;
import?java.text.SimpleDateFormat;
/**
 * Performs formatting for web display.
 *
 * @author Xuefeng
 */
public?class?WebFormatter?{
????public?static?String?html2text(String?html)?{
????????StringBuffer?sb?=?new?StringBuffer(html.length());
????????char[]?data?=?html.toCharArray();
????????int?start?=?0;
????????boolean?previousIsPre?=?false;
????????Token?token?=?null;
????????for(;;)?{
????????????token?=?parse(data?start?previousIsPre);
????????????if(token==null)
????????????????break;
????????????previousIsPre?=?token.isPreTag();
????????????sb?=?sb.append(token.getText());
????????????start?+=?token.getLength();
????????}
????????return?sb.toString();
????}
????private?static?Token?parse(char[]?data?int?start?boolean?previousIsPre)?{
????????if(start>=data.length)
????????????return?null;
????????//?try?to?read?next?char:
????????char?c?=?data[start];
????????if(c==‘<‘)?{
????????????//?this?is?a?tag?or?comment?or?script:
????????????int?end_index?=?indexOf(data?start+1?‘>‘);
????????????if(end_index==(-1))?{
????????????????//?the?left?is?all?text!
????????????????return?new?Token(Token.TOKEN_TEXT?data?start?data.length?previousIsPre);
????????????}
????????????String?s?=?new?String(data?start?end_index-start+1);
????????????//?now?we?got?s=“<...>“:
????????????if(s.startsWith(““);
????????????????if(end_comment_index==(-1))?{
????????????????????//?illegal?end?but?treat?as?comment:
????????????????????return?new?Token(Token.TOKEN_COMMENT?data?start?data.length?previousIsPre);
????????????????}
????????????????else
????????????????????return?new?Token(Token.TOKEN_COMMENT?data?start?end_comment_index+3?previousIsPre);
????????????}
????????????String?s_lowerCase?=?s.toLowerCase();
????????????if(s_lowerCase.startsWith(“ript“))?{?//?this?is?a?script:
????????????????int?end_script_index?=?indexOf(data?start+1?“ ript>“);
????????????????if(end_script_index==(-1))
????????????????????//?illegal?end?but?treat?as?script:
????????????????????return?new?Token(Token.TOKEN_script?data?start?data.length?previousIsPre);
????????????????else
????????????????????return?new?Token(Token.TOKEN_script?data?start?end_script_index+9?previousIsPre);
????????????}
????????????else?{?//?this?is?a?tag:
????????????????return?new?Token(Token.TOKEN_TAG?data?start?start+s.length()?previousIsPre);
????????????}
????????}
????????//?this?is?a?text:
????????int?next_tag_index?=?indexOf(data?start+1?‘<‘);
????????if(next_tag_index==(-1))
????????????return?new?Token(Token.TOKEN_TEXT?data?start?data.length?previousIsPre);
????????return?new?Token(Token.TOKEN_TEXT?data?start?next_tag_index?previousIsP
評論
共有 條評論