代码如下:
package com.test;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;
/**
* t : 利用 htmlparser 提取网页纯文本的例子
*/
공개 클래스 testhtmlparser {
public static void testhtml () {
노력하다 {
문자열 scurrentline;
문자열 stotalstring;
scurrentline = "";
stotalstring = "";
java.io.inputstream l_urlstream;
java.net.url l_url = new java.net.url ( "http://www.ideagrace.com/html/doc/2006/07/04/00929.html");
java.net.httpurlconnection l_connection = (java.net.httpurlconnection) l_url.openconnection ();
l_connection.connect ();
l_urlstream = l_connection.getInputStream ();
java.io.bufferedReader l_Reader = new java.io.bufferedReader (new java.io.inputStreamReader (l_urlstream));
while ((scurrentline = l_reader.readline ())! = null) {
stotalstring += scurrentline +"/r/n";
// system.out.println (stotalstring);
}
문자열 testText = extrestText (stotalString);
System.out.println (TestText);
} catch (예외 e) {
e.printstacktrace ();
}
}
public static string extrecttext (String inputhtml)는 예외 {
StringBuffer text = new StringBuffer ();
parser parser = parser.createParser (new String (inputhtml.getBytes (), "gbk"), "gbk");
// 遍历所有的节点
Nodelist Nodes = Parser.extractallNodestHatMatch (new NodeFilter () {
공개 부울 수락 (노드 노드) {
진실을 반환하십시오.
}
});
System.out.println (Nodes.size ()); // 打印节点的数量
for (int i = 0; i <nodes.size (); i ++) {
노드 nodet = nodes.elementat (i);
//system.out.println (nodet.getText ());
text.append (new String (nodet.toplaintextstring (). getbytes ( "gbk"))+"/r/n");
}
return text.toString ();
}
public static void test5 (String Resource)는 예외 {
Parser myparser = 새로운 파서 (리소스);
myparser.setencoding ( "gbk");
문자열 filterstr = "테이블";
NodeFilter 필터 = 새로운 TagNameFilter (Filterstr);
NODELIST NODELIST = MYPARSER.ExtractAllNodestHatMatch (필터);
Tabletag tabletag = (tabletag) nodelist.elementat (11);
}
public static void main (string [] args)은 예외 {
// test5 ( "http://www.google.com");
testhtml ();
}
}