// The code is as follows:
package com.test;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;
/ **
* 标题: 利用 htmlparser 提取网页纯文本的例子
* /
classe publique TestHtmlParser {
public statique void testhtml () {
essayer {
String ScurrentLine;
String stotalstring;
ScurrentLine = "";
stotalstring = "";
java.io.inputstream l_urlstream;
java.net.url l_url = new java.net.url ("http://www.ideagrace.com/html/doc/2006/07/04/00929.html");
java.net.httpurlconnection l_connection = (java.net.httpurlconnection) l_url.openconnection ();
l_connection.connect ();
l_urlstream = l_connection.getInputStream ();
java.io.bufferedReader l_reader = new Java.io.buffereDReader (new Java.io.inputStreamReader (l_urlstream));
while ((scurrentline = l_reader.readline ())! = null) {
stotalstring + = Scurrentline + "/ r / n";
// System.out.println (stotalstring);
}
String testText = extractText (stotalstring);
System.out.println (testText);
} catch (exception e) {
e.printStackTrace ();
}
}
public static String ExtractText (String inputhtml) lance l'exception {
StringBuffer Text = new StringBuffer ();
Parser parser = parser.createParser (new String (inputhtml.getbytes (), "gbk"), "gbk");
// 遍历所有的节点
NODELIST NODES = PARSER.EXTRATALLNODETHATMATCH (NOUVEAU NODEFILTER () {
booléen public accepter (nœud nœud) {
Retour Vrai;
}
});
System.out.println (nœuds.size ()); // 打印节点的数量
pour (int i = 0; i <nœuds.size (); i ++) {
Nœud nodet = nœuds.elementat (i);
//System.out.println (nodet.getText ());
text.append (new String (Nodet.toplainTextString (). GetBytes ("gbk")) + "/ r / n");
}
return text.toString ();
}
Public Static Void Test5 (String Resource) lève une exception {
Parser myParser = new Parser (ressource);
myParser.SetEncoding ("gbk");
String filterstr = "Table";
NodeFilter Filter = new TagNameFilter (filterstr);
NodeList nodelist = myParser.ExtractAllNodesthatMatch (filtre);
Tabletag Tabletag = (Tabletag) Nodelist.Elementat (11);
}
public static void main (String [] args) lève une exception {
// test5 ("http://www.google.com");
TestHtml ();
}
}