// Code listing follows:
package com.test;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;
/**
* 标题: 利用 htmlparser 提取网页纯文本的例子
*/
TestHtMlParser kelas publik {
public static void testHtml () {
mencoba {
String Scurrentline;
String stotalstring;
Scurrentline = "";
stalasstring = "";
java.io.inputstream l_urlstream;
java.net.url l_url = java.net.url baru ("http://www.ideagrace.com/html/doc/2006/07/04/00929.html");
java.net.httpurlconnection l_connection = (java.net.httpurlconnection) l_url.openconnection ();
l_connection.connect ();
l_urlstream = l_connection.getInputStream ();
java.io.bufferedReader l_reader = new java.io.bufferedReader (java.io.inputStreamReader baru (l_urlstream));
while ((scurrentline = l_reader.readline ())! = null) {
StotalString += Scurrentline +"/r/n";
// System.out.println (StotalString);
}
String testText = ExtractText (StotalString);
System.out.println (testText);
} catch (Exception e) {
e.printstacktrace ();
}
}
Public Static String ExtractText (String InputhTml) melempar Exception {
Teks StringBuffer = StringBuffer baru ();
Parser parser = parser.createParser (string baru (inputhtml.getbytes (), "gbk"), "gbk");
// 遍历所有的节点
NodeList node = parser.extractAllNodestAtmatch (nodefilter baru () {
public boolean accept (simpul simpul) {
Kembali Benar;
}
});
System.out.println (node.size ()); // 打印节点的数量
untuk (int i = 0; i <nodes.size (); i ++) {
Node nodet = nodes.elementat (i);
//System.out.println (nodet.getText ());
text.append (string baru (nodet.toplaintextString (). getBytes ("gbk"))+"/r/n");
}
return text.tostring ();
}
public static void test5 (String Resource) melempar Exception {
Parser myparser = parser baru (sumber daya);
myparser.setencoding ("gbk");
String filterstr = "tabel";
Nodefilter filter = tagnamefilter baru (filterstr);
Nodelist nodelist = myparser.extractAllNodestAtmatch (filter);
Tabletag tabletag = (tabletag) nodelist.elementat (11);
}
public static void main (string [] args) melempar pengecualian {
// test5 ("http://www.google.com");
testhtml ();
}
}