// Copy code. The code is as follows:
package com.test;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;
/**
* 标题: 利用 htmlParser 提取网页纯文本的例子
*/
public class testhtmlparser {
public static void testhtml () {
versuchen {
String -Skurrentline;
String Stotalstring;
scurrentline = "";
stotalstring = "";
java.io.inputstream l_urlstream;
java.net.url l_url = new java.net.url ("http://www.ideagrace.com/html/doc/2006/07/04/00929.html");
java.net.httpurlConnection l_connection = (java.net.httpurlConnection) l_url.openconnection ();
l_connection.connect ();
l_urlstream = l_connection.getInputStream ();
java.io.bufufferedReader l_reader = new Java.io.buffenedReader (new java.io.inputStreamReader (l_urlstream));
while ((scurrentline = l_reader.readline ())! = null) {
stotalstring += scurrentline +"/r/n";
// system.out.println (stotalstring);
}
String testText = extractText (StotalString);
System.out.println (testtext);
} catch (Ausnahme e) {
E. printstacktrace ();
}
}
public static String extractText (String Inputhtml) löst Ausnahme {aus {
StringBuffer text = new StringBuffer ();
Parser Parser = Parser.CreateParser (neuer String (inputhtml.getBytes (), "gbk"), "gbk");
// 遍历所有的节点
Nodelist nodes = parser.extractAllnodesthatMatch (new nodeFilter () {
public boolean Accept (Knotenknoten) {
zurückkehren;
}
});
System.out.println (nodes.size ()); // 打印节点的数量
für (int i = 0; i <nodes.size (); i ++) {
Node nodet = nodes.elementat (i);
//System.out.println (nodet.getText ());
text.Append (neuer String (nodet.toplaintextstring (). getBytes ("gbk"))+"/r/n");
}
return text.toString ();
}
public static void test5 (String Resource) löst Ausnahme {aus {
Parser myparser = neuer Parser (Ressource);
MyParser.SetenCoding ("GBK");
String filterstr = "table";
NodeFilter filter = new TagNameFilter (filterstr);
Nodelist nodelist = myparser.extractAllnodesthatMatch (Filter);
Tabletag Tabletag = (Tabletag) nodelist.elementat (11);
}
public static void main (String [] args) löst Ausnahme {aus {
// test5 ("http://www.google.com");
testhtml ();
}
}