// The code is as follows:
package com.test;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;
/**
* 标题: 利用 htmlParser 提取网页纯文本的例子
*/
clase pública testHtmlParser {
public static void testHtml () {
intentar {
Línea de perforación de cadena;
StotalString;
ScurrentLine = "";
stotalString = "";
java.io.inputstream l_urlstream;
java.net.url l_url = new java.net.url ("http://www.ideagrace.com/html/doc/2006/07/04/00929.html");
java.net.httpurlconnection l_connection = (java.net.httpurlconnection) l_url.openconnection ();
l_connection.connect ();
l_urlstream = l_connection.getInputStream ();
java.io.bufferedReader l_reader = new java.io.bufferedReader (nuevo java.io.inputstreamreader (l_urlstream));
while ((scurrentline = l_reader.readline ())! = null) {
stotalString += scurrentline +"/r/n";
// System.out.println (stotalString);
}
Cadena testText = ExtractText (stotalString);
System.out.println (testText);
} capt (excepción e) {
E.PrintStackTrace ();
}
}
public static String ExtractText (String inPuthtml) lanza la excepción {
StringBuffer text = new StringBuffer ();
Parser parser = parser.createParser (new String (inPuthtml.getBytes (), "gbk"), "gbk");
// 遍历所有的节点
Nodelist nodos = parser.extractallNodeStHatMatch (new NodeFilter () {
Public Boolean Acept (nodo de nodo) {
devolver verdadero;
}
});
System.out.println (nodo.size ()); // 打印节点的数量
para (int i = 0; i <nodo.size (); i ++) {
Nodo nodo = nodo.Elementat (i);
//System.out.println (nodet.gettext ());
text.append (new String (Nodet.ToPlainextString (). GetBytes ("GBK"))+"/r/n");
}
return text.toString ();
}
public static void test5 (recurso de cadena) arroja una excepción {
Parser myParser = new Parser (recurso);
myparser.setEncoding ("GBK");
Cadena filterStr = "table";
NODEFILTER FILTER = new TagNameFilter (FilterStr);
Nodelist nodelist = myParser.extractallNodeStHatMatch (filtro);
Tabletag Tabletag = (Tabletag) Nodelist.Elementat (11);
}
public static void main (string [] args) lanza la excepción {
// test5 ("http://www.google.com");
testHtml ();
}
}