// The code is as follows:
package com.test;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;
/**
* 标题: 利用 htmlparser 提取网页纯文本的例子
*/
classe pública testhtmlparser {
public static void testhtml () {
tentar {
String scurrentline;
String stotalstring;
scurrentline = "";
STOTALSTRING = "";
java.io.inputStream l_urlstream;
java.net.url l_url = new java.net.url ("http://www.ideagrace.com/html/doc/2006/07/04/00929.html");
java.net.httpurlConnection l_connection = (java.net.httpurlConnection) l_url.openconnection ();
l_connection.connect ();
l_urlstream = l_connection.getInputStream ();
java.io.BufferedReader l_Reader = new java.io.BufferedReader (new java.io.inputStreamReader (l_urlstream));
while ((scurrentline = l_reader.readline ())! = null) {
STOTALSTRING += ScurrentLine +"/R/N";
// system.out.println (stotalstring);
}
String testText = ExtractText (STOTALSTRING);
System.out.println (testText);
} catch (Exceção e) {
E.PrintStackTrace ();
}
}
public Static String ExtractText (String inputhtml) lança exceção {
StringBuffer text = new StringBuffer ();
Parser parser = parser.createParser (new string (inputhtml.getbytes (), "gbk"), "gbk");
// 遍历所有的节点
Nodelist nós = parser.extractallnodesthatmatch (new NodeFilter () {
public boolean Acep (nó nó) {
retornar true;
}
});
System.out.println (modes.size ()); // 打印节点的数量
for (int i = 0; i <modes.size (); i ++) {
Nó node = nós.Elementat (i);
//System.out.println (nodet.getText ());
text.append (new string (nodet.toplainTextString (). getBytes ("gbk"))+"/r/n");
}
retornar text.toString ();
}
public static void test5 (recurso da string) lança exceção {
Analisador myparser = novo analisador (recurso);
myParser.SetEncoding ("GBK");
String filterstr = "tabela";
Filtro nodeFilter = new TagnameFilter (filterstr);
Nodelist nodelist = myparser.extractallnodesthatmatch (filtro);
Tableteg tabletag = (comprimido) nodelist.Elementat (11);
}
public static void main (string [] args) lança exceção {
// test5 ("http://www.google.com");
testhtml ();
}
}