// Copy the code; the code is as follows:
package com.test;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;
/**
*標題:利用htmlparser提取網頁純文本的例子
*/
公共類Testhtmlparser {
公共靜態void testhtml(){
嘗試 {
字符串scurrentline;
字符串stotalstring;
scurrentline =“”;
stotalString =“”;
java.io.inputstream l_urlstream;
java.net.url l_url = new Java.net.url(“ http://www.ideagrace.com/html/doc/2006/07/07/04/00929.html”);
java.net.httpurlconnection l_connection =(java.net.httpurlConnection)l_url.openconnection();
l_connection.connect();
l_urlstream = l_connection.getInputStream();
java.io.bufferedReader l_reader = new Java.io.bufferedReader(new Java.io.inputStreamReader(l_urlstream));
while(((scurrentline = l_reader.readline())!= null){
stotalstring += scurrentline +“/r/n”;
// system.out.println(stotalString);
}
字符串testText = extractText(stotalString);
system.out.println(testText);
} catch(異常E){
e.printstacktrace();
}
}
公共靜態字符串extracttext(字符串inputhtml)拋出異常{
StringBuffer text = new StringBuffer();
Parser parser = parser.createparser(new String(inputhtml.getBytes(),“ gbk”),“ gbk”);
//遍歷所有的節點
nodelist nodes = parser.extractallNodestHatMatch(new NodeFilter(){
public boolean接受(節點節點){
返回true;
}
});
system.out.println(nodes.size()); //打印節點的數量
for(int i = 0; i <nodes.size(); i ++){
node nodet = nodes.elementat(i);
//system.out.println(nodet.getText());
text.append(new String(nodet.toplaintextstring()。getBytes(“ gbk”))+“/r/n”);
}
返回text.tostring();
}
公共靜態void test5(字符串資源)拋出異常{
解析器myparser =新解析器(資源);
myparser.setencoding(“ gbk”);
字符串filterstr =“ table”;
NodeFilter Filter = new TagNameFilter(filterstr);
Nodelist Nodelist = myParser.extractallNodestHatMatch(filter);
tabletag tabletag =(tabletag)Nodelist.Elementat(11);
}
public static void main(string [] args)拋出異常{
// test5(“ http://www.google.com”);
testhtml();
}
}