// The code is as follows:
package com.test;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;
/**
*标题:利用htmlparser提取网页纯文本的例子
*/
公共类Testhtmlparser {
公共静态void testhtml(){
尝试 {
字符串scurrentline;
字符串stotalstring;
scurrentline =“”;
stotalString =“”;
java.io.inputstream l_urlstream;
java.net.url l_url = new Java.net.url(“ http://www.ideagrace.com/html/doc/2006/07/07/04/00929.html”);
java.net.httpurlconnection l_connection =(java.net.httpurlConnection)l_url.openconnection();
l_connection.connect();
l_urlstream = l_connection.getInputStream();
java.io.bufferedReader l_reader = new Java.io.bufferedReader(new Java.io.inputStreamReader(l_urlstream));
while(((scurrentline = l_reader.readline())!= null){
stotalstring += scurrentline +“/r/n”;
// system.out.println(stotalString);
}
字符串testText = extractText(stotalString);
system.out.println(testText);
} catch(异常E){
e.printstacktrace();
}
}
公共静态字符串extracttext(字符串inputhtml)抛出异常{
StringBuffer text = new StringBuffer();
Parser parser = parser.createparser(new String(inputhtml.getBytes(),“ gbk”),“ gbk”);
//遍历所有的节点
nodelist nodes = parser.extractallNodestHatMatch(new NodeFilter(){
public boolean接受(节点节点){
返回true;
}
});
system.out.println(nodes.size()); //打印节点的数量
for(int i = 0; i <nodes.size(); i ++){
node nodet = nodes.elementat(i);
//system.out.println(nodet.getText());
text.append(new String(nodet.toplaintextstring()。getBytes(“ gbk”))+“/r/n”);
}
返回text.tostring();
}
公共静态void test5(字符串资源)抛出异常{
解析器myparser =新解析器(资源);
myparser.setencoding(“ gbk”);
字符串filterstr =“ table”;
NodeFilter Filter = new TagNameFilter(filterstr);
Nodelist Nodelist = myParser.extractallNodestHatMatch(filter);
tabletag tabletag =(tabletag)Nodelist.Elementat(11);
}
public static void main(string [] args)抛出异常{
// test5(“ http://www.google.com”);
testhtml();
}
}