อัลกอริทึมการแยกวิเคราะห์ HTML ด้วย Java (ตัวอย่างอัลกอริทึม Web Spider ด้วย Java)

ผู้เขียน: Eve Cole เวลาอัปเดต: 2025-02-23 16:00:04

ใคร ๆ ก็ท้อแท้เมื่อต้องเจอกับหน้า HTML ที่ซับซ้อนและยุ่งเหยิง เพราะการดึงข้อมูลที่ต้องการออกมานั้นทำได้ยาก

วิธีที่เก่าแก่ที่สุดคือการใช้นิพจน์ทั่วไป (regular expression)

วิธีที่สองคือการใช้แพ็คเกจ HTMLParser โอเพ่นซอร์ส

ด้านล่างนี้คือซอร์สโค้ดตัวอย่างที่ใช้ htmlparser เพื่อดึงไฮเปอร์ลิงก์ทั้งหมดออกจากหน้าเว็บ

โค้ดมีดังนี้:

/*
 * หากต้องการเปลี่ยนเทมเพลตนี้ ให้เลือก Tools | Templates
 * แล้วเปิดเทมเพลตในตัวแก้ไข
 */

package test;

import java.util.HashMap;
import java.util.Map;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;

public class getLinkTest {
    public static void main(String[] args) {
        try {
            // กรองแท็ก <a> ผ่านตัวกรอง
            Parser parser = new Parser("http://www.vevb.com");
            NodeList nodeList = parser.extractAllNodesThatMatch(new NodeFilter() {
                // ใช้วิธีนี้เพื่อกรองแท็ก
                public boolean accept(Node node) {
                    if (node instanceof LinkTag) // tag
                        return true;
                    return false;
                }
            });
            // พิมพ์
            for (int i = 0; i < nodeList.size(); i++) {
                LinkTag n = (LinkTag) nodeList.elementAt(i);
                //System.out.print(n.getStringText() + "==>>");
                //System.out.println(n.extractLink());
                try {
                    if (n.extractLink().equals("http://www.vevb.com")) {
                        System.out.println(n.extractLink());
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

วิธีที่สามคือวิธีที่ฉันใช้ในตอนนี้

โค้ดมีดังนี้:

/*
 * หากต้องการเปลี่ยนเทมเพลตนี้ ให้เลือก Tools | Templates
 * แล้วเปิดเทมเพลตในตัวแก้ไข
 */

package exec;

import java.io.File;
import java.io.IOException;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.PrettyXmlSerializer;
import org.htmlcleaner.TagNode;

public class htmlclean {
    public void Cleanhtml(String htmlurl, String xmlurl) {
        try {
            long start = System.currentTimeMillis();
            HtmlCleaner cleaner = new HtmlCleaner();
            CleanerProperties props = cleaner.getProperties();
            props.setUseCdataForScriptAndStyle(true);
            props.setRecognizeUnicodeChars(true);
            props.setUseEmptyElementTags(true);
            props.setAdvancedXmlEscape(true);
            props.setTranslateSpecialEntities(true);
            props.setBooleanAttributeValues("empty");
            TagNode node = cleaner.clean(new File(htmlurl));
            System.out.println("VREME:" + (System.currentTimeMillis() - start));
            new PrettyXmlSerializer(props).writeXmlToFile(node, xmlurl);
            System.out.println("VREME:" + (System.currentTimeMillis() - start));
        } catch (IOException e) {
            e.printStackTrace();