Exemple d'expression régulière Java pour extraire toutes les URL et le texte des liens d'une page Web

Auteur : Eve Cole — Date de mise à jour : 2025-02-24 12:00:03

Le code est le suivant :

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.*;

/**
 * Downloads a web page, isolates the region between two marker strings and
 * prints every hyperlink (anchor markup, link text and target URL) found in
 * that region using regular expressions.
 *
 * <p>Typical use: construct with the begin/end markers, call
 * {@link #getStartUrl(String)}, {@link #getUrlContent()},
 * {@link #getContentArea()} and finally {@link #urls()}.
 *
 * <p>Despite their {@code get} prefix, {@code getStartUrl},
 * {@code getStringInUrl} and {@code getStringNotInUrl} store a value, while
 * {@code getUrlContent} and {@code getContentArea} compute one into a field.
 * The names are kept for compatibility with existing callers.
 */
public class Urls {

    /** One anchor element: group 1 = attribute text, group 2 = inner markup. */
    private static final Pattern ANCHOR_PATTERN = Pattern.compile(
            "<a\\b([^>]*)>(.*?)</a>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /** The href attribute value: double-quoted, single-quoted or unquoted. */
    private static final Pattern HREF_PATTERN = Pattern.compile(
            "(?:^|\\s)href\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s>]+))",
            Pattern.CASE_INSENSITIVE);

    /** Any markup tag; used to reduce the inner markup of a link to plain text. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]*>");

    private String startUrl;
    String urlContent;
    String contentArea;
    private String strAreaBegin, strAreaEnd;
    private String stringInUrl, stringNotInUrl;
    String strContent; // the collected content
    String[] allUrls;
    private String regex;
    UrlAndTitle urlAndTitle = new UrlAndTitle();

    /**
     * Demo entry point: fetches a fixed page and prints the links found in its body.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // The markers must be literal HTML ("<body" ... "/body>") to match a page.
        Urls myUrl = new Urls("<body", "/body>");
        myUrl.getStartUrl("http://www.zuzwn.com/");
        myUrl.getUrlContent();
        myUrl.getContentArea();
        myUrl.getStartUrl("http://www.zuzwn.com/");
        myUrl.getStringNotInUrl("Google");
        myUrl.urls();
    }

    /**
     * Creates an extractor limited to the text between two markers.
     *
     * @param strAreaBegin text that precedes the region of interest
     * @param strAreaEnd   text that follows the region of interest
     */
    public Urls(String strAreaBegin, String strAreaEnd) {
        this.strAreaBegin = strAreaBegin;
        this.strAreaEnd = strAreaEnd;
    }

    /**
     * Prints every anchor found in {@code contentArea}: the raw anchor markup,
     * its link text with nested tags removed, its href value without quotes or
     * other attributes (omitted when the anchor has no href), then a blank
     * line. Ends with a line giving the number of anchors found. A
     * {@code null} content area is treated as empty.
     */
    public void urls() {
        String area = (contentArea == null) ? "" : contentArea;
        int count = 0;
        Matcher anchor = ANCHOR_PATTERN.matcher(area);
        while (anchor.find()) {
            System.out.println(anchor.group());
            count++;

            // Link text: strip nested markup such as <b>...</b>.
            String title = TAG_PATTERN.matcher(anchor.group(2)).replaceAll("").trim();
            System.out.println("Title:" + title);

            String href = extractHref(anchor.group(1));
            if (href != null) {
                System.out.println("Site Web:" + href);
            }
            System.out.println();
        }
        // Accented letters are written as Unicode escapes so the source
        // compiles under any javac encoding; the printed text is unchanged.
        System.out.println("Il y a des totaux" + count + "R\u00e9sultats conformes");
    }

    /** Returns the href value found in an anchor's attribute text, or null if absent. */
    private static String extractHref(String attributes) {
        Matcher href = HREF_PATTERN.matcher(attributes);
        if (!href.find()) {
            return null;
        }
        if (href.group(1) != null) {
            return href.group(1);
        }
        if (href.group(2) != null) {
            return href.group(2);
        }
        return href.group(3);
    }

    /**
     * Stores the address of the page to collect from.
     *
     * @param startUrl absolute URL of the page
     */
    public void getStartUrl(String startUrl) {
        this.startUrl = startUrl;
    }

    /**
     * Downloads the page at the stored start URL into {@code urlContent}.
     * Lines are concatenated without separators and decoded with the platform
     * default charset. Best effort: on any failure a message and the stack
     * trace are printed and {@code urlContent} is left unchanged.
     */
    public void getUrlContent() {
        StringBuilder page = new StringBuilder();
        try {
            URL myUrl = new URL(startUrl);
            // try-with-resources guarantees the stream is closed.
            try (BufferedReader br = new BufferedReader(
                    new InputStreamReader(myUrl.openStream()))) {
                String line;
                while ((line = br.readLine()) != null) {
                    page.append(line);
                }
            }
            urlContent = page.toString();
        } catch (Exception e) {
            System.out.println("Fichier URL \u00c9chec de la sortie");
            e.printStackTrace();
        }
    }

    /**
     * Sets {@code contentArea} to the part of {@code urlContent} that lies
     * after the begin marker and before the next end marker. The area is the
     * empty string when there is no content or either marker is missing.
     */
    public void getContentArea() {
        contentArea = "";
        if (urlContent == null || strAreaBegin == null || strAreaEnd == null) {
            return;
        }
        int beginIndex = urlContent.indexOf(strAreaBegin);
        if (beginIndex < 0) {
            return;
        }
        int from = beginIndex + strAreaBegin.length();
        int to = urlContent.indexOf(strAreaEnd, from);
        if (to < 0) {
            return;
        }
        contentArea = urlContent.substring(from, to);
    }

    // The next two methods record a keyword that collected URLs must contain
    // and one they must not contain. This is only a first experiment; a later
    // version is expected to support several keywords of each kind.

    /**
     * Stores the keyword that a URL must contain.
     *
     * @param stringInUrl required keyword
     */
    public void getStringInUrl(String stringInUrl) {
        this.stringInUrl = stringInUrl;
    }

    /**
     * Stores the keyword that a URL must not contain.
     *
     * @param stringNotInUrl forbidden keyword
     */
    public void getStringNotInUrl(String stringNotInUrl) {
        this.stringNotInUrl = stringNotInUrl;
    }

    /** Placeholder for URL collection; intentionally empty. */
    public void getUrl() {
    }

    /**
     * Returns the stored collection rule.
     *
     * @return the regular expression field; nothing in this class assigns it
     */
    public String getRegex() {
        return regex;
    }

    /** Simple holder pairing a link target with its title. */
    class UrlAndTitle {
        String myUrl;
        String title;
    }
}