// Code listing follows:
package com.letv.cloud.spider;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
classe publique MoviePaperPageProcessor implémente PageProcessor {
Page du site privé = site.me (). SetRetryTimes (3) .SetSleEptime (1000);
Site public getSite () {
page de retour;
}
Processus de vide public (page de page) {
List <string> links = page.gethtml (). Links (). Regex (
"http://posters.aa.com/poster//d+") .all ();
links = supprime uplicate (liens);
page.AddTargetRequests (liens);
page.putfield ("title", page.gethtml (). xpath (
"// div [@ id = 'imdbleftSsecc'] / central / h1 / text ()"). toString ());
page.putfield ("imgurl", page.gethtml (). xpath (
"// div [@ id = 'imdbleftSsecc'] / central / img / @ src"). toString ());
}
public static void main (String [] args) {
pour (int i = 1; i <= 3; i ++) {
Spider.Create (nouveau filmPaperPageProcessor ()). AddUrl (
"http://posters.aa.co/poster_page/" + i) .thread (5) .run ();
}
}
Liste statique publique supprimée (liste de liste) {
HashSet HS = new HashSet (liste);
list.clear ();
list.addall (HS);
Liste de retour;
}
}