// Copy code - the code is as follows:
package com.letv.cloud.spider;

import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
öffentliche Klasse MoviePaperPagePageProcessor implementiert PageProcessor {
private Site Page = Site.Me (). SetRetryTimes (3) .SetsLeptime (1000);
öffentliche Seite GetSite () {
Rückgabeseite;
}
public void Process (Seite Seite) {
Liste <String> links = page.gethtml (). Links (). Regex (
"http://posters.aa.com/poster//d+") .All ();
links = remedUtuplicate (links);
page.addtargetRequests (Links);
page.putfield ("title", page.gethtml (). xpath (
"// div [@id = 'imdbleftsecc']/center/h1/text ()"). toString ());
page.putfield ("imgurl", page.gethtml (). xpath (
"// div [@id = 'imdbleftsecc']/center/img/@src"). toString ());
}
public static void main (String [] args) {
für (int i = 1; i <= 3; i ++) {
Spider.create (neuer moviepaperpagePageprocessor ()). Addurl (
"http://posters.aa.co/poster_page/" + i) .thread (5) .run ();
}
}
public statische Liste REMORDURDUCEPLICE (LISTLISTE) {
Hashset HS = New Hashset (Liste);
list.clear ();
list.addall (hs);
Rückgabeliste;
}
}