// Code listing follows (copy the code below):
package com.letv.cloud.spider;

import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
Public Class MoviePaperPageProcessor mengimplementasikan Pageprocessor {
halaman situs pribadi = situs.me (). setRetryTimes (3) .setsleeptime (1000);
situs publik getSite () {
halaman kembali;
}
Proses public void (halaman halaman) {
Daftar <String> links = page.getHtml (). Links (). Regex (
"http://posters.aa.com/poster//d+") .all ();
tautan = Removeduplicate (tautan);
page.addtargetRequests (tautan);
page.putfield ("title", page.gethtml (). xpath (
"// div [@id = 'imdbleftsecc']/center/h1/text ()"). ToString ());
page.putfield ("imgurl", page.gethtml (). xpath (
"// div [@id = 'imdbleftsecc']/center/img/@src"). ToString ());
}
public static void main (string [] args) {
untuk (int i = 1; i <= 3; i ++) {
Spider.create (new MoviePaperPageProcessor ()). Addurl (
"http://posters.aa.co/poster_page/" + i) .thread (5) .run ();
}
}
Daftar statis publik dihapusuplikat (daftar daftar) {
Hashset hs = hashset baru (daftar);
list.clear ();
list.addall (hs);
daftar pengembalian;
}
}