Detailed explanation of crawling web page pictures with Java: crawling website pictures using regular expressions

Author：Eve Cole Update Time：2025-05-20 10:32:01

Use Java to grab all images on the web page:

Use two regular expressions:

1. The regular expression matching the img tag in HTML: <img.*src=(.*?)[^>]*?>

2. The regular expression matching the http path in the src attribute of the img tag (written as a Java string literal): http:\"?(.*?)(\"|>|\\s+)

Implementation:

 package org.swinglife.main;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Crawls a web page and downloads the images referenced by its {@code <img>} tags.
 *
 * <p>The work is done in four steps: fetch the page text, extract the {@code <img>} tags with
 * {@link #IMGURL_REG}, extract the {@code http:} addresses from those tags with
 * {@link #IMGSRC_REG}, and save each address to a file in the current working directory.
 *
 * @author swinglife
 */
public class CatchImage {

    /** Address of the page to crawl. */
    private static final String PAGE_URL = "http://www.csdn.net";

    /** Character encoding used to decode the downloaded page. */
    private static final Charset ENCODING = StandardCharsets.UTF_8;

    /**
     * Regular expression matching an img tag.
     *
     * <p>NOTE: {@code .*} is greedy and {@code .} does not match line terminators, so several img
     * tags on one line are returned as a single match and a tag whose {@code src} is on a later
     * line than {@code <img} is not matched.
     */
    private static final String IMGURL_REG = "<img.*src=(.*?)[^>]*?>";

    /**
     * Regular expression matching an http address inside an img tag. Group 2 is the delimiter
     * that ends the address: a double quote, {@code >}, or whitespace. Only {@code http:}
     * addresses match; {@code https:} addresses do not.
     */
    private static final String IMGSRC_REG = "http:\"?(.*?)(\"|>|\\s+)";

    // Compiled once; Pattern instances are immutable and reusable.
    private static final Pattern IMG_TAG_PATTERN = Pattern.compile(IMGURL_REG);
    private static final Pattern IMG_SRC_PATTERN = Pattern.compile(IMGSRC_REG);

    /** Size in bytes of the buffer used when copying streams. */
    private static final int BUFFER_SIZE = 1024;

    /**
     * Fetches the configured page and downloads the images found in it.
     *
     * @param args unused
     * @throws Exception if the page itself cannot be fetched
     */
    public static void main(String[] args) throws Exception {
        CatchImage cm = new CatchImage();
        // Get html text content
        String html = cm.getHTML(PAGE_URL);
        // Get the image tags
        List<String> imgUrl = cm.getImageUrl(html);
        // Get the image src addresses
        List<String> imgSrc = cm.getImageSrc(imgUrl);
        // Download the images
        cm.download(imgSrc);
    }

    /**
     * Fetches the content at {@code url} and decodes it as UTF-8 text.
     *
     * @param url address of the page to fetch
     * @return the decoded page content
     * @throws IOException if the address is malformed or the content cannot be read
     */
    private String getHTML(String url) throws IOException {
        URLConnection connection = new URL(url).openConnection();
        try (InputStream in = connection.getInputStream()) {
            // Collect all bytes first and decode once: decoding each buffer separately would
            // break multi-byte characters that straddle a buffer boundary.
            ByteArrayOutputStream page = new ByteArrayOutputStream();
            copy(in, page);
            return new String(page.toByteArray(), ENCODING);
        }
    }

    /**
     * Extracts the text of every match of {@link #IMGURL_REG} in the given HTML.
     *
     * @param html page content to search
     * @return the matched img tag texts, in order of appearance; empty if there are none
     */
    private List<String> getImageUrl(String html) {
        List<String> listImgUrl = new ArrayList<>();
        Matcher matcher = IMG_TAG_PATTERN.matcher(html);
        while (matcher.find()) {
            listImgUrl.add(matcher.group());
        }
        return listImgUrl;
    }

    /**
     * Extracts every http address found in the given img tag texts.
     *
     * @param listImageUrl img tag texts, as returned by {@link #getImageUrl(String)}
     * @return the addresses without their trailing delimiter, in order of appearance
     */
    private List<String> getImageSrc(List<String> listImageUrl) {
        List<String> listImgSrc = new ArrayList<>();
        for (String image : listImageUrl) {
            Matcher matcher = IMG_SRC_PATTERN.matcher(image);
            while (matcher.find()) {
                // Cut at the start of the delimiter group so that the whole delimiter is
                // dropped, including a run of several whitespace characters.
                listImgSrc.add(image.substring(matcher.start(), matcher.start(2)));
            }
        }
        return listImgSrc;
    }

    /**
     * Downloads each address into a file in the current working directory, named after the last
     * path segment of the address. A failure is reported and does not stop the remaining
     * downloads.
     *
     * @param listImgSrc addresses to download
     */
    private void download(List<String> listImgSrc) {
        for (String url : listImgSrc) {
            String imageName = fileNameOf(url);
            if (imageName.isEmpty()) {
                System.out.println("Download Failed: no file name in " + url);
                continue;
            }
            System.out.println("Start download:" + url);
            try (InputStream in = new URL(url).openStream();
                    OutputStream out = new FileOutputStream(new File(imageName))) {
                copy(in, out);
            } catch (IOException e) {
                System.out.println("Download Failed: " + url + " (" + e + ")");
                continue;
            }
            // Reported only after both streams were closed without error.
            System.out.println(imageName + "Download Complete");
        }
    }

    /** Returns the last path segment of {@code url}, ignoring any query string or fragment. */
    private static String fileNameOf(String url) {
        int end = url.length();
        int query = url.indexOf('?');
        if (query >= 0) {
            end = query;
        }
        int fragment = url.indexOf('#');
        if (fragment >= 0 && fragment < end) {
            end = fragment;
        }
        String path = url.substring(0, end);
        return path.substring(path.lastIndexOf('/') + 1);
    }

    /** Copies all remaining bytes of {@code in} to {@code out}; closes neither stream. */
    private static void copy(InputStream in, OutputStream out) throws IOException {
        byte[] buf = new byte[BUFFER_SIZE];
        int length;
        while ((length = in.read(buf, 0, buf.length)) != -1) {
            out.write(buf, 0, length);
        }
    }
}

The above is all the content of this article. I hope it will be helpful to everyone's learning and I hope everyone will support Wulin.com more.