Java accurately extracts web page publishing time

Author：Eve Cole Update Time：2025-04-02 20:32:01
This code extracts the publishing time of a web page, which may appear in various formats, and expresses it in the standard "yyyy-MM-dd HH:mm:ss" format. It aims to be as accurate as possible, but because publishing times on the web are formatted so flexibly, it cannot be 100% correct.
 package whu.extract.pubtime.core; import java.util.ArrayList;import java.util.Calendar;import java.util.Collections;import java.util.List;import java.util.regex.Matcher;import java.util.regex.Pattern; import whu.utils.TimeUtil; /** * Created On March 13, 2014 at 2:49:05 pm * @description Get the publishing time of the webpage*/public class FetchPubTime { /** Represents 8 consecutive dates in the url, for example http://www.baidu.com/20140311/2356.html */ private static String url_reg_whole= "([-|/|_]{1}20//d{6})"; /** Represents a date separated by - or / with a year, month and date, for example http://www.baidu.com/2014-3-11/2356.html */ private static String url_reg_sep_ymd = "([-|/|_]{1}20//d{2}[-|/|_]{1}//d{1,2})"; /** Indicates that dates separated by - or /, only year and month, for example, http://www.baidu.com/2014-3/2356.html */ private static String url_reg_sep_ym = "([-|/|_]{1}20//d{2}[-|/|_]{1}//d{1,2})"; private static Calendar current = Calendar.getInstance(); /** Correct formatted time regular expression*/ private static String rightTimeReg = "^((//d{2}(([02468][048])|([13579][26]))[//-////s]?(((0?[13578])|(1[02]))[//-/////s]?((0?[1-9])|([1-2][0-9])|(3[01])))|((0?[469])|( 11))[//-////s]?((0?[1-9])|([1-2][0-9])|(30)))|(0?2[//-////s]?((0?[1-9])|([1-2][0-9])))))|(//d{2}(([02468][1235679])|([13579][01345789 ]))[//-/////s]?((((0?[13578])|(1[02]))[//-/////s]?((0?[1-9])|([1-2][0-9])|(3[01])))|((0?[469])|(11))[//-/////s]?((0?[1-9])|([1-2][0-9]] )|(30))|(0?2[//-////s]?((0?[1-9])|(1[0-9])|(2[0-8]))))))))))))))))?$"; /** * @param url * @param urlContent * @return */ public static String getPubTimeVarious(String url,String urlContent) { String pubTime = getPubTimeFromUrl(url); //Not in the link, matching if(pubTime == null) { if(urlContent!=null&&!urlContent.trim().equals("")) return extractPageDate(urlContent); } return pubTime; } /** Extract the publishing time from the url and return YYYY-MM-DD HH:mm:ss-formatted string * @param url * @return */ public 
static String getPubTimeFromUrl(String url) { Pattern p_whole = Pattern.compile(url_reg_whole); Matcher m_whole = p_whole.matcher(url); if(m_whole.find(0)&&m_whole.groupCount()>0) { String time = m_whole.group(0); time = time.substring(1, time.length()); //Each step cannot exceed the current time if(current.compareTo(TimeUtil.strToCalendar(time, "yyyyMMdd"))>=0) { return time.substring(0,4)+"-"+time.substring(4,6)+"-"+ time.substring(6,8)+"+"+"00:00:00"; } } p_whole = null; m_whole = null; Pattern p_sep = Pattern.compile(url_reg_sep_ymd); Matcher m_sep = p_sep.matcher(url); if(m_sep.find(0)&&m_sep.groupCount()>0) { String time = m_sep.group(0); time = time.substring(1,time.length()); String[] seg = time.split("[-|/|_]{1}"); Calendar theTime = Calendar.getInstance(); theTime.set(Calendar.YEAR,Integer.parseInt(seg[0])); theTime.set(Calendar.MONTH, Integer.parseInt(seg[1])); theTime.set(Calendar.DAY_OF_MONTH, Integer.parseInt(seg[2])); if(current.compareTo(theTime)>=0) { return seg[0]+"-"+seg[1]+"-"+seg[2]+" "+"00:00:00"; } } p_sep = null; m_sep = null; Pattern p_sep_ym = Pattern.compile(url_reg_sep_ym); Matcher m_sep_ym = p_sep_ym.matcher(url); if(m_sep_ym.find(0)&&m_sep_ym.groupCount()>0) { String time = m_sep_ym.group(0); time = time.substring(1,time.length()); Calendar theTime = Calendar.getInstance(); String[] seg = time.split("[-|/|_]{1}"); theTime.set(Calendar.YEAR,Integer.parseInt(seg[0])); theTime.set(Calendar.MONTH, Integer.parseInt(seg[1])); theTime.set(Calendar.DAY_OF_MONTH, 1); if(current.compareTo(theTime)>=0) { return seg[0]+"-"+seg[1]+"-"+"+"01"+" "+"00:00:00"; } } return null; } /** Extract the publication time from the webpage source code* Regular expression extracts the date implementation code in java* December 19, 2013 15:58:42 * Read the time in the form of 2013-12-19 15:48:33 or 2013-12-19 or 2012/3/05* @param text String to be extracted* @return Return date* @author: oschina * @Createtime: Jan 21, 2013 */ public static String 
extractPageDate(String text) { boolean containsHMS =false; String dateStr = text.replaceAll("r?n", " "); try { List matches = null; Pattern p_detail = Pattern.compile("(20//d{2}[-/]//d{1,2}[-/]//d{1,2} //d{1,2}://d{1,2}://d{1,2})|(20//d{1,2} month//d{1,2} day)", Pattern.CASE_INSENSITIVE|Pattern.MULTILINE); //If it is just the year, month, day, day, hour, minute, second, second, then follow the following Pattern p = Pattern.compile("(20//d{2}[-/]//d{1,2}[-/]//d{1,2})|(20//d{2} year, month, month, month, month, month, second)", Pattern.CASE_INSENSITIVE|Pattern.MULTILINE); //Matcher matcher = p.matcher(dateStr); Matcher matcher_detail = p_detail.matcher(dateStr); if(!(matcher_detail.find(0) && matcher_detail.groupCount() >= 1)) { matcher_detail = p.matcher(dateStr); containsHMS = true; }else matcher_detail = p_detail.matcher(dateStr); if (matcher_detail.find() && matcher_detail.groupCount() >= 1) { matches = new ArrayList(); for (int i = 1; i <= matcher_detail.groupCount(); i++) { String temp = matcher_detail.group(i); matches.add(temp); } } else { matches = Collections.EMPTY_LIST; } if (matches.size() > 0) { for(int i=0;i<matches.size();i++) { String pubTime = matches.get(i).toString().trim(); //Take out the first value pubTime = pubTime.replace("/", "-").replace("year", "-").replace("month", "-").replace("day", "-"); if(current.compareTo(TimeUtil.strToCalendar(pubTime, "yyyy-MM-dd"))>=0) { if(containsHMS) pubTime+=" "+"00:00:00"; if(pubTime.matches(rightTimeReg)) { return pubTime; } } } } else { return null; } } catch (Exception e) { return null; } return null; }}
That is all for this article; I hope it is helpful to everyone learning Java programming.