Extração precisa, em Java, da data de publicação de páginas da web

Autor: Eve Cole | Data da última atualização: 2025-04-02 20:32:01
O código abaixo extrai a data de publicação de páginas da web, nos vários formatos em que ela pode aparecer, e a devolve no formato normalizado "aaaa-MM-dd HH:mm:ss". O objetivo é obter a maior precisão possível; porém, como os formatos de data de publicação usados na web são muito variados, não é possível garantir 100% de acerto.
 pacote whu.extract.pubtime.core; importar java.util.arraylist; importar java.util.calendar; importar java.util.Collections; importar java.util.list; importar java.util.regex.matcher; importar java.util.regex.pattern; importar whu.utils.TimeUtil; /** * Created On March 13, 2014 at 2:49:05 pm * @description Get the publishing time of the webpage*/public class FetchPubTime { /** Represents 8 consecutive dates in the url, for example http://www.baidu.com/20140311/2356.html */ private static String url_reg_whole= "([-|/| _] {1} 20 // d {6})"; /** Representa uma data separada por-ou/com um ano, mês e data, por exemplo http://www.baidu.com/2014-3-11/2356.html*/string estática privada url_reg_sep_ymd = "([-|/| _] {1} 20 // d {2} [-|/| _] {1} // d {1,2})"; /** indica que as datas separadas por - ou/, apenas ano e mês, por exemplo, http://www.baidu.com/2014-3/2356.html*/private static string url_reg_sep_ym = "([-|/| _] {1} 20 // d {2} [-|/| _] {1} // d {1,2})"; Corrente do calendário estático privado = calendário.getInstance (); / ** Expressão regular de tempo formatado correto*/ string estática privada RighttimeReg = "^((// d {2} (([02468] [048]) | ([13579] [26]) [//-//// s]? (((0? [13578]) | (1 [02])) [//-//// s]? ((0? [1-9]) | ([1-2] [0-9]) | (3 [01])) | ((0? [469]) | ( 11)) [//-//// s]? ((0? [1-9]) | ([1-2] [0-9]) | (30))) | (0? 2 [//-/// s]? ((0? [ 1-9]) | ([1-2] [0-9])))))) | (// d {2} (([02468] [1235679]) | ([13579] [01345789 ])) [//-//// s]? (((((0? [13578]) | (1 [02]) [//-///// s]? ((0? [1-9]) | ([1-2] [0-9]) | (3 [01]))) | ((0? [469]) | (11)) [//-///// s]? ((0? [1-9]) | ([1-2] [0-9]] ) | (30)) | (0? / ** * @param url * @param urlContent * @return */ public static string getpubtimeVarious (string url, string urlcontent) {string pubtime = getpubtimeFromurl (url); // não está no link, correspondendo if (pubTime == null) {if (urlContent! = Null &&! Urlcontent.trim (). 
Equals ("")) return extraCagedate (urlContent); } retornar PubTime; } / ** Extraia o tempo de publicação do URL e retorne aaaaa-mm-dd hh: mm: string formatada por ss * @param url * @return * / public static string getpubtimefromurl (string url) {padrony p_whole = padrony.compile (url_reg_whole); Matcher m_whole = p_whole.matcher (url); if (m_whole.find (0) && m_whole.groupCount ()> 0) {string time = m_whole.group (0); time = time.substring (1, time.length ()); // Cada etapa não pode exceder o horário atual se (current.compareto (timeutil.strtocalendar (tempo, "yyyymmdd"))> = 0) {retorno time.substring (0,4)+"-"+time.substring (4,6)+-"+time.Substring (6,8)+" "" }} p_whole = null; m_whole = null; Padrão p_sep = padrony.compile (url_reg_sep_ymd); Matcher m_sep = p_sep.matcher (url); if (m_sep.find (0) && m_sep.groupCount ()> 0) {string time = m_sep.group (0); time = time.substring (1, time.length ()); String [] seg = time.split ("[-|/| _] {1}"); Calendário tetime = calendar.getInstance (); thetime.set (calendário.year, Integer.parseint (seg [0])); tetime.set (calendário.month, inteiro.parseint (seg [1])); thEtime.set (calendar.day_of_month, Integer.parseint (seg [2])); if (current. }} p_sep = null; m_sep = null; Padrão p_sep_ym = padrony.compile (url_reg_sep_ym); Matcher m_sep_ym = p_sep_ym.matcher (url); if (m_sep_ym.find (0) && m_sep_ym.groupCount ()> 0) {string time = m_sep_ym.group (0); time = time.substring (1, time.length ()); Calendário tetime = calendar.getInstance (); String [] seg = time.split ("[-|/| _] {1}"); thetime.set (calendário.year, Integer.parseint (seg [0])); tetime.set (calendário.month, inteiro.parseint (seg [1])); tetime.set (calendar.day_of_month, 1); if (current. Formulário de 2013-12-19 15:48:33 ou 2013-12-19 ou 2012/3/05* @Param Text String a ser extraído* @return Data de retorno* @author: oschina* @createTime: 21 de janeiro, 2013*/public static stracgate (text) {boolean. 
Lista corresponde = NULL; Dia) ", Pattern.Case_insensitive | Pattern.multiline); // Se for apenas o ano, mês, dia, dia, hora, minuto, segundo, segundo, siga o seguinte padrão p = padrão.compile (" (20 // d {2} [-/] // d {1,2} [/] Segundo) ", Pattern.Case_insensitive | Pattern.Multiline); // Matcher Matcher = P.Matcher (DATEST); Matcher Matche_Detail = p_detail.matcher (DATESTR); if (! (Matcher_Detail.find (0) && Matcher_Detail.GroupCount ()> = 1)) {Matcher_Detail = P.Matcher (DATEST); contenhashms = true; } else Matcher_Detail = p_detail.matcher (DATEST); if (matcher_detail.find () && Matcher_detail.groupCount ()> = 1) {Matches = new ArrayList (); para (int i = 1; i <= matcher_detail.groupCount (); i ++) {string temp = matcher_detail.group (i); matches.add (temp); }} else {Matches = collection.Empty_List; } if (matches.size ()> 0) {for (int i = 0; i <matches.size (); i ++) {string pubtime = matches.get (i) .toString (). TRIM (); // Retire o primeiro valor PubTime = PubTime.Replace ("/", "-"). Substituir ("ano", "-"). Substituir ("mês", "-"). Substituir ("dia", "-"); if (current. if (PubTime.Matches (RightTimeReg)) {retorna PubTime; }}}} else {return null; }} catch (Exceção e) {return null; } retornar nulo; }}
Isso é tudo neste artigo; espero que seja útil para quem está aprendendo a programar em Java.