Some people regard the crawler as a treasure. So far, some people are selling TND for money. I strongly BS these guys for what they are! Maybe the stuff below is a bit lame.
The one below does not have the function of writing to the library. We have reached this step. The function of entering the library is very simple. Please complete it yourself if necessary. You can improve other functions by yourself! Copy the code and run it directly to see the effect
Dim Url,List_PageCode,Array_ArticleID,i,ArticleID
Dim Content_PageCode,Content_TempCode
Dim Content_CategoryID,Content_CategoryName,BorderID,ClassID,BorderName,ClassName
Dim ArticleTitle,ArticleAuthor,ArticleFrom,ArticleContent
Url = http://www.webasp.net/article/class/1.htm
List_PageCode = getHTTPPage(Url)
List_PageCode = RegExpText(List_PageCode, print</th></tr>,</table><table border=0 cellpadding=5,0)
List_PageCode = RegExpText(List_PageCode,<td align=left><a href='../,'><img border=0 src='../images/authortype0.gif',1)'Get the articles of the current list page Links, separated by
Array_ArticleID = Split(List_PageCode,,)'Create an array to store article IDs
For i=0 To Ubound(Array_ArticleID)-1
ArticleID = Array_ArticleID(i)'Article ID
Content_PageCode = getHTTPPage(http://www.webasp.net/article/&ArticleID) 'Get the content of the article page
'==========Get the article category and related ID parameters to start========================
Content_TempCode = RegExpText(Content_PageCode,<a href=/article/>Technical Tutorial</a> >> ,>> Content</td>,0)
Content_CategoryID = RegExpText(Content_PageCode,<a href='../class,/'>,1)
BorderID = Split(Content_CategoryID,,)(0)' Category ID
ClassID = Split(Content_CategoryID,,)(1)'Subclass ID
'==========Check whether the major category existsStart================
'If it does not exist, store it in the database
'==========Check whether the major category existsEnd================
'Response.Write(BorderID & , & ClassID & <br />)
Content_CategoryName = RegExpText(Content_PageCode,/'>,</a>,1)
BorderName = Split(Content_CategoryName,,)(0)'Category name
ClassName = Split(Content_CategoryName,,)(1)'Subclass name
'==========Check whether the subclass existsStart================
'If it does not exist, store it in the database
'==========Check if subclass exists end================
'==========Getting the article classification and related ID parameters ends========================
'==========Get the title and content of the article and start==============================
ArticleTitle = RegExpText(Content_PageCode,<tr><td align=center bgcolor=#DEE2F5><strong>,</strong></td></tr>,0)
ArticleAuthor = RegExpText(Content_PageCode,<tr><td><span class=blue>Author:</span>,</td></tr>,0)
ArticleFrom = RegExpText(Content_PageCode,<tr><td><span class=blue>Source:</span>,</td></tr>,0)
ArticleContent = RegExpText(Content_PageCode,<tr><td class=content style=WORD-WRAP: break-word id=zoom>,</td></tr>&VBCrlf& </table>&VBCrlf& </td></tr> </table>,0)
'==========Get the article title and content end==============================
Response.Write(ArticleTitle& <br /><br />)
Response.Flush()
Next
Attached are a few functions:
Function getHTTPPage(url)
IF(IsObjInstalled(Microsoft.XMLHTTP) = False)THEN
Response.Write <br><br>The server does not support the Microsoft.XMLHTTP component
Err.Clear
Response.End
END IF
On Error Resume Next
Dim http
SET http=Server.CreateObject(Msxml2.XMLHTTP)
Http.open GET,url,False
Http.send()
IF(Http.readystate<>4)THEN
Exit Function
END IF
getHTTPPage=BytesToBSTR(Http.responseBody,GB2312)
SET http=NOTHING
IF(Err.number<>0)THEN
Response.Write <br><br>Error getting file content
'Response.End
Err.Clear
END IF
End Function
Function BytesToBstr(CodeBody,CodeSet)
Dim objStream
SET objStream = Server.CreateObject(adodb.stream)
objStream.Type = 1
objStream.Mode =3
objStream.Open
objStream.Write CodeBody
objStream.Position = 0
objStream.Type = 2
objStream.Charset = CodeSet
BytesToBstr = objStream.ReadText
objStream.Close
SET objStream = NOTHING
End Function
'================================================
'Function: Check whether the component has been installed
'Return value: True ---- Already installed
' False ---- not installed
'================================================
Function IsObjInstalled(objName)
On Error Resume Next
IsObjInstalled = False
Err = 0
Dim testObj
SET testObj = Server.CreateObject(objName)
IF(0 = Err)THEN IsObjInstalled = True
SET testObj = NOTHING
Err = 0
End Function
Function RegExpText(strng,strStart,strEnd,n)
Dim regEx,Match,Matches,RetStr
SET regEx = New RegExp
regEx.Pattern = strStart&([/s/S]*?)&strEnd
regEx.IgnoreCase = True
regEx.Global = True
SET Matches = regEx.Execute(strng)
For Each Match in Matches
IF(n=1)THEN
RetStr = RetStr & regEx.Replace(Match.Value,$1) & ,
ELSE
RetStr = RetStr & regEx.Replace(Match.Value,$1)
END IF
Next
RegExpText = RetStr
SET regEx=NOTHING
End Function