Recently I have often needed to simulate web page submissions, retrieve the returned page source, and then extract particular elements from that page, so I frequently have to parse all kinds of HTML elements. The Internet is a wonderful thing: after some searching I found several Delphi HtmlParser classes. I tried a few of these libraries, but their parsing was incomplete and each of them had problems of one kind or another. It then occurred to me that when a browser control is present on the form, web page elements can be manipulated very conveniently through the Document interface of WebBrowser. When simulating a web page submission, however, a WebBrowser does not necessarily appear in the user interface, so there has to be a way to parse HTML directly without using WebBrowser. In other words, I do not want the WebBrowser shell; the Document interface object inside it is all that is needed to parse HTML. After searching MSDN and then Google, I found that this works, and the object is constructed as follows:
//Create IHTMLDocument2 interface
CoCreateInstance(CLASS_HTMLDocument, nil, CLSCTX_INPROC_SERVER, IID_IHTMLDocument2, FHtmlDoc);
Once the interface has been created, the elements of the document can be parsed, which is very convenient!
Adding some operations of my own, I wrapped several web page elements such as Combobox, Table, and Frame, and implemented an HTMLParser. The general outline of the code is as follows:
Only the declarations are given here; please download the complete code at the end of the article.
Code
(************************************************* *****)
(*Dexian Studio*)
(*Web page element operation class library*)
(**)
(*DxHtmlElement Unit*)
(*Copyright(c) 2008-2010 No idle*)
(*email:[email protected]:75492895*)
(************************************************* *****)
unit DxHtmlElement;
interface
uses Windows, sysUtils, Clipbrd, MSHTML, ActiveX, OleCtrls, Graphics, TypInfo;
{ Element-type predicates.
  Each function takes a single IHTMLElement and returns a Boolean. Only the
  declarations appear in this listing; judging by the names, each one reports
  whether the element is of the kind named (select, password input, text
  input, table, ...) - confirm against the implementation. }
function IsSelectElement(eleElement: IHTMLElement): Boolean;
function IsPwdElement(eleElement: IHTMLElement): Boolean;
function IsTextElement(element: IHTMLElement): boolean;
function IsTableElement(element: IHTMLElement): Boolean;
function IsElementCollection(element: IHTMLElement): Boolean;
function IsChkElement(element: IHTMLElement): boolean;
function IsRadioBtnElement(element: IHTMLElement): boolean;
function IsMemoElement(element: IHTMLElement): boolean;
function IsFormElement(element: IHTMLElement): boolean;
function IsIMGElement(element: IHTMLElement): boolean;
function IsInIMGElement(element: IHTMLElement): boolean;
function IsLabelElement(element: IHTMLElement): boolean;
function IsLinkElement(element: IHTMLElement): boolean;
function IsListElement(element: IHTMLElement): boolean;
function IsControlElement(element: IHTMLElement): boolean;
function IsObjectElement(element: IHTMLElement): boolean;
function IsFrameElement(element: IHTMLElement): boolean;
function IsInPutBtnElement(element: IHTMLElement): boolean;
function IsInHiddenElement(element: IHTMLElement): boolean;
function IsSubmitElement(element: IHTMLElement): boolean;
{ Image helpers (declarations only; the implementations are not part of this
  listing).
  GetPicIndex   - takes a document plus Src/Alt strings and returns an Integer;
                  presumably the index of the matching image - confirm.
  GetPicElement - takes a document plus imgName/src/Alt strings and returns an
                  IHTMLImgElement.
  GetRegCodePic - three overloads returning a TPicture, selecting the image by
                  name/Src/Alt, by Integer index, or by an IHTMLIMGElement.
  NOTE(review): ownership of the returned TPicture is not visible here;
  confirm who is expected to free it. }
function GetPicIndex(doc: IHTMLDocument2; Src: string; Alt: string): Integer;
function GetPicElement(doc: IHTMLDocument2;imgName: string;src: string;Alt: string): IHTMLImgElement;
function GetRegCodePic(doc: IHTMLDocument2;ImgName: string; Src: string; Alt: string): TPicture; overload;
function GetRegCodePic(doc: IHTMLDocument2;Index: integer): TPicture; overload;
function GetRegCodePic(doc: IHTMLDocument2;element: IHTMLIMGElement): TPicture;overload;
type
// stdcall function-pointer type taking an LRESULT, an interface IID and a
// WPARAM, and returning an HRESULT with the object in the untyped out
// parameter. NOTE(review): the signature looks like the ObjectFromLresult
// export of oleacc.dll, presumably loaded dynamically - confirm.
TObjectFromLResult = function(LRESULT: lResult;const IID: TIID; WPARAM: wParam;out pObject): HRESULT; stdcall;
// Kinds of HTML element distinguished by this unit. ELE_UNKNOW is the first
// (ordinal 0) member; it is presumably the fallback for unrecognised
// elements - confirm against GetElementType.
TEleMentType = (ELE_UNKNOW,ELE_TEXT,ELE_PWD,ELE_SELECT,ELE_CHECKBOX,ELE_RADIOBTN,ELE_MEMO,ELE_FORM,ELE_IMAGE,
ELE_LABEL,ELE_LINK,ELE_LIST,ELE_CONTROL,ELE_OBJECT,ELE_FRAME,ELE_INPUTBTN,ELE_INIMAGE,ELE_INHIDDEN);
{ Element classification and HTML table helpers (declarations only; the
  implementations are not part of this listing).
  GetElementType / GetElementTypeName take an element and return its
  TEleMentType, respectively a string.
  The table helpers address a table by index within a document and a cell by
  row and column; note the parameter order aRow, aCol.
  The GetWebBrowser... variants return a Boolean and deliver their result
  through the var parameter ResValue; presumably the Boolean signals
  success - confirm. }
function GetElementType(element: IHTMLELEMENT): TEleMentType;
function GetElementTypeName(element: IHTMLELEMENT): string;
function GetHtmlTableCell(aTable: IHTMLTable; aRow,aCol: Integer): IHTMLElement;
function GetHtmlTable(aDoc: IHTMLDocument2; aIndex: Integer): IHTMLTable;
function GetWebBrowserHtmlTableCellText(Doc: IHTMLDocument2;
const TableIndex, RowIndex, ColIndex: Integer;var ResValue: string):Boolean;
function GetHtmlTableRowHtml(aTable: IHTMLTable; aRow: Integer): IHTMLElement;
function GetWebBrowserHtmlTableCellHtml(Doc: IHTMLDocument2;
const TableIndex,RowIndex,ColIndex: Integer;var ResValue: string):Boolean;
// NOTE(review): the name reads "GeHtmlTableHtml" (no "t" in "Get"). It is a
// public identifier, so it is left unchanged here; renaming it would break
// existing callers.
function GeHtmlTableHtml(aTable: IHTMLTable; aRow: Integer): IHTMLElement;
function GetWebBrowserHtmlTableHtml(Doc: IHTMLDocument2;
const TableIndex,RowIndex: Integer;var ResValue: string):Boolean;
type
// Forward declarations: TDxWebFrame refers to both collection classes before
// they are fully declared further down in this type section.
TDxWebFrameCollection = class;
TDxWebElementCollection = class;
// Load state reported by TDxWebFrame.ReadyState.
TLoadState = (Doc_Loading,Doc_Completed,Doc_Invalidate);
// Wrapper around one frame window (IHTMLWINDOW2). Every property except Frame
// is read-only and served by a getter; the getters' implementations are not
// part of this listing.
TDxWebFrame = class
Private
// The wrapped frame window; exposed read/write through the Frame property.
FFrame: IHTMLWINDOW2;
// NOTE(review): presumably cached helper objects returned by the
// ElementCollections and Frames properties and owned by this instance -
// confirm in the implementation.
FElementCollections: TDxWebElementCollection;
FWebFrameCollections: TDxWebFrameCollection;
function GetSrc: string;
function GetElementCount: integer;
function GetWebFrameCollections: TDxWebFrameCollection;
function GetElementCollections: TDxWebElementCollection;
function GetDocument: IHTMLDOCUMENT2;
function GetReadState: TLoadState;
function GetIsLoaded: boolean;
procedure SetFrame(const Value: IHTMLWINDOW2);
function GetName: string;
public
// Creates the wrapper for the given frame window.
Constructor Create(IFrame: IHTMLWINDOW2);
Destructor Destroy;override;
property Frame: IHTMLWINDOW2 read FFrame write SetFrame;
property Src: string read GetSrc;
property Document: IHTMLDOCUMENT2 read GetDocument;
property Name: string read GetName;
// Collection wrapper for frames; presumably the frames nested in this
// one - confirm.
property Frames: TDxWebFrameCollection read GetWebFrameCollections;
property ElementCount: integer read GetElementCount;
property ElementCollections: TDxWebElementCollection read GetElementCollections;
property ReadyState: TLoadState read GetReadState;
property IsLoaded: boolean read GetIsLoaded;
end;
// Wrapper around an IHTMLFramesCollection2. Frames can be looked up by index
// or by name, either as the raw IHTMLWINDOW2 interface or as a TDxWebFrame
// wrapper object.
TDxWebFrameCollection = Class
Private
FFrameCollection: IHTMLFramesCollection2;
// NOTE(review): a single TDxWebFrame field; presumably one shared wrapper
// instance that FrameByIndex / FrameByName re-point and return, so a result
// may be invalidated by the next lookup - confirm in the implementation.
Frame: TDxWebFrame;
function GetCount: integer;
function GetFrameInterfaceByIndex(index: integer): IHTMLWINDOW2;
function GetFrameInterfaceByName(Name: string): IHTMLWINDOW2;
function GetFrameByIndex(index: integer): TDxWebFrame;
function GetFrameByName(Name: string): TDxWebFrame;
procedure SetFrameCollection(const Value: IHTMLFramesCollection2);
public
// Creates the wrapper for the given frames collection.
Constructor Create(ACollection: IHTMLFramesCollection2);
Destructor Destroy;override;
property FrameCollection: IHTMLFramesCollection2 read FFrameCollection write SetFrameCollection;
property Count: integer read GetCount;
property FrameInterfaceByIndex[index: integer]: IHTMLWINDOW2 read GetFrameInterfaceByIndex;
property FrameInterfaceByName[Name: string]: IHTMLWINDOW2 read GetFrameInterfaceByName;
property FrameByIndex[index: integer]: TDxWebFrame read GetFrameByIndex;
property FrameByName[Name: string]: TDxWebFrame read GetFrameByName;
end;
// Wrapper around an IHTMLElementCollection. Elements can be fetched by name,
// by index, or by name plus index; the Collection property is read/write, so
// one wrapper can be re-pointed at another collection.
TDxWebElementCollection = class
Private
FCollection: IHTMLElementCollection;
// NOTE(review): presumably the wrapper instance handed out by
// ChildElementCollection and owned by this object - confirm in the
// implementation.
FChildCollection:TDxWebElementCollection;
function GetCollection(index: String): TDxWebElementCollection;
function GetCount: integer;
function GetElement(itemName: string; index: integer): IHTMLElement;
function GetElementByName(itemName: string): IHTMLELEMENT;
function GetElementByIndex(index: integer): IHTMLELEMENT;
procedure SetCollection(const Value: IHTMLElementCollection);
public
// Creates the wrapper for the given collection. TDxHtmlParser.Create passes
// nil here and assigns Collection later.
Constructor Create(ACollection: IHTMLElementCollection);
Destructor Destroy;override;
property Collection: IHTMLElementCollection read FCollection write SetCollection;
// Indexed by a string and returns another collection wrapper.
property ChildElementCollection[index: String]: TDxWebElementCollection read GetCollection;
property ElementCount: integer read GetCount;
property Element[itemName: string;index: integer]: IHTMLElement read GetElement;
property ElementByName[itemName: string]: IHTMLELEMENT read GetElementByName;
property ElementByIndex[index: integer]: IHTMLELEMENT read GetElementByIndex;
end;
// Descendant of TDxWebElementCollection that adds no members of its own;
// judging by the name it is intended for collections of link elements.
TLinkCollection = class(TDxWebElementCollection)
end;
// Forward declaration: TDxTableCollection refers to TDxWebTable, which is
// fully declared after it.
TDxWebTable = class;
// Access to the tables of a document (IHTMLDOCUMENT2). Tables can be looked
// up by name or by index, either as the raw IHTMLTABLE interface or as a
// TDxWebTable wrapper. Document is read/write, so the collection can be
// pointed at another document.
TDxTableCollection = class
Private
FTableCollection: IHTMLElementCollection;
FDocument: IHTMLDOCUMENT2;
// NOTE(review): a single TDxWebTable field; presumably one shared wrapper
// instance that TableByIndex / TableByName re-point and return, so a result
// may be invalidated by the next lookup - confirm in the implementation.
FWebTable: TDxWebTable;
function GetTableInterfaceByName(AName: string): IHTMLTABLE;
procedure SetDocument(Value: IHTMLDOCUMENT2);
function GetTableInterfaceByIndex(index: integer): IHTMLTABLE;
function GetCount: integer;
function GetTableByIndex(index: integer): TDxWebTable;
function GetTableByName(AName: string): TDxWebTable;
public
// Creates the collection for the given document.
Constructor Create(Doc: IHTMLDOCUMENT2);
destructor Destroy;override;
property TableInterfaceByName[AName: string]: IHTMLTABLE read GetTableInterfaceByName;
property TableInterfaceByIndex[index: integer]: IHTMLTABLE read GetTableInterfaceByIndex;
property TableByName[AName: string]: TDxWebTable read GetTableByName;
property TableByIndex[index: integer]: TDxWebTable read GetTableByIndex;
property Document: IHTMLDOCUMENT2 read FDocument write SetDocument;
property Count: integer read GetCount;
end;
// Wrapper around one HTML table (IHTMLTABLE). Exposes the row count, the
// column count of a given row, individual cells as a string or as an
// IHTMLTableCell, and the table's InnerHtml / InnerText.
// The cell properties take the column first: [ACol, ARow].
// The class declares no destructor of its own.
TDxWebTable = class
Private
FTableInterface: IHTMLTABLE;
function GetRowCount: integer;
procedure SetTableInterface(const Value: IHTMLTABLE);
function GetCell(ACol, ARow: integer): string;
function GetRowColCount(RowIndex: integer): integer;
function GetInnerHtml: string;
function GetInnerText: string;
function GetCellElement(ACol, ARow: Integer): IHTMLTableCell;
public
// Creates the wrapper for the given table interface.
Constructor Create(ATable: IHTMLTABLE);
property TableInterface: IHTMLTABLE read FTableInterface write SetTableInterface;
property RowCount: integer read GetRowCount;
property Cell[ACol: integer;ARow: integer]: string read GetCell;
property CellElement[ACol: Integer;ARow: Integer]: IHTMLTableCell read GetCellElement;
// Column count of the row with the given index.
property RowColCount[RowIndex: integer]: integer read GetRowColCount;
property InnerHtml: string read GetInnerHtml;
property InnerText: string read GetInnerText;
end;
// Combobox-style wrapper around an HTML select element (IHTMLSelectElement).
// CombInterface is read/write, so one wrapper object can be re-pointed at
// another select element; TDxHtmlParser does this with a single shared
// instance. The implementations of the members are not part of this listing.
TDxWebCombobox = class
Private
FHtmlSelect: IHTMLSelectElement;
function GetCount: Integer;
procedure SetItemIndex(const Value: Integer);
function GetItemIndex: Integer;
function GetName: string;
procedure SetName(const Value: string);
function GetValue: string;
procedure SetValue(const Value: string);
procedure SetCombInterface(const Value: IHTMLSelectElement);
function GetItemByName(EleName: string): string;
function GetItemByIndex(index: integer): string;
function GetItemAttribute(index: Integer; AttribName: string): OleVariant;
public
// Creates the wrapper for the given select element. TDxHtmlParser.Create
// passes nil here and assigns CombInterface later.
constructor Create(AWebCombo: IHTMLSelectElement);
// Add / Insert take the new item as an IHTMLElement; Remove takes an index.
procedure Add(Ele: IHTMLElement);
procedure Insert(Ele: IHTMLElement;Index: Integer);
procedure Remove(index: Integer);
property CombInterface: IHTMLSelectElement read FHtmlSelect write SetCombInterface;
property Count: Integer read GetCount;
property ItemIndex: Integer read GetItemIndex write SetItemIndex;
property ItemByIndex[index: integer]: string read GetItemByIndex;
property ItemByName[EleName: string]: string read GetItemByName;
// Named attribute of the item at the given index, returned as an OleVariant.
property ItemAttribute[index: Integer;AttribName: string]: OleVariant read GetItemAttribute;
property Name: string read GetName write SetName;
property value: string read GetValue write SetValue;
end;
Implementation
end.
Implementation unit of the HTMLParser parsing class:
Code
(************************************************* *****)
(*Dexian Studio*)
(*HTML parsing unit library*)
(**)
(*DxHtmlParser Unit*)
(*Copyright(c) 2008-2010 No idle*)
(*email:[email protected]:75492895*)
(************************************************* *****)
unit DxHtmlParser;
interface
uses Windows, MSHTML, ActiveX, ComObj, DxHtmlElement, Forms;
type
// HTML parser built on a stand-alone MSHTML document (IHTMLDocument2), with
// no WebBrowser control. Assign markup to the HTML property, then read the
// parsed result through WebTables, WebElements and WebCombobox.
TDxHtmlParser = class
Private
// The MSHTML document, created in the constructor.
FHtmlDoc: IHTMLDocument2;
// Last markup assigned through the HTML property.
FHTML: string;
// Helper objects created in the constructor and freed in the destructor.
FWebTables: TDxTableCollection;
FWebElements: TDxWebElementCollection;
// Single shared wrapper instance returned by the WebCombobox property.
FWebComb: TDxWebCombobox;
procedure SetHTML(const Value: string);
function GetWebCombobox(AName: string): TDxWebCombobox;
public
constructor Create;
destructor Destroy;override;
// Assigning a value that differs from the current one writes it into the
// document body's innerHTML and points WebElements at the document's "all"
// collection.
property HTML: string read FHTML write SetHTML;
property WebTables: TDxTableCollection read FWebTables;
property WebElements: TDxWebElementCollection read FWebElements;
// Looks up the element with the given name and returns the shared combobox
// wrapper pointed at it. Returns nil while WebElements has no collection,
// i.e. before HTML has been assigned. The parser owns the returned object;
// do not free it.
property WebCombobox[Name: string]: TDxWebCombobox read GetWebCombobox;
end;
Implementation
{ TDxHtmlParser }
{ Creates the parser: initializes COM for the calling thread, instantiates a
  stand-alone MSHTML document, switches it to design mode, waits until the
  document reports readyState 'complete', and then creates the helper objects
  behind WebTables, WebElements and WebCombobox.
  Raises EOleSysError (through OleCheck) when the document object cannot be
  created. }
constructor TDxHtmlParser.Create;
begin
  // Paired with the CoUninitialize call in Destroy. The result is not checked
  // here; Destroy calls CoUninitialize unconditionally.
  CoInitialize(nil);
  // Create the IHTMLDocument2 interface. The HRESULT is now checked: the
  // Assert below is compiled out when assertions are disabled, which used to
  // leave FHtmlDoc nil and fail with an access violation on the next line.
  OleCheck(CoCreateInstance(CLASS_HTMLDocument, nil, CLSCTX_INPROC_SERVER, IID_IHTMLDocument2, FHtmlDoc));
  Assert(FHtmlDoc<>nil, 'Build HTMLDocument interface failed');
  FHtmlDoc.Set_designMode('On'); // Set to design mode, no script execution
  // Pump the message queue until the document has finished loading.
  // NOTE(review): this loop has no timeout.
  while not (FHtmlDoc.readyState = 'complete') do
  begin
    Sleep(1);
    Application.ProcessMessages;
  end;
  FWebTables := TDxTableCollection.Create(FHtmlDoc);
  // These two start without an interface; SetHTML and GetWebCombobox assign
  // Collection / CombInterface later.
  FWebElements := TDxWebElementCollection.Create(nil);
  FWebComb := TDxWebCombobox.Create(nil);
end;
{ Frees the helper objects, releases the MSHTML document and then shuts COM
  down for the thread (paired with CoInitialize in Create). }
destructor TDxHtmlParser.Destroy;
begin
  FWebTables.Free;
  FWebElements.Free;
  FWebComb.Free;
  // Release the document explicitly BEFORE CoUninitialize. Delphi releases
  // interface fields only after the destructor body has finished, so without
  // this line the final Release on the MSHTML object would run after COM has
  // already been uninitialized for this thread.
  FHtmlDoc := nil;
  CoUninitialize;
  inherited;
end;
{ Getter for the WebCombobox property.
  Returns nil while FWebElements has no collection. Otherwise it looks up the
  element called AName, casts it to IHTMLSelectElement, points the shared
  FWebComb wrapper at it and returns that wrapper. }
function TDxHtmlParser.GetWebCombobox(AName: string): TDxWebCombobox;
var
  NamedElement: IHTMLElement;
begin
  Result := nil;
  // No collection yet: nothing to look up.
  if FWebElements.Collection = nil then
    Exit;
  NamedElement := FWebElements.ElementByName[AName];
  FWebComb.CombInterface := NamedElement as IHTMLSelectElement;
  Result := FWebComb;
end;
{ Setter for the HTML property.
  When Value differs from the stored text, it is stored, written into the
  document body's innerHTML, and FWebElements is pointed at the document's
  "all" collection. Assigning the same text again does nothing. }
procedure TDxHtmlParser.SetHTML(const Value: string);
begin
  // Unchanged text: skip the write to the document.
  if FHTML = Value then
    Exit;
  FHTML := Value;
  FHtmlDoc.body.innerHTML := Value;
  FWebElements.Collection := FHtmlDoc.all;
end;
end.