如何使用DIHtmlParser解析标签内的Name:&Value文本?我尝试使用Clever Components的TCLHtmlParser进行此操作,但失败了。第二个问题是DIHtmlParser可以解析单个标签,例如通过其子标签循环。对于这样一个简单的问题来说,这是一场噩梦。
<div class="tvRow tvFirst hasLabel tvFirst" title="example1">
<label class="tvLabel">Name:</label>
<span class="tvValue">Value</span>
<div class="clear"></div></div>
<div class="tvRow tvFirst hasLabel tvFirst" title="example2">
<label class="tvLabel">Name:</label>
<span class="tvValue">Value</span>
<div class="clear"></div></div>
您可以使用IHTMLDocument2
DOM来解析HTML中您需要的任何元素:
uses ActiveX, MSHTML;
const
HTML =
'<div class="tvRow tvFirst hasLabel tvFirst" title="example1">' +
'<label class="tvLabel">Name:</label>' +
'<span class="tvValue">Value</span>' +
'<div class="clear"></div>' +
'</div>';
procedure TForm1.Button1Click(Sender: TObject);
var
doc: OleVariant;
el: OleVariant;
i: Integer;
begin
doc := coHTMLDocument.Create as IHTMLDocument2;
doc.write(HTML);
doc.close;
ShowMessage(doc.body.innerHTML);
for i := 0 to doc.body.all.length - 1 do
begin
el := doc.body.all.item(i);
if (el.tagName = 'LABEL') and (el.className = 'tvLabel') then
ShowMessage(el.innerText);
if (el.tagName = 'SPAN') and (el.className = 'tvValue') then
ShowMessage(el.innerText);
end;
end;
我想提一下我今天发现的另一个非常好的HTML解析器:htmlp
(Delphi Dom HTML Parser and Converter)。它显然没有IHTMLDocument2
那么灵活,但它很容易使用,快速,免费,并支持旧版Delphi版本的Unicode。
样品用法:
uses HtmlParser, DomCore;
function GetDocBody(HtmlDoc: TDocument): TElement;
var
i: integer;
node: TNode;
begin
Result := nil;
for i := 0 to HtmlDoc.documentElement.childNodes.length - 1 do
begin
node := HtmlDoc.documentElement.childNodes.item(i);
if node.nodeName = 'body' then
begin
Result := node as TElement;
Break;
end;
end;
end;
procedure THTMLForm.Button2Click(Sender: TObject);
var
HtmlParser: THtmlParser;
HtmlDoc: TDocument;
i: Integer;
body, el: TElement;
node: TNode;
begin
HtmlParser := THtmlParser.Create;
try
HtmlDoc := HtmlParser.parseString(HTML);
try
body := GetDocBody(HtmlDoc);
if Assigned(body) then
for i := 0 to body.childNodes.length - 1 do
begin
node := body.childNodes.item(i);
if (node is TElement) then
begin
el := node as TElement;
if (el.tagName = 'div') and (el.GetAttribute('class') = 'tvRow tvFirst hasLabel tvFirst') then
begin
// iterate el.childNodes here...
ShowMessage(IntToStr(el.childNodes.length));
end;
end;
end;
finally
HtmlDoc.Free;
end;
finally
HtmlParser.Free
end;
end;
也可以使用HTMLP parser与THtmlFormatter和OXml XPath parsing的组合
uses
// Htmlp
HtmlParser,
DomCore,
Formatter,
// OXml
OXmlPDOM,
OXmlUtils;
function HtmlToXHtml(const Html: string): string;
var
HtmlParser: THtmlParser;
HtmlDoc: TDocument;
Formatter: THtmlFormatter;
begin
HtmlParser := THtmlParser.Create;
try
HtmlDoc := HtmlParser.ParseString(Html);
try
Formatter := THtmlFormatter.Create;
try
Result := Formatter.GetText(HtmlDoc);
finally
Formatter.Free;
end;
finally
HtmlDoc.Free;
end;
finally
HtmlParser.Free;
end;
end;
type
TCard = record
Store: string;
Quality: string;
Quantity: string;
Price: string;
end;
TCards = array of TCard;
function ParseCard(const Node: PXMLNode): TCard;
const
StoreXPath = 'div[1]/ax';
QualityXPath = 'div[3]';
QuantityXPath = 'div[4]';
PriceXPath = 'div[5]';
var
CurrentNode: PXMLNode;
begin
Result := Default(TCard);
if Node.SelectNode(StoreXPath, CurrentNode) then
Result.Store := CurrentNode.Text;
if Node.SelectNode(QualityXPath, CurrentNode) then
Result.Quality := CurrentNode.Text;
if Node.SelectNode(QuantityXPath, CurrentNode) then
Result.Quantity := CurrentNode.Text;
if Node.SelectNode(PriceXPath, CurrentNode) then
Result.Price := CurrentNode.Text;
end;
procedure THTMLForm.OpenButtonClick(Sender: TObject);
var
Html: string;
Xml: string;
FXmlDocument: IXMLDocument;
QueryNode: PXMLNode;
XPath: string;
NodeList: IXMLNodeList;
i: Integer;
Card: TCard;
begin
Html := System.IOUtils.TFile.ReadAllText(FileNameEdit.Text, TEncoding.UTF8);
Xml := HtmlToXHtml(Html);
Memo.Lines.Text := Xml;
// Parse with XPath
FXMLDocument := CreateXMLDoc;
FXMLDocument.WriterSettings.IndentType := itIndent;
if not FXMLDocument.LoadFromXML(Xml) then
raise Exception.Create('Source document is not valid');
QueryNode := FXmlDocument.DocumentElement;
XPath := '//div[@class="row pricetableline"]';
NodeList := QueryNode.SelectNodes(XPath);
for i := 0 to NodeList.Count -1 do
begin
Card := ParseCard(NodeList[i]);
Memo.Lines.Text := Memo.Lines.Text + sLineBreak +
Format('%0:s %1:s %2:s %3:s', [Card.Store, Card.Quality, Card.Quantity, Card.Price]);
end;
Memo.SelStart := 0;
Memo.SelLength := 0;
end;