Excel VBA 网页抓取 - getElementsByTagName 的 Item 编号不固定

问题描述 投票:0回答:2

我正在尝试使用下面的VBA代码抓取县(County)、人口(Population)和房屋价值中位数(Median Home Value)。代码可以运行,但我遇到了以下问题:项目编号(即下面代码中 Item(...) 里的数字)并不固定,这意味着对于某些邮政编码,我取到的是错误的数据点,请参阅下面的电子表格图片。

enter image description here

我研究过项目编号不固定的情况,但找到的最接近的做法是先取得集合(collection),再按类名选择元素。但是,我要处理的HTML没有类名,请参见下文:

是否有可能先找到字符串“County:”,然后再取得对应 td 标签的 innerText?我也考虑过返回整张表,但数据分布在两个不同的表中。如你所见,我有一些想法,但找不到合适的示例,所以在代码实现上有些无从下手。

enter image description here

' Looks up every ZIP code listed in column C (from C2 down to the end of the
' contiguous run) on unitedstateszipcodes.org and writes three values read
' from the page's <td> cells into the three columns to the right of each ZIP.
Sub ZipCodeScrape()

' ZipCodeRange (and the loop variable "cell" below) are not declared with Dim
' in this procedure.
Set ZipCodeRange = Range("C2", Range("C2").End(xlDown))

' NOTE(review): early-bound InternetExplorer / HTMLDocument presumably need the
' "Microsoft Internet Controls" and "Microsoft HTML Object Library" references
' to be enabled -- confirm in the VBA project.
Dim IE As Object
Set IE = New InternetExplorer

' Base address; the ZIP code is appended to it for each lookup.
Dim url As String
url = "https://www.unitedstateszipcodes.org/"

Dim County As String
Dim Population As String
Dim MedianHomeVal As String
Dim HTMLdoc As HTMLDocument

For Each cell In ZipCodeRange

    IE.navigate (url & cell.Value)

    'Allows IE to load
    ' (4 is the "complete" ready state; DoEvents keeps Excel responsive while polling)
    While IE.readyState <> 4
        DoEvents
    Wend

    Set HTMLdoc = IE.document

    ' Each value is picked by its fixed position among ALL <td> elements of the page.
    ' NOTE(review): per the accompanying question, these positions differ between
    ' ZIP pages, so the fixed indexes 2 / 6 / 12 can return the wrong cell.
    County = HTMLdoc.getElementsByTagName("td").Item(2).innerText  
    Population = HTMLdoc.getElementsByTagName("td").Item(6).innerText  
    MedianHomeVal = HTMLdoc.getElementsByTagName("td").Item(12).innerText

    ' Results go into the 1st, 2nd and 3rd column to the right of the ZIP cell.
    cell.Offset(0, 1) = County
    cell.Offset(0, 2) = Population
    cell.Offset(0, 3) = MedianHomeVal

Next cell

' NOTE(review): the IE instance is never closed (no Quit call) in this procedure.
End Sub
html vba excel-vba getelementsbytagname getelementsbyclassname
2个回答
3
投票

试试这个。它会获取预期的输出(县、人口和房屋价值中位数)。我按照你上传的图片设置了几个搜索项。顺便说一句,必须用邮政编码进行搜索,否则该网页上不会出现 Population 和 Median Home Value。

' Searches unitedstateszipcodes.org for a fixed list of ZIP codes and writes,
' one worksheet row per ZIP code, the County (column 1), Population (column 2)
' and Median Home Value (column 3) found on the result page.
'
' Values are located by the text of their <th> label instead of by a positional
' index, so the lookup does not depend on how many cells precede them.
' A value whose label is not present on the page is left blank in its row.
Sub ZipCodeScrape()
    Dim IE As New InternetExplorer, html As HTMLDocument
    Dim search_input As Variant, header As Object
    Dim outRow As Long
    Dim gotCounty As Boolean, gotPopulation As Boolean, gotMedian As Boolean

    With IE
        .Visible = True
        .navigate "https://www.unitedstateszipcodes.org/"
        ' Yield to the message loop while waiting so Excel stays responsive.
        Do While .Busy Or .readyState <> READYSTATE_COMPLETE
            DoEvents
        Loop
        Set html = .document
    End With

    Application.Wait Now + TimeValue("00:00:03")

    For Each search_input In [{"32937","33056","33312","33844","34698"}]

        html.getElementById("q").Value = search_input
        html.getElementsByClassName("btn btn-danger")(0).Click
        Application.Wait Now + TimeValue("00:00:05")

        ' Re-acquire the document in case the search navigated to a new page.
        Set html = IE.document

        ' One output row per ZIP code, whether or not every label is found,
        ' so values of different ZIP codes can never end up on the same row.
        outRow = outRow + 1
        gotCounty = False
        gotPopulation = False
        gotMedian = False

        ' Single pass over the header cells; the first match for each label wins.
        For Each header In html.getElementsByTagName("th")
            If Not gotCounty Then
                If InStr(header.innerText, "County:") > 0 Then
                    Cells(outRow, 1) = FirstCellText(header)
                    gotCounty = True
                End If
            End If
            If Not gotPopulation Then
                If InStr(header.innerText, "Population") > 0 Then
                    Cells(outRow, 2) = FirstCellText(header)
                    gotPopulation = True
                End If
            End If
            If Not gotMedian Then
                If InStr(header.innerText, "Median Home Value") > 0 Then
                    Cells(outRow, 3) = FirstCellText(header)
                    gotMedian = True
                End If
            End If
            If gotCounty And gotPopulation And gotMedian Then Exit For
        Next header
    Next search_input
    IE.Quit
End Sub

' Returns the text of the first <td> in the same table row as headerCell,
' or an empty string when that row has no <td>.
Private Function FirstCellText(ByVal headerCell As Object) As String
    Dim rowCells As Object
    Set rowCells = headerCell.ParentNode.getElementsByTagName("td")
    If rowCells.Length > 0 Then FirstCellText = rowCells(0).innerText
End Function

2
投票

也许可以遍历 <th> 元素,然后取其下一个兄弟节点(next sibling),它应该就是所需的 td 节点?我没有测试过这个。

' Scan the <th> header cells and read the node that follows the "County:" label.
Dim header As Object
For Each header In HTMLdoc.getElementsByTagName("th")
    ' Substring match, so the test does not depend on the exact whitespace
    ' surrounding the label text.
    If InStr(header.innerText, "County:") > 0 Then
        County = header.NextSibling.innerText
        Exit For
    End If
Next header
© www.soinside.com 2019 - 2024. All rights reserved.