如何从网站的搜索栏中删除所有可能的结果

问题描述 投票:0回答:1

这是我的第一个网络抓取任务。我的任务是抓this网站

这是一个包含丹麦律师姓名的网站。我的困难是我只能根据我在搜索栏中输入的特定名称查询来检索名称。是否有一个在线网络工具,我可以用来刮取网站包含的所有名称?我使用像Import.io这样的工具到目前为止没有成功。我对这一切的运作方式感到非常困惑。

web-scraping web-crawler scrape google-crawlers
1个回答
1
投票

请向下滚动到更新2

该网站强制您输入至少一个搜索参数,因此您可以遍历Arbejdsområde列表的所有项目,为每个项目发出请求。下面是一个示例,展示了如何在Excel VBA中完成(打开VBE,创建标准模块,粘贴代码并运行Test()):

Option Explicit

Sub Test()

    Dim sResponse As String
    Dim oItems As Object
    Dim vItem
    Dim aData
    Dim sContent As String
    Dim lPage As Long
    Dim i As Long
    Dim j As Long

    ' Retrieve search page HTML content
    XmlHttpRequest "GET", "http://www.advokatnoeglen.dk/", "", "", "", sResponse
    ' Extract work areas items
    ExtractOptions sResponse, "ctl00$ContentPlaceHolder$Search$AreaSelect", oItems
    oItems.Remove oItems.Keys()(0)
    sContent = ""
    ' Process each work area item
    For Each vItem In oItems.Items()
        Debug.Print "Item [" & vItem & "]"
        lPage = 0
        ' Process each results page
        Do
            Debug.Print vbTab & "Page [" & lPage & "]"
            ' Retrieve result page HTML content
            XmlHttpRequest "GET", "http://www.advokatnoeglen.dk/sog.aspx?s=1&t=0&a=" & vItem & "&p=" & lPage, "", "", "", sResponse
            ' Extract result table
            ParseResponse _
                "<table\b[^>]*?id=""ctl00_ContentPlaceHolder_Grid""[^>]*>([\s\S]*?)</table>", _
                sResponse, _
                aData, _
                False
            ' Store parsed table
            sContent = sContent & aData(0)
            Debug.Print vbTab & "Parsed " & Len(sContent)
            lPage = lPage + 1
            DoEvents
        Loop Until InStr(sResponse, "<a class=""next""") = 0
    Next
    ' Extract data from the whole content
    ParseResponse _
        "<tr.*?onclick=""location.href='([^']*)'"">\s*" & _
        "<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
        "<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
        "<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
        "</tr>", _
        sContent, _
        aData, _
        False
    ' Rebuild nested arrays to 2d array for output
    aData = Denestify(aData)
    ' Decode HTML
    For i = 1 To UBound(aData, 1)
        For j = 2 To 4
            aData(i, j) = GetInnerText((aData(i, j)))
        Next
    Next
    ' Output
    With ThisWorkbook.Sheets(1)
        .Cells.Delete
        Output2DArray .Cells(1, 1), aData
        .Columns.AutoFit
        .Rows.AutoFit
    End With
    MsgBox "Completed"

End Sub

Sub XmlHttpRequest(sMethod, sUrl, aSetHeaders, sFormData, sRespHeaders, sRespText)

    Dim aHeader

    'With CreateObject("MSXML2.ServerXMLHTTP")
        '.SetOption 2, 13056 ' SXH_SERVER_CERT_IGNORE_ALL_SERVER_ERRORS
    With CreateObject("MSXML2.XMLHTTP")
        .Open sMethod, sUrl, False ' , "u051772", "fy17janr"
        If IsArray(aSetHeaders) Then
            For Each aHeader In aSetHeaders
                .SetRequestHeader aHeader(0), aHeader(1)
            Next
        End If
        .Send (sFormData)
        sRespHeaders = .GetAllResponseHeaders
        sRespText = .ResponseText
    End With

End Sub

Sub ExtractOptions(sContent As String, ByVal sName As String, oOptions As Object)

    Dim aTmp0
    Dim vItem

    ' Escape RegEx special characters
    For Each vItem In Array("\", "*", "+", "?", "^", "$", ".", "[", "]", "{", "}", "(", ")", "|", "/")
        sName = Replace(sName, vItem, "\" & vItem)
    Next
    ' Extract the whole <select> for parameter
    ParseResponse "<select[^>]* name=""?" & sName & """?[^>]*>[^<]*((?:<option[^>]*>[^<]*</option>[^<]*)+)[^<]*</[^>]*>", sContent, aTmp0, False
    ' Extract each parameter <option>
    ParseResponse "<option[^>]*value=(""[^""]*""|[^\s>]*)[^>]*>([^<]*)</option>", (aTmp0(0)), aTmp0, False
    ' Put each parameter and value into dictionary
    Set oOptions = CreateObject("Scripting.Dictionary")
    For Each vItem In aTmp0
        oOptions(GetInnerText((vItem(1)))) = GetInnerText(Replace(vItem(0), """", ""))
    Next

End Sub

Sub ParseResponse(sPattern, sResponse, aData, Optional bAppend As Boolean = True, Optional bGlobal = True, Optional bMultiLine = True, Optional bIgnoreCase = True)

    Dim oMatch
    Dim aTmp0()
    Dim sSubMatch

    If Not (IsArray(aData) And bAppend) Then aData = Array()
    With CreateObject("VBScript.RegExp")
        .Global = bGlobal
        .MultiLine = bMultiLine
        .IgnoreCase = bIgnoreCase
        .Pattern = sPattern
        For Each oMatch In .Execute(sResponse)
            If oMatch.SubMatches.Count = 1 Then
                PushItem aData, oMatch.SubMatches(0)
            Else
                aTmp0 = Array()
                For Each sSubMatch In oMatch.SubMatches
                    PushItem aTmp0, sSubMatch
                Next
                PushItem aData, aTmp0
            End If
        Next
    End With

End Sub

Sub PushItem(aData, vItem, Optional bAppend As Boolean = True)

    If Not (IsArray(aData) And bAppend) Then aData = Array()
    ReDim Preserve aData(UBound(aData) + 1)
    aData(UBound(aData)) = vItem

End Sub

Function GetInnerText(sText As String) As String

    Static oHtmlfile As Object
    Static oDiv As Object

    If oHtmlfile Is Nothing Then
        Set oHtmlfile = CreateObject("htmlfile")
        oHtmlfile.Open
        Set oDiv = oHtmlfile.createElement("div")
    End If
    oDiv.innerHTML = sText
    GetInnerText = oDiv.innerText

End Function

Function Denestify(aRows)

    Dim aData()
    Dim aItems()
    Dim i As Long
    Dim j As Long

    If UBound(aRows) = -1 Then Exit Function
    ReDim aData(1 To UBound(aRows) + 1, 1 To 1)
    For j = 0 To UBound(aRows)
        If IsArray(aRows(j)) Then
            aItems = aRows(j)
            For i = 0 To UBound(aItems)
                If i + 1 > UBound(aData, 2) Then ReDim Preserve aData(1 To UBound(aRows) + 1, 1 To i + 1)
                aData(j + 1, i + 1) = aItems(i)
            Next
        Else
            aData(j + 1, 1) = aRows(j)
        End If
    Next
    Denestify = aData

End Function

Sub Output2DArray(oDstRng As Range, aCells As Variant)

    With oDstRng
        .Parent.Select
        With .Resize( _
                UBound(aCells, 1) - LBound(aCells, 1) + 1, _
                UBound(aCells, 2) - LBound(aCells, 2) + 1)
            .NumberFormat = "@"
            .Value = aCells
        End With
    End With

End Sub

第一次检索所有数据需要几分钟(之后再次启动时,所有请求都从缓存中加载,使流程速度明显加快,从服务器获取最新数据需要to clean up the cache in IE settings)。我的输出如下:

output

通常不推荐使用RegEx进行HTML解析,因此there is disclaimer。在这种情况下处理的数据非常简单,这就是使用RegEx解析的原因。关于RegEx:introduction(特别是syntax),introduction JSVB flavor

顺便说一下,使用类似的方法还有另一个答案:1234

UPDATE

上面建议的抓取是基于解析由Arbejdsområde参数过滤的搜索结果,结果是,实际返回的结果是不准确的。那些有Arbejdsområder倍数的律师在结果中存在多次,而且Arbejdsområder空的根本没有结果。

可以用于这种刮擦的另一个参数而不是ArbejdsområdeRetskreds。所有律师记录都包含地址,只有一个地址,因此结果是完整的,不包含重复。请注意,一位律师可以与多个办事处联系,因此这将是结果中的几个记录。

有一些代码可以为循环中的每个条目抓取详细信息:

Option Explicit

Sub Test()

    Dim sResponse As String
    Dim oItems As Object
    Dim vKey
    Dim sItem As String
    Dim aTmp
    Dim aData
    Dim lPage As Long
    Dim i As Long
    Dim j As Long

    ' Retrieve search page HTML content
    XmlHttpRequest "GET", "http://www.advokatnoeglen.dk/", "", "", "", sResponse
    ' Extract Retskreds items
    ExtractOptions sResponse, "ctl00$ContentPlaceHolder$Search$CourtSelect", oItems
    oItems.Remove oItems.Keys()(0)
    i = 0
    ' Process each Retskreds item
    For Each vKey In oItems
        sItem = oItems(vKey)
        Debug.Print "Area " & sItem & " " & vKey
        lPage = 0
        ' Process each results page
        Do
            Debug.Print vbTab & "Page " & lPage
            ' Retrieve results page
            XmlHttpRequest "GET", "http://www.advokatnoeglen.dk/sog.aspx?s=1&t=0&c=" & sItem & "&p=" & lPage, "", "", "", sResponse
            ' Extract table
            ParseResponse _
                "<table\b[^>]*?id=""ctl00_ContentPlaceHolder_Grid""[^>]*>([\s\S]*?)</table>", _
                sResponse, _
                aTmp, _
                False
            ' Extract data from the table
            ParseResponse _
                "<tr.*?onclick=""location.href='([^']*)'"">\s*" & _
                "<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
                "<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
                "<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
                "</tr>", _
                aTmp(0), _
                aData, _
                True
            ' Add Retskreds name
            For i = i To UBound(aData)
                aTmp = aData(i)
                PushItem aTmp, vKey
                aData(i) = aTmp
            Next
            Debug.Print vbTab & "Parsed " & UBound(aData)
            lPage = lPage + 1
            DoEvents
        Loop Until InStr(sResponse, "<a class=""next""") = 0
    Next
    ' Retrieve detailed info for each entry
    For i = 0 To UBound(aData)
        aTmp = aData(i)
        ' Retrieve details page
        aTmp(0) = "http://www.advokatnoeglen.dk" & aTmp(0)
        ' Extract details
        XmlHttpRequest "GET", aTmp(0), "", "", "", sResponse
        ParseResponse _
            DecodeUriComponent( _
                "Arbejdsomr%C3%A5der\: [\s\S]*?</h2>[\s\S]*?" & _
                "Beskikkelses%C3%A5r\: ([^<]*)[\s\S]*?" & _
                "F%C3%B8dsels%C3%A5r\: ([^<]*)[\s\S]*?" & _
                "M%C3%B8deret for landsret\: ([^<]*)[\s\S]*?" & _
                "M%C3%B8deret for h%C3%B8jesteret\: ([^<]*)[\s\S]*?" & _
                "E-mail\: [\s\S]*?href='\/email\.aspx\?e\=(.*?)'[\s\S]*?" & _
                "Mobiltlf\.\: ([\d\(\)\-+ ]*?)\s*<"), _
            sResponse, _
            aTmp, _
            True, _
            False
        aTmp(9) = StrReverse(aTmp(9))
        aData(i) = aTmp
        Debug.Print vbTab & "Details " & i
        DoEvents
    Next
    ' Rebuild nested arrays to 2d array for output
    aData = Denestify(aData)
    ' Decode HTML
    For i = 1 To UBound(aData, 1)
        For j = 2 To 4
            aData(i, j) = Trim(Replace(GetInnerText((aData(i, j))), vbCrLf, ""))
        Next
    Next
    ' Output
    With ThisWorkbook.Sheets(1)
        .Cells.Delete
        OutputArray .Cells(1, 1), _
            Array("URL", _
                "Navn", _
                "Firma", _
                DecodeUriComponent("Arbejdsomr%C3%A5der"), _
                DecodeUriComponent("Retskreds"), _
                DecodeUriComponent("Beskikkelses%C3%A5r"), _
                DecodeUriComponent("F%C3%B8dsels%C3%A5r"), _
                DecodeUriComponent("M%C3%B8deret for landsret"), _
                DecodeUriComponent("M%C3%B8deret for h%C3%B8jesteret"), _
                "E-mail", _
                "Mobiltlf." _
            )
        Output2DArray .Cells(2, 1), aData
        .Columns.AutoFit
        .Rows.AutoFit
    End With
    MsgBox "Completed"

End Sub

Sub XmlHttpRequest(sMethod, sUrl, aSetHeaders, sFormData, sRespHeaders, sRespText)

    Dim aHeader

    'With CreateObject("MSXML2.ServerXMLHTTP")
        '.SetOption 2, 13056 ' SXH_SERVER_CERT_IGNORE_ALL_SERVER_ERRORS
    With CreateObject("MSXML2.XMLHTTP")
        .Open sMethod, sUrl, False
        If IsArray(aSetHeaders) Then
            For Each aHeader In aSetHeaders
                .SetRequestHeader aHeader(0), aHeader(1)
            Next
        End If
        .Send (sFormData)
        sRespHeaders = .GetAllResponseHeaders
        sRespText = .ResponseText
    End With

End Sub

Sub ExtractOptions(sContent As String, ByVal sName As String, oOptions As Object)

    Dim aTmp0
    Dim vItem

    ' Escape RegEx special characters
    For Each vItem In Array("\", "*", "+", "?", "^", "$", ".", "[", "]", "{", "}", "(", ")", "|", "/")
        sName = Replace(sName, vItem, "\" & vItem)
    Next
    ' Extract the whole <select> for parameter
    ParseResponse "<select[^>]* name=""?" & sName & """?[^>]*>[^<]*((?:<option[^>]*>[^<]*</option>[^<]*)+)[^<]*</[^>]*>", sContent, aTmp0, False
    ' Extract each parameter <option>
    ParseResponse "<option[^>]*value=(""[^""]*""|[^\s>]*)[^>]*>([^<]*)</option>", (aTmp0(0)), aTmp0, False
    ' Put each parameter and value into dictionary
    Set oOptions = CreateObject("Scripting.Dictionary")
    For Each vItem In aTmp0
        oOptions(GetInnerText((vItem(1)))) = GetInnerText(Replace(vItem(0), """", ""))
    Next

End Sub

Sub ParseResponse(sPattern, sResponse, aData, Optional bAppend As Boolean = True, Optional bNestSubMatches = True, Optional bGlobal = True, Optional bMultiLine = True, Optional bIgnoreCase = True)

    Dim oMatch
    Dim aTmp0()
    Dim sSubMatch

    If Not (IsArray(aData) And bAppend) Then aData = Array()
    With CreateObject("VBScript.RegExp")
        .Global = bGlobal
        .MultiLine = bMultiLine
        .IgnoreCase = bIgnoreCase
        .Pattern = sPattern
        For Each oMatch In .Execute(sResponse)
            If oMatch.SubMatches.Count = 1 Then
                PushItem aData, oMatch.SubMatches(0)
            Else
                If bNestSubMatches Then
                    aTmp0 = Array()
                    For Each sSubMatch In oMatch.SubMatches
                        PushItem aTmp0, sSubMatch
                    Next
                    PushItem aData, aTmp0
                Else
                    For Each sSubMatch In oMatch.SubMatches
                        PushItem aData, sSubMatch
                    Next
                End If
            End If
        Next
    End With

End Sub

Sub PushItem(aData, vItem, Optional bAppend As Boolean = True)

    If Not (IsArray(aData) And bAppend) Then aData = Array()
    ReDim Preserve aData(UBound(aData) + 1)
    aData(UBound(aData)) = vItem

End Sub

Function DecodeUriComponent(sEncoded As String) As String

    Static objHtmlfile As Object

    If objHtmlfile Is Nothing Then
        Set objHtmlfile = CreateObject("htmlfile")
        objHtmlfile.parentWindow.execScript "function decode(s) {return decodeURIComponent(s)}", "jscript"
    End If
    DecodeUriComponent = objHtmlfile.parentWindow.decode(sEncoded)

End Function

Function GetInnerText(sText As String) As String

    Static oHtmlfile As Object
    Static oDiv As Object

    If oHtmlfile Is Nothing Then
        Set oHtmlfile = CreateObject("htmlfile")
        oHtmlfile.Open
        Set oDiv = oHtmlfile.createElement("div")
    End If
    oDiv.innerHTML = sText
    GetInnerText = oDiv.innerText

End Function

Function Denestify(aRows)

    Dim aData()
    Dim aItems()
    Dim i As Long
    Dim j As Long

    If UBound(aRows) = -1 Then Exit Function
    ReDim aData(1 To UBound(aRows) + 1, 1 To 1)
    For j = 0 To UBound(aRows)
        If IsArray(aRows(j)) Then
            aItems = aRows(j)
            For i = 0 To UBound(aItems)
                If i + 1 > UBound(aData, 2) Then ReDim Preserve aData(1 To UBound(aRows) + 1, 1 To i + 1)
                aData(j + 1, i + 1) = aItems(i)
            Next
        Else
            aData(j + 1, 1) = aRows(j)
        End If
    Next
    Denestify = aData

End Function

Sub OutputArray(oDstRng As Range, aCells As Variant, Optional sFormat As String = "@")

    With oDstRng
        .Parent.Select
        With .Resize(1, UBound(aCells) - LBound(aCells) + 1)
            .NumberFormat = sFormat
            .Value = aCells
        End With
    End With

End Sub

Sub Output2DArray(oDstRng As Range, aCells As Variant, Optional sFormat As String = "@")

    With oDstRng
        .Parent.Select
        With .Resize( _
                UBound(aCells, 1) - LBound(aCells, 1) + 1, _
                UBound(aCells, 2) - LBound(aCells, 2) + 1)
            .NumberFormat = sFormat
            .Value = aCells
        End With
    End With

End Sub

4689名律师共有4896个参赛作品:

output

更新2

似乎得到完整的列表你可以用(空格)作为Firma参数进行搜索:http://www.advokatnoeglen.dk/sog.aspx?s=1&t=0&firm=%20,目前有6511个条目。应该更改结果解析的Sub Test()代码,如下所示:

Option Explicit

Sub Test()

    Dim sResponse As String
    Dim aTmp
    Dim aData
    Dim lPage As Long
    Dim i As Long
    Dim j As Long

    lPage = 0
    ' Process each results page
    Do
        Debug.Print vbTab & "Page " & lPage
        ' Retrieve results page
        XmlHttpRequest "GET", "http://www.advokatnoeglen.dk/sog.aspx?s=1&t=0&firm=%20&p=" & lPage, "", "", "", sResponse
        ' Extract table
        ParseResponse _
            "<table\b[^>]*?id=""ContentPlaceHolder_Grid""[^>]*>([\s\S]*?)</table>", _
            sResponse, _
            aTmp, _
            False
        ' Extract data from the table
        ParseResponse _
            "<tr.*?onclick=""location.href=&#39;(.*?)&#39;"">\s*" & _
            "<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
            "<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
            "<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
            "</tr>", _
            aTmp(0), _
            aData, _
            True
        Debug.Print vbTab & "Parsed " & (UBound(aData) + 1)
        lPage = lPage + 1
        DoEvents
    Loop Until InStr(sResponse, "<a class=""next""") = 0
    ' Retrieve detailed info for each entry
    For i = 0 To UBound(aData)
        aTmp = aData(i)
        ' Retrieve details page
        aTmp(0) = "http://www.advokatnoeglen.dk" & aTmp(0)
        ' Extract details
        Do
            XmlHttpRequest "GET", aTmp(0), "", "", "", sResponse
            If InStr(sResponse, "<title>Runtime Error</title>") = 0 Then Exit Do
            DoEvents
        Loop
        ParseResponse _
            DecodeUriComponent( _
                "Arbejdsomr%C3%A5der\: [\s\S]*?</h2>[\s\S]*?" & _
                "Beskikkelses%C3%A5r\: ([^<]*)[\s\S]*?" & _
                "(:?F%C3%B8dsels%C3%A5r\: ([^<]*)[\s\S]*?)?" & _
                "M%C3%B8deret for landsret\: ([^<]*)[\s\S]*?" & _
                "M%C3%B8deret for h%C3%B8jesteret\: ([^<]*)[\s\S]*?" & _
                "(:?E-mail [\s\S]*?href='\/email\.aspx\?e\=(.*?)'[\s\S]*?)?" & _
                "Mobiltlf\.\: ([\d\(\)\-+ ]*?)\s*<"), _
            sResponse, _
            aTmp, _
            True, _
            False
        aTmp(8) = StrReverse(aTmp(8))
        aData(i) = aTmp
        Debug.Print vbTab & "Details " & i
        DoEvents
    Next
    ' Rebuild nested arrays to 2d array for output
    aData = Denestify(aData)
    ' Decode HTML
    For i = 1 To UBound(aData, 1)
        For j = 2 To 4
            aData(i, j) = Trim(Replace(GetInnerText((aData(i, j))), vbCrLf, ""))
        Next
    Next
    ' Output
    With ThisWorkbook.Sheets(1)
        .Cells.Delete
        OutputArray .Cells(1, 1), _
            Array("URL", _
                "Navn", _
                "Firma", _
                DecodeUriComponent("Arbejdsomr%C3%A5der"), _
                DecodeUriComponent("Beskikkelses%C3%A5r"), _
                DecodeUriComponent("F%C3%B8dsels%C3%A5r"), _
                DecodeUriComponent("M%C3%B8deret for landsret"), _
                DecodeUriComponent("M%C3%B8deret for h%C3%B8jesteret"), _
                "E-mail", _
                "Mobiltlf." _
            )
        Output2DArray .Cells(2, 1), aData
        .Columns.AutoFit
        .Rows.AutoFit
    End With
    MsgBox "Completed"

End Sub
© www.soinside.com 2019 - 2024. All rights reserved.