我该如何抓取这个网站?我试过 Selenium、Splash 以及模拟 XHR 请求,但都失败了

问题描述 投票:1回答:1

我需要抓取这个网站(website),它通过一个“加载更多”按钮来加载内容。

  1. 我用了 Selenium,但执行时间太长,没能抓取到所有需要的条目。
  2. 我用了 Splash,但没有得到任何结果。
  3. 我尝试模拟 XHR 请求,也失败了。

什么工具适合抓取这样的网站?

json excel vba web-scraping xmlhttprequest
1个回答
0
投票

首先,让我们弄清楚新一批条目的加载过程是如何工作的。在浏览器(例如 Chrome)中按 F12 打开 DevTools,导航到 https://www.tayara.tn/sc/immobilier/appartements,向下滚动并触发加载一些新条目,然后转到 Network(网络)选项卡,将过滤器设置为 XHR,结果如下图所示:

Network tab

你可能会注意到,每次单击“Montrer plus”(显示更多)按钮时,都会记录一个大小约为 5 KB 的新请求。所需的数据全部包含在响应中:

XHR response

要发出这样的 XHR 请求,你需要从上一次响应中取出 data.listings.pageInfo.endCursor 的值,并将其作为 variables.page.offset 属性放入请求负载中;当然,还需要保持整个负载结构不变,并添加相关的请求头:

XHR request

关于 variables.page.offset 属性:它实际上由三个 Base64 编码的部分组成,解码后就一目了然。例如 cDEwbg==.MjAxOS0wMS0yNlQyMDoyMTo1OFo=.NjAwMA== 解码后是:前缀 p10n + 起始日期 2019-01-26T20:21:58Z + 已检索的条目总数 6000。因此,只需更改最后一个值,就可以请求任意一批条目。此外,还可以在 variables.page.count 属性中指定每次请求返回的条目数量(上限似乎为 100)。

下面是一个 VBA 示例,展示如何进行这样的抓取。请先将 JSON.bas 模块导入 VBA 项目,用于处理 JSON。

Option Explicit

Sub Test()

    ' Downloads the listings of one tayara.tn category through the site's
    ' GraphQL endpoint and writes them to the first worksheet of this workbook.
    '
    ' Paging is cursor based: every request asks for up to 100 items and passes
    ' the "endCursor" of the previous response as the next "offset". The loop
    ' stops when the API reports hasNextPage = False.
    '
    ' Requires the JSON.bas module (JSON.Parse / JSON.ToArray) to be imported
    ' into the VBA project.
    '
    ' Robustness notes:
    '   * A bad response (invalid JSON, or no usable "data"/"listings" object)
    '     is retried with the same offset, but only MAX_FAILURES times in a
    '     row, so a persistent server error cannot loop forever.
    '   * "totalCount" is cached from the last good page, so the final message
    '     never reads from a failed response.
    '   * Nothing is written to the sheet when no items were collected.

    ' Maximum number of consecutive failed requests before giving up
    Const MAX_FAILURES As Long = 5

    Dim sCat As String
    Dim oResSht As Worksheet
    Dim oResCell As Range
    Dim lNextOutput As Long
    Dim sOffset As String
    Dim oRes As Object
    Dim sPayload As String
    Dim sJSONString As String
    Dim vJSON
    Dim sState As String
    Dim aItems
    Dim oItem
    Dim oListings As Object
    Dim vTotalCount
    Dim lFailures As Long
    Dim bPageOk As Boolean
    Dim bAborted As Boolean

    ' Set category for parsing
    sCat = "2"
    ' Set output sheet
    Set oResSht = ThisWorkbook.Sheets(1)
    With oResSht
        .Cells.Delete
        .Cells.WrapText = False
        Set oResCell = .Cells(1, 1)
    End With
    lNextOutput = 1000
    sOffset = ""
    lFailures = 0
    bAborted = False
    ' Collected items, keyed by their running index
    Set oRes = CreateObject("Scripting.Dictionary")
    Do
        ' Retrieve JSON content
        sPayload = _
            "{""query"":""query ListingsPage($page: Page, $filter: SearchFilter, $sortBy: SortOrder) {\n  listings: searchAds(page: $page, filter: $filter, sortBy: $sortBy) " & _
            "{\n    items {\n      uuid\n      title\n      price\n      currency\n      thumbnail\n      createdAt\n      state\n      category " & _
            "{\n        id\n        name\n        engName\n        __typename\n      }\n      user {\n        uuid\n        displayName\n        avatar(width: 96, height: 96) " & _
            "{\n          url\n          __typename\n        }\n        __typename\n      }\n      __typename\n    }\n    trackingInfo " & _
            "{\n      transactionId\n      listName\n      recommenderId\n      experimentId\n      variantId\n      __typename\n    }\n    totalCount\n    pageInfo " & _
            "{\n      startCursor\n      hasPreviousPage\n      endCursor\n      hasNextPage\n      __typename\n    }\n    __typename\n  }\n}\n""," & _
            """variables"":{""page"":{""count"":100,""offset"":""" & sOffset & """},""filter"":{""queryString"":null,""category"":""" & sCat & """,""regionId"":null,""attributeFilters"":[]},""sortBy"":""CREATED_DESC""},""operationName"":""ListingsPage""}"
        With CreateObject("MSXML2.XMLHTTP")
            ' Asynchronous request; completion is polled below so that
            ' DoEvents keeps the host application responsive
            .Open "POST", "https://www.tayara.tn/graphql", True
            .setRequestHeader "content-type", "application/json"
            '.setRequestHeader "user-agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
            ' Len() counts characters, which equals the byte count only
            ' while the payload stays pure ASCII
            .setRequestHeader "content-length", Len(sPayload)
            .send (sPayload)
            Do Until .readyState = 4: DoEvents: Loop
            sJSONString = .responseText
        End With
        ' Parse JSON sample
        JSON.Parse sJSONString, vJSON, sState
        ' Validate the response step by step. VBA's Or / And do not
        ' short-circuit, so the checks are chained with ElseIf.
        ' IsObject rejects both a JSON null and a missing key.
        bPageOk = False
        If sState <> "Object" Then
            Debug.Print Now & " Invalid JSON response"
        ElseIf Not IsObject(vJSON("data")) Then
            Debug.Print Now & " Response contains no data"
        ElseIf Not IsObject(vJSON("data")("listings")) Then
            Debug.Print Now & " Response contains no data"
        Else
            bPageOk = True
        End If
        If bPageOk Then
            lFailures = 0
            Set oListings = vJSON("data")("listings")
            ' Remember the total reported by the API for the final message
            vTotalCount = oListings("totalCount")
            ' Retrieve items
            aItems = oListings("items")
            ' Add retrieved items to resulting dataset
            If IsArray(aItems) Then
                For Each oItem In aItems
                    Set oRes(oRes.Count) = oItem
                Next
            End If
            ' Check if the page is last
            If oListings("pageInfo")("hasNextPage") = False Then Exit Do
            ' Retrieve offset property for next page request
            sOffset = oListings("pageInfo")("endCursor")
            Debug.Print Now & " " & sOffset
            ' Output once per 1000 parsed items
            If oRes.Count >= lNextOutput Then
                Output oRes, oResCell
                lNextOutput = oRes.Count + 1000
            End If
        Else
            ' Same offset is requested again on the next iteration,
            ' but give up after too many failures in a row
            lFailures = lFailures + 1
            If lFailures >= MAX_FAILURES Then
                bAborted = True
                Exit Do
            End If
        End If
        DoEvents
    Loop
    ' Finally output results (skipped when there is nothing to write)
    If oRes.Count > 0 Then Output oRes, oResCell
    If bAborted Then
        If IsEmpty(vTotalCount) Then vTotalCount = "unknown"
        MsgBox "Aborted after " & MAX_FAILURES & " consecutive failed requests" & vbCrLf & "Actually parsed: " & oRes.Count & vbCrLf & """totalCount"" from API response: " & vTotalCount
    Else
        MsgBox "Completed" & vbCrLf & "Actually parsed: " & oRes.Count & vbCrLf & """totalCount"" from API response: " & vTotalCount
    End If

End Sub

Sub Output(vData, oTarget As Range)

    ' Writes parsed JSON data to a worksheet as a table.
    '   vData   - parsed JSON data, passed as-is to JSON.ToArray
    '   oTarget - range whose top-left cell receives the header row;
    '             the data rows start one row below it
    ' Afterwards all columns of the target sheet are auto-fitted.

    Dim aHeader()
    Dim aData()
    Dim oTopLeft As Range

    ' Let the JSON module produce the 2d data array and its header
    JSON.ToArray vData, aData, aHeader

    Set oTopLeft = oTarget.Cells(1, 1)
    OutputArray oTopLeft, aHeader
    Output2DArray oTopLeft.Offset(1, 0), aData
    oTarget.Parent.Columns.AutoFit

End Sub

Sub OutputArray(oDstRng As Range, aCells As Variant)

    ' Writes a 1d array as a single row of text-formatted cells.
    '   oDstRng - cell where the row starts
    '   aCells  - 1d array; one cell is written per element

    Dim lWidth As Long
    Dim oRow As Range

    ' Activate the destination sheet before writing
    oDstRng.Parent.Select

    lWidth = UBound(aCells) - LBound(aCells) + 1
    Set oRow = oDstRng.Resize(1, lWidth)
    ' "@" is the text number format; it is applied before the values are set
    oRow.NumberFormat = "@"
    oRow.Value = aCells

End Sub

Sub Output2DArray(oDstRng As Range, aCells As Variant)

    ' Writes a 2d array as a block of text-formatted cells.
    '   oDstRng - top-left cell of the destination block
    '   aCells  - 2d array; first dimension maps to rows, second to columns

    Dim lRows As Long
    Dim lCols As Long
    Dim oBlock As Range

    ' Activate the destination sheet before writing
    oDstRng.Parent.Select

    lRows = UBound(aCells, 1) - LBound(aCells, 1) + 1
    lCols = UBound(aCells, 2) - LBound(aCells, 2) + 1
    Set oBlock = oDstRng.Resize(lRows, lCols)
    ' "@" is the text number format; it is applied before the values are set
    oBlock.NumberFormat = "@"
    oBlock.Value = aCells

End Sub

对我来说,输出如下:

output

顺便说一句,其他回答(other answers)中也使用了类似的方法。

© www.soinside.com 2019 - 2024. All rights reserved.