我需要抓取这个网站(website),它有一个“加载更多”按钮。
有哪些工具适合抓取这样的网站?
首先,让我们弄清楚新一批条目的下载过程是如何工作的。在浏览器(例如 Chrome)中按 F12 打开 DevTools,导航到 https://www.tayara.tn/sc/immobilier/appartements,向下滚动以加载一些新条目,然后切换到 Network(网络)标签页,将过滤器设置为 XHR,看起来如下图所示:
你可能会注意到,每次单击“Montrer plus”按钮,都会记录一个大小约为 5 KB 的新请求。所需的全部数据都在响应中:
要发出这样的 XHR,你需要从上一次响应中取出 data.listings.pageInfo.endCursor
的值,并把它作为 variables.page.offset
属性放入请求负载;当然,你还需要保持整个负载结构不变,并添加相应的请求头:
关于 variables.page.offset
属性:实际上,它由三个 Base64 编码的部分组成,解码后就很清楚了,例如 cDEwbg==.MjAxOS0wMS0yNlQyMDoyMTo1OFo=.NjAwMA==
是某个前缀 p10n
+ 起始日期 2019-01-26T20:21:58Z
+ 已检索到的条目总数 6000
。因此,你可以通过修改最后一个值来请求任意其他批次的条目。此外,你还可以在 variables.page.count
属性中指定每次请求的条目数量(似乎上限为 100)。
下面是一个演示如何进行这种抓取的 VBA 示例。请将 JSON.bas 模块导入 VBA 项目,用于 JSON 处理。
Option Explicit
Sub Test()
    ' Downloads all listings of one category from the tayara.tn GraphQL endpoint,
    ' page by page, and writes them to the first sheet of this workbook.
    '
    ' Each request asks for up to 100 items; the "endCursor" value returned by one
    ' page is sent as the "offset" of the next one until "hasNextPage" is False.
    ' Requires the external JSON.bas module (JSON.Parse / JSON.ToArray).
    '
    ' A failed request (invalid JSON or no data) is retried with the same offset,
    ' but only up to MAX_FAILURES times in a row, so a permanently failing
    ' endpoint cannot keep the macro spinning forever.

    Const MAX_FAILURES As Long = 5

    Dim sCat As String
    Dim oResSht As Worksheet
    Dim oResCell As Range
    Dim lNextOutput As Long
    Dim sOffset As String
    Dim oRes As Object
    Dim sPayload As String
    Dim sJSONString As String
    Dim vJSON
    Dim sState As String
    Dim aItems
    Dim oItem
    Dim lFailures As Long
    Dim vTotalCount

    ' Set category for parsing
    sCat = "2"
    ' Set output sheet (its previous content is wiped)
    Set oResSht = ThisWorkbook.Sheets(1)
    With oResSht
        .Cells.Delete
        .Cells.WrapText = False
        Set oResCell = .Cells(1, 1)
    End With
    lNextOutput = 1000
    ' Empty offset requests the first page
    sOffset = ""
    ' Shown in the final message if no page was ever retrieved successfully
    vTotalCount = "n/a"
    lFailures = 0
    Set oRes = CreateObject("Scripting.Dictionary")
    Do
        ' Build the GraphQL request body; only offset and category vary
        sPayload = _
            "{""query"":""query ListingsPage($page: Page, $filter: SearchFilter, $sortBy: SortOrder) {\n listings: searchAds(page: $page, filter: $filter, sortBy: $sortBy) " & _
            "{\n items {\n uuid\n title\n price\n currency\n thumbnail\n createdAt\n state\n category " & _
            "{\n id\n name\n engName\n __typename\n }\n user {\n uuid\n displayName\n avatar(width: 96, height: 96) " & _
            "{\n url\n __typename\n }\n __typename\n }\n __typename\n }\n trackingInfo " & _
            "{\n transactionId\n listName\n recommenderId\n experimentId\n variantId\n __typename\n }\n totalCount\n pageInfo " & _
            "{\n startCursor\n hasPreviousPage\n endCursor\n hasNextPage\n __typename\n }\n __typename\n }\n}\n""," & _
            """variables"":{""page"":{""count"":100,""offset"":""" & sOffset & """},""filter"":{""queryString"":null,""category"":""" & sCat & """,""regionId"":null,""attributeFilters"":[]},""sortBy"":""CREATED_DESC""},""operationName"":""ListingsPage""}"
        ' Retrieve JSON content
        With CreateObject("MSXML2.XMLHTTP")
            .Open "POST", "https://www.tayara.tn/graphql", True
            .setRequestHeader "content-type", "application/json"
            '.setRequestHeader "user-agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
            .setRequestHeader "content-length", Len(sPayload)
            .send (sPayload)
            ' The request is asynchronous: keep the UI responsive while waiting
            Do Until .readyState = 4: DoEvents: Loop
            sJSONString = .responseText
        End With
        ' Parse JSON sample
        JSON.Parse sJSONString, vJSON, sState
        Select Case True
            Case sState <> "Object"
                Debug.Print Now & " Invalid JSON response"
                lFailures = lFailures + 1
            Case IsNull(vJSON("data"))
                Debug.Print Now & " Response contains no data"
                lFailures = lFailures + 1
            Case Else
                ' A good page resets the consecutive-failure counter
                lFailures = 0
                ' Remember the total now, so the final message does not depend
                ' on whatever the last response happened to be
                vTotalCount = vJSON("data")("listings")("totalCount")
                ' Retrieve items
                aItems = vJSON("data")("listings")("items")
                ' Add retrieved items to resulting dataset
                For Each oItem In aItems
                    Set oRes(oRes.Count) = oItem
                Next
                ' Check if the page is last
                If vJSON("data")("listings")("pageInfo")("hasNextPage") = False Then Exit Do
                ' Retrieve offset property for next page request
                sOffset = vJSON("data")("listings")("pageInfo")("endCursor")
                Debug.Print Now & " " & sOffset
                ' Output once per 1000 parsed items
                If oRes.Count >= lNextOutput Then
                    Output oRes, oResCell
                    lNextOutput = oRes.Count + 1000
                End If
        End Select
        ' Stop instead of retrying the same request forever
        If lFailures >= MAX_FAILURES Then
            Debug.Print Now & " Giving up after " & MAX_FAILURES & " consecutive failed requests"
            Exit Do
        End If
        DoEvents
    Loop
    ' Finally output results (skipped when nothing was collected, since the
    ' output helpers would be handed an empty data set)
    If oRes.Count > 0 Then Output oRes, oResCell
    MsgBox "Completed" & vbCrLf & "Actually parsed: " & oRes.Count & vbCrLf & """totalCount"" from API response: " & vTotalCount
End Sub
Sub Output(vData, oTarget As Range)
    ' Flattens the collected JSON data with JSON.ToArray and writes it to the
    ' sheet: header row at the top-left cell of oTarget, data rows right below,
    ' then auto-fits every column of the target sheet.
    Dim aHeader()
    Dim aData()
    Dim oTopLeft As Range

    JSON.ToArray vData, aData, aHeader

    Set oTopLeft = oTarget.Cells(1, 1)
    ' Header goes into the first row
    OutputArray oTopLeft, aHeader
    ' Data starts one row below the header
    Output2DArray oTopLeft.Offset(1, 0), aData

    oTarget.Parent.Columns.AutoFit
End Sub
Sub OutputArray(oDstRng As Range, aCells As Variant)
    ' Writes the 1-D array aCells as a single row starting at oDstRng.
    ' The destination sheet is activated and the cells are formatted as text.
    Dim lColCount As Long
    Dim oRow As Range

    oDstRng.Parent.Select

    lColCount = UBound(aCells) - LBound(aCells) + 1
    Set oRow = oDstRng.Resize(1, lColCount)

    ' "@" is the text number format, applied before the values are written
    oRow.NumberFormat = "@"
    oRow.Value = aCells
End Sub
Sub Output2DArray(oDstRng As Range, aCells As Variant)
    ' Writes the 2-D array aCells as a block of cells whose top-left corner is
    ' oDstRng. The destination sheet is activated and the cells are formatted
    ' as text.
    Dim lRowCount As Long
    Dim lColCount As Long
    Dim oBlock As Range

    oDstRng.Parent.Select

    lRowCount = UBound(aCells, 1) - LBound(aCells, 1) + 1
    lColCount = UBound(aCells, 2) - LBound(aCells, 2) + 1
    Set oBlock = oDstRng.Resize(lRowCount, lColCount)

    ' "@" is the text number format, applied before the values are written
    oBlock.NumberFormat = "@"
    oBlock.Value = aCells
End Sub
对我来说,输出如下:
顺便说一句,类似的方法也应用在其他回答(other answers)中。