我在vba写了一个脚本,从movie names
网站上刮掉不同的genre
和他们的torrent。虽然name
和genre
存在于它的登陆页面中,但是我创建了一个脚本来解析相同的一层深层(从它们的主页面)。更清楚的是,这是page之一,我的主页是什么意思。我的脚本正在完美地解析它们。但是,我的目的是做同样的异步请求。目前,脚本正在以同步方式执行它的工作(以阻塞方式)。
在我的previous post,我得到了omegastripes
的回答,他创造了一个脚本(which more or less performs like how multiprocessing works
),意味着工作asynchronously
。这就是我发现这个想法的地方,但不能在以下脚本中实现相同的功能。
我到目前为止的尝试:
Sub GetInfo()
Const URL = "https://yts.am/browse-movies"
Dim Http As New ServerXMLHTTP60, Html As New HTMLDocument
Dim post As HTMLDivElement, oName$, oGenre$, R&
Dim I&, key As Variant, iDic As Object
Set iDic = CreateObject("Scripting.Dictionary")
With Http
.Open "GET", URL, False
.send
Html.body.innerHTML = .responseText
End With
With Html.querySelectorAll(".browse-movie-wrap .browse-movie-title")
For I = 0 To .Length - 1
iDic(.Item(I).getAttribute("href")) = 1
Next I
End With
For Each key In iDic.keys
With Http
.Open "GET", key, False
.send
Html.body.innerHTML = .responseText
End With
oName = Html.querySelector("h1").innerText
oGenre = Html.querySelector("h2").NextSibling.innerText
R = R + 1: Cells(R, 1) = oName
Cells(R, 2) = oGenre
Next key
End Sub
我怎样才能在上面的脚本中进行任何更改才能使asynchronously
工作?
以下示例显示了使用异步请求池的单循环解析器实现。代码从头到尾解析所有浏览页面和电影页面,这两种类型都是同时解析的。电影URL从“浏览页面”解析并放置在“电影队列”中,然后解析队列中每个电影页面的详细信息并输出到工作表。它处理所有HTTP请求错误类型,并重试直到限制。
将以下代码放入标准模块:
Option Explicit
Sub Test()
Const PoolCapacity = 30 ' Async requests qty
Const MoviesMin = 55 ' Movies in queue + expected movies min qty to request new browse page
Const ReqDelayMin = 0.15 ' Min delay between requests to avoid ban, sec
Const ReqTimeout = 15 ' Request timeout, sec
Const ReqRetryMax = 3 ' Attempts for each request before quit
Dim oWS As Worksheet
Dim y As Long
Dim ocPool As Collection
Dim ocMovies As Collection
Dim lMoviesPerPage As Long
Dim lBPageIndex As Long
Dim lBPagesInPoolQty As Long
Dim bLastBPageReached As Boolean
Dim dPrevReqSent As Double
Dim i As Long
Dim bBPageInPool As Boolean
Dim dT As Double
Dim bFail As Boolean
Dim sResp As String
Dim oMatches As Object
Dim oMatch
Dim oReq As Object
Dim oRequest As cRequest
' Prepare worksheet
Set oWS = ThisWorkbook.Sheets(1)
oWS.Cells.Delete
y = 1
' Init
Set ocPool = New Collection ' Requests async pool
Set ocMovies = New Collection ' Movies urls queue
lMoviesPerPage = 20 ' Movies per page qty
lBPageIndex = 1 ' Current browse page index for request
bLastBPageReached = False ' Last page reached flag
dPrevReqSent = -60# * 60# * 24# ' Init delay timer
' Start parsing
Do
lBPagesInPoolQty = 0 ' How many browse pages currently in pool
' Check pool for all flagged and completed requests
For i = ocPool.Count To 1 Step -1
bBPageInPool = Not ocPool(i).IsMovie
' Delay from last request
dT = Timer - dPrevReqSent
If dT < 0 Then dT = dT + 60# * 60# * 24#
Select Case True
' Check request has no sent flag
Case Not ocPool(i).NeedSend
On Error Resume Next
bFail = False
sResp = ""
With ocPool(i).HTTPRequest
' Check http request is ready and status is OK
Select Case True
Case .ReadyState < 4 ' Not ready
Case .Status \ 100 <> 2 ' Wrong status
Debug.Print .Status & " / " & .StatusText & " (" & ocPool(i).URL & ")"
bFail = True
Case Else ' Ready and OK
sResp = .ResponseText
End Select
End With
If sResp = "" Then
' Request elapsed time
dT = Timer - ocPool(i).SendTimer
If dT < 0 Then dT = dT + 60# * 60# * 24#
' Check request is failed
Select Case True
Case Err.Number <> 0 ' Runtime error
Debug.Print Err.Number & " / " & Err.Description & " (" & ocPool(i).URL & ")"
bFail = True
Case dT > ReqTimeout ' Timeout
Debug.Print "Timeout (" & ocPool(i).URL & ")"
bFail = True
End Select
On Error GoTo 0
If bFail Then ' Request has been failed
ocPool(i).FailsCount = ocPool(i).FailsCount + 1
' Check attempts
If ocPool(i).FailsCount > ReqRetryMax Then
Debug.Print "Quit (" & ocPool(i).URL & ")"
ocPool.Remove i ' Quit
bBPageInPool = False
Else
ocPool(i).NeedSend = True ' Raise send flag to retry
End If
End If
Else ' Response received
If ocPool(i).IsMovie Then
' Response from movie page
With CreateObject("VBScript.RegExp")
' Parse Title, Year, Genre
' <h1 itemprop\="name">___</h1>\s*<h2>___</h2>\s*<h2>___</h2>
.Pattern = "<h1 itemprop\=""name"">([^<]*)</h1>\s*<h2>([^<]*)</h2>\s*<h2>([^<]*)</h2>"
Set oMatches = .Execute(sResp)
If oMatches.Count = 1 Then ' Output to worksheet
oWS.Cells(y, 1).Value = oMatches(0).SubMatches(0)
oWS.Cells(y, 2).Value = oMatches(0).SubMatches(1)
oWS.Cells(y, 3).Value = oMatches(0).SubMatches(2)
y = y + 1
End If
End With
Else
' Response from browse page
With CreateObject("VBScript.RegExp")
.Global = True
' Parse movies urls
' <a href="___" class="browse-movie-link">
.Pattern = "<a href=""([^""]*)"" class=""browse-movie-link"">"
Set oMatches = .Execute(sResp)
For Each oMatch In oMatches
ocMovies.Add oMatch.SubMatches(0) ' Movies queue fed
Next
' Parse next page button
' <a href="/browse-movies?page=___">Next
.Pattern = "<a href\=""/browse-movies\?page\=\d+"">Next "
bLastBPageReached = bLastBPageReached Or Not .Test(sResp)
End With
If Not bLastBPageReached Then lMoviesPerPage = oMatches.Count ' Update lMoviesPerPage
End If
ocPool.Remove i
bBPageInPool = False
End If
' Check request has send flag raised and delay enough
Case dT > ReqDelayMin
' Send the request
Set oReq = CreateObject("MSXML2.ServerXMLHTTP.6.0")
With oReq
.Open "GET", ocPool(i).URL, True
' .SetProxy 2, "190.12.55.210:46078"
.SetRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"
.Send
End With
ocPool(i).NeedSend = False
ocPool(i).SendTimer = Timer
dPrevReqSent = ocPool(i).SendTimer
Set ocPool(i).HTTPRequest = oReq
End Select
If bBPageInPool Then lBPagesInPoolQty = lBPagesInPoolQty + 1
DoEvents
Next
' Check if there is a room for a new request in pool
If ocPool.Count < PoolCapacity Then
' Add one new request to pool
' Check if movies in queue + expected movies are not enough
If ocMovies.Count + lBPagesInPoolQty * lMoviesPerPage < MoviesMin And Not bLastBPageReached Then
' Add new request for next browse page to feed movie queue
Set oRequest = New cRequest
With oRequest
.URL = "https://yts.am/browse-movies?page=" & lBPageIndex
.IsMovie = False
.NeedSend = True
.FailsCount = 0
End With
ocPool.Add oRequest
lBPageIndex = lBPageIndex + 1
Else
' Check if movie page urls are parsed and available in queue
If ocMovies.Count > 0 Then
' Add new request for next movie page from queue
Set oRequest = New cRequest
With oRequest
.URL = ocMovies(1)
.IsMovie = True
.NeedSend = True
.FailsCount = 0
End With
ocPool.Add oRequest
ocMovies.Remove 1
End If
End If
End If
DoEvents
Loop While ocPool.Count > 0 ' Loop until the last request completed
MsgBox "Completed"
End Sub
将以下代码放在名为cRequest
的类模块中:
Public URL As String
Public IsMovie As Boolean
Public NeedSend As Boolean
Public SendTimer As Double
Public HTTPRequest As Object
Public FailsCount As Long
请小心减少请求Const ReqDelayMin
之间的延迟。一旦我以高费率推出它,它工作了一段时间并导致Cloudflare DDoS保护触发,目前,我无法直接从我的IP使代码工作,唯一的方法是使用代理请求(你可以看到.SetProxy
的注释行。即使在Chrome中,我现在也在进行Cloudflare重定向:
因此,该方法只是揭示了这个问题,然而,最安全和更有效的方法是使用the website API中描述的this answer。
这段代码应该可以解决问题。它使用MSXML2.XMLHTTP
对象来处理请求。
这是获取信息的Module
代码:
Sub GetInfo()
On Error GoTo FailedState
If Not xmlHttpRequest Is Nothing Then Set xmlHttpRequest = Nothing
Dim MyXmlHttpHandler As CXMLHTTPHandler
Dim url As String
url = "https://yts.am/browse-movies"
Set xmlHttpRequest = New MSXML2.XMLHTTP
' Create an instance of the wrapper class.
Set MyXmlHttpHandler = New CXMLHTTPHandler
MyXmlHttpHandler.Initialize xmlHttpRequest
' Assign the wrapper class object to onreadystatechange.
xmlHttpRequest.OnReadyStateChange = MyXmlHttpHandler
' Get the page stuff asynchronously.
xmlHttpRequest.Open "GET", url, True
xmlHttpRequest.send ""
Exit Sub
FailedState:
MsgBox Err.Number & ": " & Err.Description
End Sub
这是class
CXMLHTTPHandler异步处理响应:
Option Explicit
Dim m_xmlHttp As MSXML2.XMLHTTP60
Public Sub Initialize(ByRef xmlHttpRequest As MSXML2.XMLHTTP60)
Set m_xmlHttp = xmlHttpRequest
End Sub
Sub OnReadyStateChange()
Debug.Print m_xmlHttp.readyState
If m_xmlHttp.readyState = 4 Then
'Now the page is loaded
'insert here your code to process the response
MsgBox m_xmlHttp.responseText 'i.e. print the response
End If
End Sub
如果您想了解更多细节,请查看here。
我的答案的基础是@Louis提到的this帖子,其中只执行了一次调用,但是你需要很多。我对GetInfoAsync
方法的速度有多快感到非常惊讶。
如何使用示例:
normal
的一个按钮叫GetInfo
,async
的一个按钮叫GetInfoAsync
。出于异步调用的目的,在此表单中声明了两个集合,一个包含请求对象,另一个包含处理程序。每个请求都是异步发送的,并且有自己的处理程序,当响应文本到达时处理响应文本。CXMLHTTPHandler
被创建。将此文件导入您的项目。用户表格
Option Explicit
Private Const url = "https://yts.am/browse-movies"
Private m_requests As VBA.Collection
Private m_handlers As VBA.Collection
Private Sub UserForm_Initialize()
Set m_requests = New VBA.Collection
Set m_handlers = New VBA.Collection
End Sub
Private Sub CommandButton1_Click()
GetInfoAsync
End Sub
Private Sub CommandButton2_Click()
GetInfo
End Sub
Sub GetInfoAsync()
Dim iDic As Object
Dim Html As New HTMLDocument
Dim Http As New ServerXMLHTTP60
Dim I&
Dim key As Variant
Set iDic = CreateObject("Scripting.Dictionary")
With Http
.Open "GET", url, False
.send
Html.body.innerHTML = .responseText
End With
With Html.querySelectorAll(".browse-movie-wrap .browse-movie-title")
For I = 0 To .Length - 1
iDic(.Item(I).getAttribute("href")) = 1
Next I
End With
Dim myXmlHttpHandler As CXMLHTTPHandler
Dim myXmlHttpRequest As MSXML2.XMLHTTP60
For Each key In iDic.keys
Set myXmlHttpRequest = New MSXML2.XMLHTTP60
Set myXmlHttpHandler = New CXMLHTTPHandler
m_requests.Add myXmlHttpRequest
m_handlers.Add myXmlHttpHandler
myXmlHttpHandler.Initialize myXmlHttpRequest
myXmlHttpRequest.OnReadyStateChange = myXmlHttpHandler
myXmlHttpRequest.Open "GET", key, True
myXmlHttpRequest.send ""
Next key
End Sub
Sub GetInfo()
Dim Http As New ServerXMLHTTP60, Html As New HTMLDocument
Dim post As HTMLDivElement, oName$, oGenre$, r&
Dim I&, key As Variant, iDic As Object
Set iDic = CreateObject("Scripting.Dictionary")
With Http
.Open "GET", url, False
.send
Html.body.innerHTML = .responseText
End With
With Html.querySelectorAll(".browse-movie-wrap .browse-movie-title")
For I = 0 To .Length - 1
iDic(.Item(I).getAttribute("href")) = 1
Next I
End With
For Each key In iDic.keys
DoEvents
With Http
.Open "GET", key, False
.send
Html.body.innerHTML = .responseText
End With
oName = Html.querySelector("h1").innerText
oGenre = Html.querySelector("h2").NextSibling.innerText
r = r + 1: Cells(r, 1) = oName
Cells(r, 2) = oGenre
Next key
End Sub
类CXMLHTTPHandler(将其导入您的VBA项目)
VERSION 1.0 CLASS
BEGIN
MultiUse = -1 'True
END
Attribute VB_Name = "CXMLHTTPHandler"
Attribute VB_GlobalNameSpace = False
Attribute VB_Creatable = False
Attribute VB_PredeclaredId = False
Attribute VB_Exposed = False
Option Explicit
Private m_xmlHttp As MSXML2.XMLHTTP60
Public Sub Initialize(ByRef xmlHttpRequest As MSXML2.XMLHTTP60)
Set m_xmlHttp = xmlHttpRequest
End Sub
Sub OnReadyStateChange()
Attribute OnReadyStateChange.VB_UserMemId = 0
Dim oName$, oGenre$
If m_xmlHttp.readyState = 4 Then
If m_xmlHttp.Status = 200 Then
Dim Html As New HTMLDocument
Dim Http As New ServerXMLHTTP60
Set Http = New ServerXMLHTTP60
Html.body.innerHTML = m_xmlHttp.responseText
oName = Html.querySelector("h1").innerText
oGenre = Html.querySelector("h2").NextSibling.innerText
Dim r
r = ActiveSheet.UsedRange.SpecialCells(xlCellTypeLastCell).Row + 1
Cells(r, 1) = oName
Cells(r, 2) = oGenre
Else
'Error happened
End If
End If
End Sub
需要参考