同时运行多个 Selenium 任务并防止系统崩溃

问题描述 投票:0回答:1

我正在使用 Selenium 和 HtmlAgilityPack(在 C#、.NET 7 中)从某个网站获取每个国家/地区的人口排名。这段代码在只请求 10 个国家/地区时可以正常工作,但当我想请求所有国家/地区时,由于同时启动的任务过多,系统会变得非常缓慢甚至崩溃。有什么解决办法吗?

/// <summary>
/// Starts one <c>FetchData</c> task per country, waits for all of them to finish,
/// and prints every result in the same order as the country list.
/// </summary>
/// <remarks>
/// All tasks are started up front; nothing here limits how many run at the same time.
/// Because the method is <c>async void</c>, a caller cannot await it or observe its exceptions.
/// </remarks>
static async void GetData()
{
    string website = "......";
    List<string> countries = new List<string>()
    {
        // 195 Countries
    };

    // Materialize the projection so every fetch is kicked off immediately.
    List<Task<JObject>> pending = countries
        .Select(countryName => FetchData(website + "/" + countryName))
        .ToList();

    // WhenAll yields the results positionally, matching the order of `pending`.
    JObject[] ranks = await Task.WhenAll(pending);
    foreach (JObject populationRank in ranks)
    {
        WriteLine(populationRank);
    }
}

/// <summary>
/// Opens <paramref name="URL"/> in a fresh Chrome driver on a thread-pool thread and
/// extracts the text of the <c>div</c> whose id is <c>popRank</c>.
/// </summary>
/// <param name="URL">Address of the page to load.</param>
/// <returns>
/// A task producing a <c>JObject</c> with a single <c>"PopulationRank"</c> property
/// holding the inner text of the matched node.
/// </returns>
static Task<JObject> FetchData(string URL)
{
    return Task.Run(async () =>
    {
        // CustomizedDriver is a factory method, not a type, so it is called without `new`.
        ChromeDriver myDriver = CustomizedDriver();
        try
        {
            myDriver.Navigate().GoToUrl(URL);
            HtmlDocument Document = new HtmlDocument();
            Document.LoadHtml(await myDriver.GetPageSourceAsync());
            // NOTE(review): if the XPath matches nothing this dereference will presumably
            // throw; confirm against HtmlAgilityPack's SelectSingleNode contract.
            return new JObject()
            {
                ["PopulationRank"] = Document.DocumentNode.SelectSingleNode("//div[@id='popRank']").InnerText
            };
        }
        finally
        {
            // Always shut the browser down, even when navigation or parsing fails,
            // so a failed fetch does not leave a Chrome process behind.
            myDriver.Quit();
            myDriver.Dispose();
        }
    });
}
/// <summary>
/// Polls the page's <c>document.readyState</c> on a thread-pool thread and returns the
/// driver's page source once the state is <c>"interactive"</c> or <c>"complete"</c>.
/// </summary>
/// <param name="driver">Driver to poll; it is cast to <c>IJavaScriptExecutor</c>.</param>
/// <returns>A task producing the page source.</returns>
/// <remarks>There is no timeout: the loop runs until one of the two states is observed.</remarks>
static Task<string> GetPageSourceAsync(this IWebDriver driver)
{
    // Pause between polls so the loop does not spin a core and flood the driver with requests.
    const int pollIntervalMilliseconds = 100;

    return Task.Run(async () =>
    {
        while (true)
        {
            string PageState = (string)( (IJavaScriptExecutor)driver ).ExecuteScript("return document.readyState");
            if (PageState == "interactive" || PageState == "complete")
            {
                return driver.PageSource;
            }

            await Task.Delay(pollIntervalMilliseconds);
        }
    });
}
/// <summary>
/// Creates a Chrome driver from the default driver service with the command prompt
/// window hidden, <c>PageLoadStrategy.None</c>, and three Chrome command-line switches.
/// </summary>
/// <returns>A new <c>ChromeDriver</c>; the caller is responsible for quitting and disposing it.</returns>
static ChromeDriver CustomizedDriver()
{
    ChromeDriverService chromeService = ChromeDriverService.CreateDefaultService();
    chromeService.HideCommandPromptWindow = true;
    ChromeOptions options = new ChromeOptions();
    options.PageLoadStrategy = PageLoadStrategy.None;
    // Each switch is passed as its own argument; a single space-joined string would be
    // handed to Chrome as one argument instead of three separate switches.
    options.AddArguments("--headless", "--disable-cookies", "--blink-settings=imagesEnabled=false");
    return new ChromeDriver(chromeService, options);
}
c# selenium-webdriver asynchronous async-await task
1个回答
0
投票

您不应该一次创建大量任务:每个任务都会启动一个独立的 Chrome 实例,同时运行近 200 个实例会耗尽内存和 CPU。相反,您应该使用 Parallel.ForEachAsync 并通过 MaxDegreeOfParallelism 限制并发数量,如本示例所示:

    /// <summary>
    /// Fetches the population rank for every country with at most
    /// <c>MaxDegreeOfParallelism</c> fetches running at once, then prints each result.
    /// </summary>
    /// <remarks>
    /// Results are collected in a <c>ConcurrentBag</c>, so they are not guaranteed to be
    /// printed in the order of the country list. The method remains <c>async void</c>
    /// to keep its signature; a caller cannot await it or observe its exceptions.
    /// </remarks>
    static async void GetData()
    {
        string website = "......";
        List<string> Countries = new List<string>()
        {
            // 195 Countries
        };

        var results = new ConcurrentBag<JObject>();

        var parallelOptions = new ParallelOptions()
        {
            MaxDegreeOfParallelism = 10 // Here you control how many countries in parallel to process.
        };

        // The loop must be awaited; otherwise the results would be printed before any fetch completes.
        await Parallel.ForEachAsync(Countries, parallelOptions, async (country, token) =>
        {
            var result = await FetchData(website + "/" + country);
            results.Add(result);
        });

        foreach (JObject populationRank in results)
        {
            WriteLine(populationRank);
        }
    }

/// <summary>
/// Opens <paramref name="URL"/> in a fresh Chrome driver and extracts the text of the
/// <c>div</c> whose id is <c>popRank</c>.
/// </summary>
/// <param name="URL">Address of the page to load.</param>
/// <returns>
/// A task producing a <c>JObject</c> with a single <c>"PopulationRank"</c> property
/// holding the inner text of the matched node.
/// </returns>
static async Task<JObject> FetchData(string URL)
{
    // CustomizedDriver is a factory method, not a type, so it is called without `new`.
    ChromeDriver myDriver = CustomizedDriver();
    try
    {
        myDriver.Navigate().GoToUrl(URL);
        HtmlDocument Document = new HtmlDocument();
        Document.LoadHtml(await myDriver.GetPageSourceAsync());
        // NOTE(review): if the XPath matches nothing this dereference will presumably
        // throw; confirm against HtmlAgilityPack's SelectSingleNode contract.
        return new JObject()
        {
            ["PopulationRank"] = Document.DocumentNode.SelectSingleNode("//div[@id='popRank']").InnerText
        };
    }
    finally
    {
        // Always shut the browser down, even when navigation or parsing fails,
        // so a failed fetch does not leave a Chrome process behind.
        myDriver.Quit();
        myDriver.Dispose();
    }
}
/// <summary>
/// Polls the page's <c>document.readyState</c> on a thread-pool thread and returns the
/// driver's page source once the state is <c>"interactive"</c> or <c>"complete"</c>.
/// </summary>
/// <param name="driver">Driver to poll; it is cast to <c>IJavaScriptExecutor</c>.</param>
/// <returns>A task producing the page source.</returns>
/// <remarks>There is no timeout: the loop runs until one of the two states is observed.</remarks>
static Task<string> GetPageSourceAsync(this IWebDriver driver)
{
    // Pause between polls so the loop does not spin a core and flood the driver with requests.
    const int pollIntervalMilliseconds = 100;

    return Task.Run(async () =>
    {
        while (true)
        {
            string PageState = (string)( (IJavaScriptExecutor)driver ).ExecuteScript("return document.readyState");
            if (PageState == "interactive" || PageState == "complete")
            {
                return driver.PageSource;
            }

            await Task.Delay(pollIntervalMilliseconds);
        }
    });
}
/// <summary>
/// Creates a Chrome driver from the default driver service with the command prompt
/// window hidden, <c>PageLoadStrategy.None</c>, and three Chrome command-line switches.
/// </summary>
/// <returns>A new <c>ChromeDriver</c>; the caller is responsible for quitting and disposing it.</returns>
static ChromeDriver CustomizedDriver()
{
    ChromeDriverService chromeService = ChromeDriverService.CreateDefaultService();
    chromeService.HideCommandPromptWindow = true;
    ChromeOptions options = new ChromeOptions();
    options.PageLoadStrategy = PageLoadStrategy.None;
    // Each switch is passed as its own argument; a single space-joined string would be
    // handed to Chrome as one argument instead of three separate switches.
    options.AddArguments("--headless", "--disable-cookies", "--blink-settings=imagesEnabled=false");
    return new ChromeDriver(chromeService, options);
}
© www.soinside.com 2019 - 2024. All rights reserved.