我想用 Selenium 和 HtmlAgilityPack(C#)从某个网站抓取每个国家的人口排名。这段代码在只请求 10 个国家时运行正常,但当我请求全部国家时,由于同时启动的任务过多,程序变得非常缓慢,甚至崩溃。有什么解决办法?
/// <summary>
/// Starts one <c>FetchData</c> task per country, waits for all of them to finish,
/// and prints every result.
/// </summary>
/// <remarks>
/// All tasks are created before any of them is awaited, so every country is
/// requested at the same time with no limit on concurrency.
/// </remarks>
static async void GetData()
{
    string website = "......";
    List<string> countries = new List<string>()
    {
        // 195 Countries
    };

    // Kick off every request up front, in list order.
    List<Task<JObject>> pending = countries
        .Select(name => FetchData(website + "/" + name))
        .ToList();

    // WhenAll hands back the results in the same order as the tasks.
    JObject[] ranks = await Task.WhenAll(pending);
    foreach (JObject rank in ranks)
    {
        WriteLine(rank);
    }
}
/// <summary>
/// Loads <paramref name="URL"/> in a new Chrome instance on a background task and
/// extracts the text of the <c>div</c> whose id is <c>popRank</c>.
/// </summary>
/// <param name="URL">Address of the country page to load.</param>
/// <returns>
/// A task producing a <see cref="JObject"/> with a single <c>PopulationRank</c> entry.
/// </returns>
/// <exception cref="InvalidOperationException">
/// The loaded page contains no <c>popRank</c> element.
/// </exception>
static Task<JObject> FetchData(string URL)
{
    return Task.Run(async () =>
    {
        // CustomizedDriver is a factory method, so it is invoked rather than
        // constructed with `new`.
        ChromeDriver myDriver = CustomizedDriver();
        try
        {
            myDriver.Navigate().GoToUrl(URL);

            HtmlDocument document = new HtmlDocument();
            document.LoadHtml(await myDriver.GetPageSourceAsync());

            // SelectSingleNode yields null when nothing matches; fail with a
            // message that names the page instead of a bare NullReferenceException.
            var rankNode = document.DocumentNode.SelectSingleNode("//div[@id='popRank']");
            if (rankNode == null)
            {
                throw new InvalidOperationException("No element with id 'popRank' was found at " + URL);
            }

            return new JObject()
            {
                ["PopulationRank"] = rankNode.InnerText
            };
        }
        finally
        {
            // Shut the browser down on every path, including failures, so that
            // Chrome processes do not pile up.
            myDriver.Quit();
            myDriver.Dispose();
        }
    });
}
/// <summary>
/// Polls the page's <c>document.readyState</c> until it reports <c>interactive</c>
/// or <c>complete</c>, then returns the driver's page source.
/// </summary>
/// <param name="driver">Driver whose current page is inspected.</param>
/// <returns>A task producing the page source once the document is ready.</returns>
/// <remarks>
/// NOTE(review): the loop has no upper bound; a page that never becomes ready
/// keeps this task alive indefinitely. Consider adding a timeout.
/// </remarks>
static Task<string> GetPageSourceAsync(this IWebDriver driver)
{
    // Pause between polls so a waiting page does not spin a thread or flood
    // the driver with script commands.
    const int pollIntervalMilliseconds = 100;

    return Task.Run(async () =>
    {
        while (true)
        {
            string pageState = (string)((IJavaScriptExecutor)driver).ExecuteScript("return document.readyState");
            if (pageState == "interactive" || pageState == "complete")
            {
                return driver.PageSource;
            }

            await Task.Delay(pollIntervalMilliseconds);
        }
    });
}
/// <summary>
/// Creates a headless <see cref="ChromeDriver"/> with the command prompt window
/// hidden, image loading disabled, and <see cref="PageLoadStrategy.None"/>.
/// </summary>
/// <returns>A new driver; the caller is responsible for quitting and disposing it.</returns>
static ChromeDriver CustomizedDriver()
{
    ChromeDriverService chromeService = ChromeDriverService.CreateDefaultService();
    chromeService.HideCommandPromptWindow = true;

    ChromeOptions options = new ChromeOptions();
    options.PageLoadStrategy = PageLoadStrategy.None;

    // Each switch must be its own argument; a single space-separated string is
    // handed to Chrome as one malformed switch.
    options.AddArguments("--headless", "--disable-cookies", "--blink-settings=imagesEnabled=false");

    return new ChromeDriver(chromeService, options);
}
您不应该一次性创建大量任务,因为每个任务都会启动一个独立的 Chrome 实例。相反,您应该使用 Parallel.ForEachAsync,并通过 MaxDegreeOfParallelism 限制同时处理的国家数量,如下例所示:
/// <summary>
/// Fetches the population rank of every country with at most
/// <c>MaxDegreeOfParallelism</c> requests in flight at once, then prints the results.
/// </summary>
/// <remarks>
/// Results are collected in a <see cref="ConcurrentBag{T}"/>, so they may be printed
/// in a different order than the countries are listed.
/// </remarks>
static async void GetData()
{
    string website = "......";
    List<string> Countries = new List<string>()
    {
        // 195 Countries
    };
    var results = new ConcurrentBag<JObject>();
    var parallelOptions = new ParallelOptions()
    {
        MaxDegreeOfParallelism = 10 // Here you control how many countries in parallel to process.
    };

    // The call must be awaited; otherwise the results would be read before any
    // fetch has finished.
    await Parallel.ForEachAsync(Countries, parallelOptions, async (country, token) =>
    {
        var result = await FetchData(website + "/" + country);
        results.Add(result);
    });

    foreach (JObject populationRank in results)
    {
        WriteLine(populationRank);
    }
}
/// <summary>
/// Loads <paramref name="URL"/> in a new Chrome instance and extracts the text of
/// the <c>div</c> whose id is <c>popRank</c>.
/// </summary>
/// <param name="URL">Address of the country page to load.</param>
/// <returns>
/// A task producing a <see cref="JObject"/> with a single <c>PopulationRank</c> entry.
/// </returns>
/// <exception cref="InvalidOperationException">
/// The loaded page contains no <c>popRank</c> element.
/// </exception>
static async Task<JObject> FetchData(string URL)
{
    // CustomizedDriver is a factory method, so it is invoked rather than
    // constructed with `new`.
    ChromeDriver myDriver = CustomizedDriver();
    try
    {
        myDriver.Navigate().GoToUrl(URL);

        HtmlDocument document = new HtmlDocument();
        document.LoadHtml(await myDriver.GetPageSourceAsync());

        // SelectSingleNode yields null when nothing matches; fail with a message
        // that names the page instead of a bare NullReferenceException.
        var rankNode = document.DocumentNode.SelectSingleNode("//div[@id='popRank']");
        if (rankNode == null)
        {
            throw new InvalidOperationException("No element with id 'popRank' was found at " + URL);
        }

        return new JObject()
        {
            ["PopulationRank"] = rankNode.InnerText
        };
    }
    finally
    {
        // Shut the browser down on every path, including failures, so that
        // Chrome processes do not pile up across the many countries processed.
        myDriver.Quit();
        myDriver.Dispose();
    }
}
/// <summary>
/// Waits until the page's <c>document.readyState</c> is <c>interactive</c> or
/// <c>complete</c>, then returns the driver's page source.
/// </summary>
/// <param name="driver">Driver whose current page is inspected.</param>
/// <returns>A task producing the page source once the document is ready.</returns>
/// <remarks>
/// NOTE(review): there is no timeout; a page that never reaches either state
/// keeps this task polling forever.
/// </remarks>
static Task<string> GetPageSourceAsync(this IWebDriver driver)
{
    // Short pause between checks: without it each waiting page would busy-spin
    // a thread-pool thread and send script commands back-to-back.
    const int pollIntervalMilliseconds = 100;

    return Task.Run(async () =>
    {
        while (true)
        {
            string pageState = (string)((IJavaScriptExecutor)driver).ExecuteScript("return document.readyState");
            if (pageState == "interactive" || pageState == "complete")
            {
                return driver.PageSource;
            }

            await Task.Delay(pollIntervalMilliseconds);
        }
    });
}
/// <summary>
/// Builds a headless <see cref="ChromeDriver"/> whose service hides the command
/// prompt window, with images disabled and <see cref="PageLoadStrategy.None"/>.
/// </summary>
/// <returns>A new driver; the caller must quit and dispose it when finished.</returns>
static ChromeDriver CustomizedDriver()
{
    ChromeDriverService chromeService = ChromeDriverService.CreateDefaultService();
    chromeService.HideCommandPromptWindow = true;

    ChromeOptions options = new ChromeOptions();
    options.PageLoadStrategy = PageLoadStrategy.None;

    // Pass every switch as a separate argument; combining them into one
    // space-separated string makes Chrome receive a single malformed switch.
    options.AddArguments("--headless", "--disable-cookies", "--blink-settings=imagesEnabled=false");

    return new ChromeDriver(chromeService, options);
}