我想开发一个应用程序:向它提供某个网站的网址,它从该网页中提取所有链接,并获取每个链接所指向页面的HTML内容。我的思路基于深度爬取(deep crawling)的概念,目的是获取该网站上的所有电子邮件地址。下面是我的源代码:
/// <summary>
/// Collects every e-mail-looking substring of <paramref name="data"/>.
/// </summary>
/// <param name="data">Text (typically a page's HTML) to scan.</param>
/// <returns>
/// All matches in order of appearance, each followed by a comma
/// (so a non-empty result always ends with ","); an empty string when nothing matches.
/// </returns>
static string ExtractEmails(string data)
{
    // Same pattern and options as before; only the way matches are folded together differs.
    var addressPattern = new Regex(@"\w+([-+.]\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*", RegexOptions.IgnoreCase);

    var terminatedAddresses = addressPattern
        .Matches(data)
        .Cast<Match>()
        .Select(hit => hit.Value + ",");

    return string.Concat(terminatedAddresses);
}
// Every link that Foo downloads is recorded here as a ParsResult.
static readonly List<ParsResult> _results = new List<ParsResult>();
// Recursion limit: Foo returns immediately once its depth argument reaches this value.
static Int32 _maxDepth = 4;
/// <summary>
/// Downloads a page, looks at the anchors on it and extracts e-mail addresses
/// from the page the anchor links to, then recurses into that page.
/// As written, only the first anchor that has an href is processed, because the
/// loop body ends with a return.
/// </summary>
/// <param name="urlToCheck">Address of the page to download; when null, parent.Url is used instead.</param>
/// <param name="depth">Current recursion depth; nothing is downloaded once it reaches _maxDepth.</param>
/// <param name="parent">Result that led to this call; stored as Parent on the ParsResult created here.</param>
/// <returns>
/// The ExtractEmails output for the first anchor that has an href, or an empty string
/// when the depth limit is hit or the page has no such anchor.
/// </returns>
static String Foo(String urlToCheck = null, Int32 depth = 0, ParsResult parent = null)
{
string email = "";
// Depth limit reached: stop before touching the network.
if (depth >= _maxDepth) return email;
String html;
// parent is dereferenced only when urlToCheck is null.
using (var wc = new WebClient())
html = wc.DownloadString(urlToCheck ?? parent.Url);
var doc = new HtmlDocument();
doc.LoadHtml(html);
// All <a> elements of the page; the result is checked for null before use.
var aNods = doc.DocumentNode.SelectNodes("//a");
if (aNods == null || !aNods.Any()) return email;
foreach (var aNode in aNods)
{
var url = aNode.Attributes["href"];
// Anchors without an href are skipped.
if (url == null)
continue;
// Unlike the client above, this one is not wrapped in a using block.
var wc2 = new WebClient();
// The href value is passed to DownloadString exactly as it appears in the page.
String html2 = wc2.DownloadString(url.Value);
// Plain assignment: whatever email held before is replaced, not appended to.
email = ExtractEmails(html2);
Console.WriteLine(email);
var result = new ParsResult
{
Depth = depth,
Parent = parent,
Url = url.Value
};
_results.Add(result);
Console.WriteLine("{0} - {1}", depth, result.Url);
// The string returned by the recursive call is discarded.
Foo(depth: depth + 1, parent: result);
// Returning here leaves the loop after the first anchor that has an href.
return email;
}
return email;
}
/// <summary>
/// Starts the crawl at the hard-coded site and prints whatever Foo returns.
/// </summary>
static void Main(string[] args)
{
    Console.WriteLine("emails " + Foo("http://www.mobileridoda.com", 0));
}
我想在控制台中显示从主页DOM内所有链接所指向的页面中提取到的全部电子邮件地址,但控制台中没有显示任何电子邮件。我该如何解决这个问题?谢谢
我发现了几处错误,不过不用担心,下面详细说明原因以及解决方法。
在您的foreach循环中遍历所有URL时,循环体末尾有一条return语句,因此对每个URL列表实际上只循环了一次。
在循环过程中,您每次都覆盖了email变量(我把它当作CSV字符串)。请使用 += 进行追加。
在foreach循环中调用Foo时,您没有使用它的返回值。您需要写成 email += Foo(xyz)。
您会重复访问已经处理过的URL,这可能导致无限循环。我添加了一个字符串集合来记录已访问过的URL,以避免陷入无限循环。
这是一个完整的可行解决方案。
/// <summary>
/// Collects the distinct e-mail-looking substrings of <paramref name="data"/>.
/// </summary>
/// <param name="data">Text (typically a page's HTML) to scan; may be null or empty.</param>
/// <returns>
/// Each distinct address in order of first appearance, followed by a comma.
/// The trailing comma is kept on purpose: callers concatenate the results of
/// several pages and rely on it as the separator. Returns an empty string when
/// <paramref name="data"/> is null/empty or contains no address.
/// </returns>
static string ExtractEmails(string data)
{
    if (string.IsNullOrEmpty(data))
    {
        return "";
    }

    const string emailPattern = @"\w+([-+.]\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*";

    // The static Regex.Matches overload goes through the framework's regex cache,
    // so the pattern is not re-parsed on every call.
    MatchCollection emailMatches = Regex.Matches(data, emailPattern, RegexOptions.IgnoreCase);

    // A page often repeats the same address (header, footer, mailto link);
    // report it once, ignoring case differences.
    var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    var terminated = new List<string>();
    foreach (Match emailMatch in emailMatches)
    {
        if (seen.Add(emailMatch.Value))
        {
            terminated.Add(emailMatch.Value + ",");
        }
    }

    return string.Concat(terminated);
}
// Every link that was downloaded is recorded here as a ParsResult.
static readonly List<ParsResult> _results = new List<ParsResult>();
// Recursion limit: pages are only parsed for links while depth is below this value.
static Int32 _maxDepth = 4;
// Absolute URLs (fragment removed) that have already been claimed for download.
// A set keeps the membership test O(1) however many pages are crawled.
static readonly HashSet<string> urlsAlreadyVisited = new HashSet<string>(StringComparer.Ordinal);

/// <summary>
/// Crawls from one page: extracts the e-mail addresses of that page, then follows
/// its links (relative ones included) and extracts the addresses of every linked
/// page, down to <c>_maxDepth</c> levels. Each URL is downloaded at most once.
/// </summary>
/// <param name="urlToCheck">Absolute http(s) address of the start page; when null, <c>parent.Url</c> is used.</param>
/// <param name="depth">Depth of the start page; nothing is done once it reaches <c>_maxDepth</c>.</param>
/// <param name="parent">Result that led to the start page; becomes <c>Parent</c> of the links found on it.</param>
/// <returns>
/// The concatenated <c>ExtractEmails</c> output of every page visited by this call,
/// or an empty string when the depth limit is reached or the page was already visited.
/// </returns>
/// <exception cref="ArgumentException">Neither argument supplies an absolute http(s) URL.</exception>
/// <exception cref="WebException">The start page itself cannot be downloaded.</exception>
static String Foo(String urlToCheck = null, Int32 depth = 0, ParsResult parent = null)
{
    // Check the depth first so a URL that is merely "too deep" on this path is not
    // marked as visited and can still be crawled when reached by a shorter path.
    if (depth >= _maxDepth)
    {
        return string.Empty;
    }

    // Resolve the effective address BEFORE the visited check: keying the check on
    // urlToCheck alone would compare null for every call made through parent.
    string startAddress = urlToCheck ?? (parent != null ? parent.Url : null);
    Uri startUri;
    if (!TryResolveHttpUri(null, startAddress, out startUri))
    {
        throw new ArgumentException(
            "An absolute http(s) URL is required, either directly or through parent.Url.",
            "urlToCheck");
    }

    if (!urlsAlreadyVisited.Add(startUri.AbsoluteUri))
    {
        return string.Empty;
    }

    // A failure on the start page is deliberately not swallowed: without it
    // there is nothing to crawl and the caller should know.
    String html;
    using (var wc = new WebClient())
    {
        html = wc.DownloadString(startUri);
    }

    string email = ExtractEmails(html);
    if (!string.IsNullOrEmpty(email))
    {
        Console.WriteLine(email);
    }

    email += CrawlLinks(startUri, html, depth, parent);
    return email;
}

// Follows every not-yet-visited http(s) link found in html (the content of pageUri),
// returning the addresses extracted from the linked pages and, recursively, from theirs.
// NOTE: links to other hosts are followed too; only _maxDepth bounds the crawl.
static string CrawlLinks(Uri pageUri, string html, int depth, ParsResult parent)
{
    var doc = new HtmlDocument();
    doc.LoadHtml(html);
    var aNods = doc.DocumentNode.SelectNodes("//a");
    if (aNods == null)
    {
        return string.Empty;
    }

    string email = "";
    foreach (var aNode in aNods)
    {
        // Anchors without an href (e.g. named anchors) have nothing to follow.
        var href = aNode.Attributes["href"];
        if (href == null)
        {
            continue;
        }

        Uri linkUri;
        if (!TryResolveHttpUri(pageUri, href.Value, out linkUri))
        {
            continue;
        }

        // Add returns false when the URL is already known; this also removes
        // duplicates within the same page.
        if (!urlsAlreadyVisited.Add(linkUri.AbsoluteUri))
        {
            continue;
        }

        string linkedHtml;
        try
        {
            using (var wc2 = new WebClient())
            {
                linkedHtml = wc2.DownloadString(linkUri);
            }
        }
        catch (WebException ex)
        {
            // Best effort: a dead or forbidden link must not abort the crawl,
            // but it should not disappear silently either.
            Console.Error.WriteLine("Skipping {0}: {1}", linkUri, ex.Message);
            continue;
        }

        string found = ExtractEmails(linkedHtml);
        if (!string.IsNullOrEmpty(found))
        {
            Console.WriteLine(found);
        }
        email += found;

        var result = new ParsResult
        {
            Depth = depth,
            Parent = parent,
            Url = linkUri.AbsoluteUri
        };
        _results.Add(result);
        Console.WriteLine("{0} - {1}", depth, result.Url);

        // Reuse the HTML that was just downloaded instead of fetching the page again.
        if (depth + 1 < _maxDepth)
        {
            email += CrawlLinks(linkUri, linkedHtml, depth + 1, result);
        }
    }

    return email;
}

// Turns link into an absolute http/https URI without fragment, resolving it against
// baseUri when one is given. Returns false for empty, malformed or non-web links
// (mailto:, javascript:, file paths, ...).
static bool TryResolveHttpUri(Uri baseUri, string link, out Uri resolved)
{
    resolved = null;
    if (string.IsNullOrWhiteSpace(link))
    {
        return false;
    }

    Uri candidate;
    bool parsed;
    if (baseUri != null)
    {
        parsed = Uri.TryCreate(baseUri, link.Trim(), out candidate);
    }
    else
    {
        parsed = Uri.TryCreate(link.Trim(), UriKind.Absolute, out candidate);
    }

    if (!parsed)
    {
        return false;
    }

    if (candidate.Scheme != Uri.UriSchemeHttp && candidate.Scheme != Uri.UriSchemeHttps)
    {
        return false;
    }

    // "page.html#top" and "page.html#contact" are the same document.
    resolved = new Uri(candidate.GetLeftPart(UriPartial.Query));
    return true;
}
/// <summary>One link recorded by the crawler, together with where it was found.</summary>
public class ParsResult
{
/// <summary>Value of the depth argument of the Foo call that recorded this link.</summary>
public int Depth { get; set; }
/// <summary>Result passed as parent to that Foo call; null when Foo was called without one, as Main does.</summary>
public ParsResult Parent { get; set; }
/// <summary>Address of the linked page.</summary>
public string Url { get; set; }
}
// ========== MAIN CLASS ==========
/// <summary>
/// Entry point: crawls from the URL given as first command-line argument, or from
/// the built-in default site when none is given, and prints the addresses Foo returns.
/// </summary>
/// <param name="args">Optional; <c>args[0]</c> is the absolute URL to start from.</param>
static void Main(string[] args)
{
    const string defaultStartUrl = "http://www.mobileridoda.com";

    string startUrl = defaultStartUrl;
    if (args != null && args.Length > 0 && !string.IsNullOrWhiteSpace(args[0]))
    {
        startUrl = args[0];
    }

    String res = Foo(startUrl, 0);

    // ExtractEmails terminates every address with a comma; drop the dangling one for display.
    Console.WriteLine("emails " + (res ?? string.Empty).TrimEnd(','));
}