Extract email addresses from a website for every link inside the page DOM

Problem description · Votes: 0 · Answers: 1

I want to develop an application where I supply the URL of a particular website and it extracts all the links from that web page. For each extracted link I want to fetch the HTML content. I am building on the concept of a depth crawl. My goal is to collect every email address on the website. Below is my source code:

    static string ExtractEmails(string data)
    {
        //instantiate with this pattern
        Regex emailRegex = new Regex(@"\w+([-+.]\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*", RegexOptions.IgnoreCase);
        //find items that match our pattern
        MatchCollection emailMatches = emailRegex.Matches(data);

        //StringBuilder sb = new StringBuilder();
        string s = "";
        foreach (Match emailMatch in emailMatches)
        {
            //sb.AppendLine(emailMatch.Value);
            s += emailMatch.Value + ",";
        }
        return s;
    }

    static readonly List<ParsResult> _results = new List<ParsResult>();
    static Int32 _maxDepth = 4;

    static String Foo(String urlToCheck = null, Int32 depth = 0, ParsResult parent = null)
    {
        string email = "";
        if (depth >= _maxDepth) return email;

        String html;
        using (var wc = new WebClient())
            html = wc.DownloadString(urlToCheck ?? parent.Url);

        var doc = new HtmlDocument();
        doc.LoadHtml(html);
        var aNods = doc.DocumentNode.SelectNodes("//a");
        if (aNods == null || !aNods.Any()) return email;

        foreach (var aNode in aNods)
        {
            var url = aNode.Attributes["href"];
            if (url == null)
                continue;

            var wc2 = new WebClient();
            String html2 = wc2.DownloadString(url.Value);
            email = ExtractEmails(html2);
            Console.WriteLine(email);

            var result = new ParsResult
            {
                Depth = depth,
                Parent = parent,
                Url = url.Value
            };
            _results.Add(result);
            Console.WriteLine("{0} - {1}", depth, result.Url);
            Foo(depth: depth + 1, parent: result);
            return email;
        }
        return email;
    }

    static void Main(string[] args)
    {
        String res = Foo("http://www.mobileridoda.com", 0);
        Console.WriteLine("emails " + res);
    }

I expect the console to print every email extracted from all the pages behind the links in the home page's DOM, but nothing at all is printed to the console. How can I fix this? Thanks.

c# web web-scraping web-crawler html-agility-pack
1 Answer

0 votes

I found a few mistakes, but no worries: below I explain why each one happens and how to fix it.

  1. In your foreach loop, where you iterate over all the URLs, you have a return statement at the end, so in effect you only ever process the first URL in the list.

  2. Each time through the loop you overwrite email (which I treat as a CSV string). Use += so that you keep appending instead.

  3. When you call Foo inside the foreach loop, you do nothing with what it returns. You need email += Foo(xyz). (Errors 1-3 are illustrated in the short sketch right after this list.)

  4. You are revisiting URLs that have already been processed, which can lead to an infinite loop. I added a list of strings that tracks the URLs you have already visited, so you cannot fall into one.
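To make errors 1-3 concrete, here is a minimal before/after sketch. Broken and Fixed are hypothetical helpers of my own, not part of either listing; they assume the ExtractEmails method above and take a list of already-downloaded page bodies. Error 3 is the same += fix applied to the return value of the recursive Foo call.

    // Hypothetical helpers contrasting the buggy and the corrected loop shape.
    static string Broken(List<string> pages)
    {
        string email = "";
        foreach (string page in pages)
        {
            email = ExtractEmails(page); // error 2: '=' overwrites earlier results
            return email;                // error 1: exits on the FIRST page
        }
        return email;
    }

    static string Fixed(List<string> pages)
    {
        string email = "";
        foreach (string page in pages)
            email += ExtractEmails(page); // '+=' accumulates across every page
        return email;                     // one return, after the loop finishes
    }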

Here is a complete working solution:

    // Required namespaces (place at the top of the file):
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Net;
    using System.Text.RegularExpressions;
    using HtmlAgilityPack; // Install-Package HtmlAgilityPack

    static string ExtractEmails(string data)
    {

        //instantiate with this pattern 
        Regex emailRegex = new Regex(@"\w+([-+.]\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*", RegexOptions.IgnoreCase);
        //find items that match our pattern
        MatchCollection emailMatches = emailRegex.Matches(data);

        //StringBuilder sb = new StringBuilder();
        string s = "";
        foreach (Match emailMatch in emailMatches)
        {
            //sb.AppendLine(emailMatch.Value);
            s += emailMatch.Value + ",";
        }
        return s;
    }

    static readonly List<ParsResult> _results = new List<ParsResult>();
    static Int32 _maxDepth = 4;
    static List<string> urlsAlreadyVisited = new List<string>();

    static String Foo(String urlToCheck = null, Int32 depth = 0, ParsResult parent = null)
    {
        // Resolve the page URL up front: recursive calls pass null for
        // urlToCheck and supply the URL through parent.Url instead.
        string pageUrl = urlToCheck ?? parent.Url;
        if (urlsAlreadyVisited.Contains(pageUrl))
            return string.Empty;
        else
            urlsAlreadyVisited.Add(pageUrl);

        string email = "";
        if (depth >= _maxDepth) return email;
        String html;
        using (var wc = new WebClient())
            html = wc.DownloadString(pageUrl);

        var doc = new HtmlDocument();
        doc.LoadHtml(html);
        var aNods = doc.DocumentNode.SelectNodes("//a");
        if (aNods == null || !aNods.Any()) return email;

        // Get distinct absolute URLs from all the <a> elements on this page,
        // skipping anchors that carry no href attribute (those would throw).
        List<string> allUrls = aNods
            .Where(x => x.Attributes["href"] != null)
            .Select(x => x.Attributes["href"].Value)
            .Where(url => url.StartsWith("http"))
            .Distinct()
            .ToList();

        foreach (string url in allUrls)
        {
            var wc2 = new WebClient();
            try
            {
                email += ExtractEmails(wc2.DownloadString(url));
            }
            catch { /* Swallow Exception ... URL not found or other errors. */ continue; }

            Console.WriteLine(email);
            var result = new ParsResult
            {
                Depth = depth,
                Parent = parent,
                Url = url
            };
            _results.Add(result);
            Console.WriteLine("{0} - {1}", depth, result.Url);
            email += Foo(depth: depth + 1, parent: result);
        }
        return email;
    }
    public class ParsResult
    {
        public int Depth { get; set; }
        public ParsResult Parent { get; set; }
        public string Url { get; set; }
    }

    // ========== MAIN METHOD ==========

    static void Main(string[] args)
    {
        String res = Foo("http://www.mobileridoda.com", 0);
        Console.WriteLine("emails " + res);
    }