C# 如何从Web浏览器控件的选定文本中提取html表格

问题描述 投票:0回答:1

我有一个网络浏览器控件,可以在其中加载网站。网页有许多表格数据,用户将选择这些数据并需要解析这些数据并在 datagridview 中显示。

这是我从网络浏览器控件中提取选定文本的方式。

/// <summary>
/// Returns the plain text of the current selection in <c>webBrowser1</c>.
/// </summary>
/// <returns>
/// The selected text, or an empty string when no document is loaded, the
/// document exposes no selection/range object, or the range has no text.
/// </returns>
private string GetSelectedText()
{
    // WebBrowser.Document is null while no page is loaded.
    if (webBrowser1.Document == null)
    {
        return string.Empty;
    }

    // The DOM is accessed late-bound (dynamic) through the underlying COM document.
    dynamic document = webBrowser1.Document.DomDocument;
    if (document == null)
    {
        return string.Empty;
    }

    dynamic selection = document.selection;
    if (selection == null)
    {
        return string.Empty;
    }

    dynamic range = selection.createRange();
    if (range == null)
    {
        return string.Empty;
    }

    string text = range.text;
    return text ?? string.Empty;
}

现在要从选定的纯文本中正确提取数据非常困难。所以我的问题是:能否获取所选内容对应的 HTML 数据?

这些是我需要解析数据的网站。 https://www.sec.gov/Archives/edgar/data/1108134/000110813423000018/bhlb-20230630.htm https://www.sec.gov/Archives/edgar/data/66740/000006674023000058/mmm-20230630.htm

这是我当前的例程,我用它来解析选定的数据,但我遵循的方法不是很好。

/// <summary>Text that <c>Form2_Load</c> splits into lines and parses into grid rows.</summary>
public string SelectedText { get; set; }
/// <summary>
/// Parses <c>SelectedText</c> line by line and fills <c>dgv</c>.
/// The first line is used to decide how many grid columns to create; every
/// non-empty line from the first one containing "Dollars in millions" onwards
/// becomes a grid row made of a line-item name followed by its numeric values.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void Form2_Load(object sender, EventArgs e)
{
    // Nothing selected: nothing to parse or display.
    if (string.IsNullOrEmpty(SelectedText))
    {
        return;
    }

    // Characters stripped from a line when isolating its line-item name.
    char[] labelNoise = new char[] { '%', '€', ';', ',', '.', '$', '£', '(', ')' };

    string[] lines = SelectedText.Split(new string[] { "\r\n" }, StringSplitOptions.None);

    // ---- Header: derived once from the first line --------------------------
    // The parenthesised, digit-free part of the header (e.g. "(Dollars in
    // millions)") is taken as the name of the left-most column.
    string headerLine = lines[0];
    string leftColumnName = "";
    Match headerMatch = Regex.Match(headerLine, "\\(\\D*\\)", RegexOptions.IgnoreCase);
    if (headerMatch.Success)
    {
        leftColumnName = headerMatch.Value;
    }

    string headerRest = headerLine.Trim();
    // string.Replace throws ArgumentException for an empty search string, so
    // only strip the left column name when one was actually found.
    if (leftColumnName.Trim().Length > 0)
    {
        headerRest = headerRest.Replace(leftColumnName.Trim(), "");
    }

    List<string> columns = headerRest.Trim().Split(new char[] { ' ' }).ToList();
    columns.Insert(0, leftColumnName);

    // Only the number of header tokens matters: the grid columns are created
    // with empty header text and sorting disabled.
    if (dgv.Columns.Count < columns.Count)
    {
        for (int colNumber = 1; colNumber <= columns.Count; colNumber++)
        {
            string colName = "col_" + colNumber;
            dgv.Columns.Add(colName, "");
            dgv.Columns[colName].SortMode = DataGridViewColumnSortMode.NotSortable;
        }
    }

    // ---- Rows ---------------------------------------------------------------
    bool startparse = false;
    foreach (string line in lines)
    {
        // Skip blank lines and everything before the "Dollars in millions" line.
        if (line == "" || !(line.Contains("Dollars in millions") || startparse))
        {
            continue;
        }
        startparse = true;

        // Line-item name: the line with digits, hyphens and punctuation removed.
        string lineitem = Regex.Replace(line.Trim(), @"[\d-1]", string.Empty);
        lineitem = ReplaceMultipleChar(lineitem, labelNoise, string.Empty).Trim();
        if (lineitem == "")
        {
            continue;
        }

        // Numeric part: "(123)" becomes "-123", thousands separators are dropped.
        string strValues = "";
        if (line.Length > lineitem.Length)
        {
            strValues = GetNumericData(line).Trim()
                .Replace("(", "-").Replace(")", " ").Replace(",", "").Trim();
        }

        List<string> colvalues = strValues.Trim().Split(new char[] { ' ' }).ToList();
        colvalues.Insert(0, lineitem);
        dgv.Rows.Add(colvalues.ToArray());
    }
}
/// <summary>
/// Filters <paramref name="input"/> down to the characters that can belong to
/// a number: digits, '.', '-', spaces, and parentheses that directly wrap a
/// digit ('(' only when followed by a digit, ')' only when preceded by one).
/// </summary>
/// <param name="input">Raw line text; may be null or empty.</param>
/// <returns>The kept characters in their original order; empty for null/empty input.</returns>
private string GetNumericData(string input)
{
    if (string.IsNullOrEmpty(input))
    {
        return string.Empty;
    }

    System.Text.StringBuilder output = new System.Text.StringBuilder(input.Length);
    for (int i = 0; i < input.Length; i++)
    {
        char c = input[i];
        bool keep;

        if (c == '(')
        {
            // Bounds check must come first: '(' may be the last character.
            keep = i + 1 < input.Length && Char.IsDigit(input[i + 1]);
        }
        else if (c == ')')
        {
            // Bounds check must come first: ')' may be the first character.
            keep = i > 0 && Char.IsDigit(input[i - 1]);
        }
        else
        {
            keep = c == '.' || c == '-' || c == ' ' || Char.IsDigit(c);
        }

        if (keep)
        {
            output.Append(c);
        }
    }
    return output.ToString();
}
/// <summary>
/// Splits <paramref name="s"/> on any of <paramref name="separators"/>,
/// discards empty pieces, and joins the remaining pieces with
/// <paramref name="newVal"/>.
/// </summary>
/// <param name="s">Text to process.</param>
/// <param name="separators">Characters to split on.</param>
/// <param name="newVal">Text placed between the surviving pieces.</param>
/// <returns>The re-joined text.</returns>
public string ReplaceMultipleChar(string s, char[] separators, string newVal)
{
    // Runs of separators and leading/trailing separators yield empty pieces,
    // which RemoveEmptyEntries drops before the join.
    return String.Join(newVal, s.Split(separators, StringSplitOptions.RemoveEmptyEntries));
}

/// <summary>
/// Replaces every match of the regular expression <paramref name="separators"/>
/// in <paramref name="s"/> with <paramref name="newVal"/>.
/// </summary>
/// <param name="s">Text to search.</param>
/// <param name="separators">Regular-expression pattern (not a literal string).</param>
/// <param name="newVal">Replacement text.</param>
/// <returns>The text with all matches replaced.</returns>
public string ReplaceAll(string s, string separators, string newVal)
{
    Regex matcher = new Regex(separators);
    string replaced = matcher.Replace(s, newVal);
    return replaced;
}

/// <summary>
/// Replaces whole-word occurrences of <paramref name="wordToFind"/> in
/// <paramref name="original"/> with <paramref name="replacement"/>.
/// </summary>
/// <param name="original">Text to search.</param>
/// <param name="wordToFind">Literal word to look for; regex metacharacters are matched literally.</param>
/// <param name="replacement">Replacement text (regex substitution syntax applies).</param>
/// <param name="regexOptions">Options forwarded to the regex engine.</param>
/// <returns>
/// The text with matches replaced; <paramref name="original"/> unchanged when
/// <paramref name="wordToFind"/> is null or empty.
/// </returns>
public string ReplaceWholeWord(string original, string wordToFind, string replacement, RegexOptions regexOptions = RegexOptions.None)
{
    // An empty word would produce the pattern \b\b, which matches at every
    // word boundary and would scatter the replacement through the text.
    if (string.IsNullOrEmpty(wordToFind))
    {
        return original;
    }

    // Escape the word so characters such as '.', '(' or '$' are matched
    // literally instead of being interpreted (or rejected) as regex syntax.
    string pattern = @"\b" + Regex.Escape(wordToFind) + @"\b";
    return Regex.Replace(original, pattern, replacement, regexOptions);
}

请问有人能告诉我如何从 WebBrowser 控件中获取所选文本的 HTML 吗?或者,有没有其他更好的方法来解析所选的表格数据,以便在 DataGridView 中显示?

谢谢

c# parsing datagridview webbrowser-control
1个回答
0
投票

这就是网页抓取。

一些现有工具已经为此而制作。

但是您可以直接使用 js + Xpath 从网络浏览器的控制台中提取。

下面是我在 C# 中配合 XPath 使用过的相关代码片段:

/// <summary>Simple container for two string values.</summary>
public class MyData
{
    /// <summary>First value.</summary>
    public string data_1 { get; set; }

    /// <summary>Second value.</summary>
    public string data_2 { get; set; }
}
//...
// Download the page, load it into an HtmlAgilityPack document and read two
// values from it with XPath.
string url = "your url";
var client = new RestClient(url);
var request = new RestRequest("", Method.Get);
request.AddHeader("",""); //if needed (placeholder: supply a real header name/value or remove this line)
var res = client.Execute(request);

if (res.IsSuccessStatusCode is not true) throw new ArgumentException($"Request to '{url}' failed with status {res.StatusCode}.");

HtmlDocument xdc = new HtmlDocument();
// Strip non-breaking-space entities. The trailing ';' is part of the entity,
// so it must be removed too or a stray ';' is left in the text.
// Content can be null for an empty response body.
string sanitized = Regex.Replace(res.Content ?? "", "&nbsp;?", "");
xdc.LoadHtml(sanitized);

// SelectNodes returns null when nothing matches, hence the null-conditional chain.
string mydata1 = xdc.DocumentNode.SelectNodes("you xpath 1")?.First() ?.InnerText ?? "";
string mydata2 = xdc.DocumentNode.SelectNodes("you xpath 2")?.First() ?.InnerText ?? "";
// Object initializers assign with '=', and each value goes to its own property.
MyData result = new() { data_1 = mydata1, data_2 = mydata2 };
 
// or alternative for loop, 

var nodes = xdc.DocumentNode.SelectNodes("xpath that look like //table//tbody//tr/td[5]/a");
// SelectNodes returns null (not an empty collection) when nothing matches.
if (nodes is null) throw new InvalidOperationException(" xpath on error, please check");
foreach (var node in nodes) {
  // Query relative to the current node, not the whole document; otherwise
  // every iteration yields the same values. Use XPath that starts with "./"
  // or ".//" here: a leading "//" searches from the document root again.
  string nodeData1 = node.SelectSingleNode("you xpath 1")?.InnerText ?? "";
  string nodeData2 = node.SelectSingleNode("you xpath 2")?.InnerText ?? "";
  // or with attribut
  // Second argument is the default returned when the attribute is missing.
  var data1 = node.GetAttributeValue("href","?");
  var data2 = node.GetAttributeValue("href","?");
  
}
            
© www.soinside.com 2019 - 2024. All rights reserved.