抓取 Barchart.com 财务数据

问题描述 投票:0回答:1

我正在尝试抓取 https://www.barchart.com/stocks/quotes/IBM/venue-statement/

上表格中的财务数据

使用检查元素我没有看到任何 XHR/fetch 请求,但看起来数据是通过名为 global-MBHFEFVQ.js 的 js 文件生成的,但很难遵循混淆的代码。

对于 barchart.com 网站上的一些其他数据,看起来可以通过 API 调用来抓取,如本文中所述:如何从 barchart.com 中抓取这些股票代码?。但我不确定损益表数据是否也是如此。感谢任何帮助,因为我是网络抓取新手。

我将使用 php 来抓取数据,但其他语言也可以。

现在我只是获取整个页面并提取我感兴趣的数据作为子字符串,但这并不理想,因为我必须对网站中的所有其他开销进行排序,并且必须循环遍历每个“reportPage”获取每年数据的网址。

    // Download one page of the quarterly income statement.
    $url = "https://www.barchart.com/stocks/quotes/IBM/income-statement/quarterly?reportPage=2";
    $html = file_get_contents($url);
    if ($html === false) {
        // file_get_contents() returns false on failure; stop instead of searching a non-string.
        throw new RuntimeException("Failed to download {$url}");
    }

    // Find the marker string of the dates row; stripos() returns false when it is absent.
    $date_start = stripos($html, "report__row-dates");
    if ($date_start === false) {
        throw new RuntimeException('Marker "report__row-dates" not found in the page.');
    }

    // The extracted span ends at the first "</tr>" that follows the marker.
    $date_end = stripos($html, "</tr>", $date_start);
    if ($date_end === false) {
        throw new RuntimeException('No closing </tr> found after the dates marker.');
    }

    // Everything from the marker up to (not including) the closing </tr>.
    $dates = substr($html, $date_start, $date_end - $date_start);
web-scraping web-inspector
1个回答
0
投票

使用检查元素我没有看到任何 XHR/fetch 请求,但看起来数据是通过名为 global-MBHFEFVQ.js 的 js 文件生成的

不知道你从哪里得到它,它都嵌入在 HTML 中,

<?php

declare(strict_types=1);

/**
 * Parses the financial report table out of a statement page's HTML.
 *
 * The table body is located through a row whose class contains
 * "bc-financial-report". The cells of the first row of that body are used as
 * column labels; each following row becomes one array keyed by those labels.
 *
 * @param string $html Raw HTML of the page.
 * @return array<int, array<array-key, string>> One entry per row after the label row.
 * @throws RuntimeException If the HTML is empty, cannot be parsed, or the table is missing.
 */
function extractFinancialReport(string $html): array
{
    if (trim($html) === '') {
        throw new RuntimeException('The HTML document is empty.');
    }

    // Collect libxml parse warnings internally instead of silencing them with "@",
    // and restore the previous setting afterwards.
    $previousSetting = libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $loaded = $dom->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);
    if ($loaded === false) {
        throw new RuntimeException('Unable to parse the HTML document.');
    }

    $xp = new DOMXPath($dom);
    $tbody = $xp->query('//tr[contains(@class,"bc-financial-report")]/parent::tbody')->item(0);
    if ($tbody === null) {
        // Without this guard the following queries would run against a null context node.
        throw new RuntimeException('Financial report table not found in the page.');
    }

    $rows = $xp->query('./tr', $tbody);

    // The first row supplies the column labels.
    $labels = [];
    foreach ($rows->item(0)->getElementsByTagName('td') as $cell) {
        $labels[] = trim($cell->textContent);
    }
    if ($labels === []) {
        throw new RuntimeException('The report table has no label cells.');
    }

    $report = [];
    for ($i = 1; $i < $rows->length; ++$i) {
        $values = [];
        foreach ($xp->query('./td', $rows->item($i)) as $cell) {
            $values[] = trim($cell->textContent);
        }
        // array_combine() requires equally sized arrays; skip rows that do not
        // line up with the label row instead of failing on them.
        if (count($values) !== count($labels)) {
            continue;
        }
        $report[] = array_combine($labels, $values);
    }

    return $report;
}

$url = 'https://www.barchart.com/stocks/quotes/IBM/income-statement/annual';
$html = file_get_contents($url);
if ($html === false) {
    // file_get_contents() returns false on failure.
    throw new RuntimeException("Failed to download {$url}");
}

var_export(extractFinancialReport($html));

给予

array (
  0 => 
  array (
    ' ' => 'Sales',
    '12-2022' => '60,530,000',
    '12-2021' => '57,350,000',
    '12-2020' => '55,179,000',
    '12-2019' => '57,714,000',
    '12-2018' => '79,591,000',
  ),
  1 => 
  array (
    ' ' => 'Cost of Goods',
    '12-2022' => '27,842,000',
    '12-2021' => '25,865,000',
    '12-2020' => '24,314,000',
    '12-2019' => '26,180,000',
    '12-2018' => '42,654,000',
  ),
  2 => 
  array (
    ' ' => 'Gross Profit',
    '12-2022' => '32,687,000',
    '12-2021' => '31,486,000',
    '12-2020' => '30,865,000',
    '12-2019' => '31,533,000',
    '12-2018' => '36,936,000',
  ),
  3 => 
  array (
    ' ' => 'Operating Expenses',
    '12-2022' => '25,176,000',
    '12-2021' => '25,233,000',
    '12-2020' => '26,823,000',
    '12-2019' => '24,634,000',
    '12-2018' => '24,745,000',
  ),
  4 => 
  array (
    ' ' => 'Operating Income',
    '12-2022' => '7,512,000',
    '12-2021' => '6,252,000',
    '12-2020' => '4,042,000',
    '12-2019' => '6,900,000',
    '12-2018' => '12,192,000',
  ),
  5 => 
  array (
    ' ' => 'Interest Expense',
    '12-2022' => '1,216,000',
    '12-2021' => '1,155,000',
    '12-2020' => '1,288,000',
    '12-2019' => '1,344,000',
    '12-2018' => '723,000',
  ),
  6 => 
  array (
    ' ' => 'Other Income',
    '12-2022' => '-5,140,000',
    '12-2021' => '-260,000',
    '12-2020' => '-182,000',
    '12-2019' => '1,650,000',
    '12-2018' => '-127,000',
  ),
  7 => 
  array (
    ' ' => 'Pre-tax Income',
    '12-2022' => '1,156,000',
    '12-2021' => '4,837,000',
    '12-2020' => '2,572,000',
    '12-2019' => '7,206,000',
    '12-2018' => '11,342,000',
  ),
  8 => 
  array (
    ' ' => 'Income Tax',
    '12-2022' => '-626,000',
    '12-2021' => '124,000',
    '12-2020' => '-1,360,000',
    '12-2019' => '60,000',
    '12-2018' => '2,619,000',
  ),
  9 => 
  array (
    ' ' => 'Net Income Continuous',
    '12-2022' => '1,783,000',
    '12-2021' => '4,712,000',
    '12-2020' => '3,932,000',
    '12-2019' => '7,146,000',
    '12-2018' => '8,723,000',
  ),
  10 => 
  array (
    ' ' => 'Net Income Discontinuous',
    '12-2022' => '-143,000',
    '12-2021' => '1,030,000',
    '12-2020' => '1,658,000',
    '12-2019' => '2,285,000',
    '12-2018' => '5,000',
  ),
  11 => 
  array (
    ' ' => 'Net Income',
    '12-2022' => '$1,640,000',
    '12-2021' => '$5,742,000',
    '12-2020' => '$5,590,000',
    '12-2019' => '$9,431,000',
    '12-2018' => '$8,728,000',
  ),
  12 => 
  array (
    ' ' => 'EPS Basic Total Ops',
    '12-2022' => '1.82',
    '12-2021' => '6.41',
    '12-2020' => '6.28',
    '12-2019' => '10.63',
    '12-2018' => '9.57',
  ),
  13 => 
  array (
    ' ' => 'EPS Basic Continuous Ops',
    '12-2022' => '1.97',
    '12-2021' => '5.26',
    '12-2020' => '4.42',
    '12-2019' => '8.05',
    '12-2018' => '9.56',
  ),
  14 => 
  array (
    ' ' => 'EPS Basic Discontinuous Ops',
    '12-2022' => '-0.16',
    '12-2021' => '1.15',
    '12-2020' => '1.86',
    '12-2019' => '2.58',
    '12-2018' => '0.01',
  ),
  15 => 
  array (
    ' ' => 'EPS Diluted Total Ops',
    '12-2022' => '1.80',
    '12-2021' => '6.35',
    '12-2020' => '6.23',
    '12-2019' => '10.56',
    '12-2018' => '9.52',
  ),
  16 => 
  array (
    ' ' => 'EPS Diluted Continuous Ops',
    '12-2022' => '1.95',
    '12-2021' => '5.21',
    '12-2020' => '4.38',
    '12-2019' => '8.00',
    '12-2018' => '9.51',
  ),
  17 => 
  array (
    ' ' => 'EPS Diluted Discontinuous Ops',
    '12-2022' => '-0.16',
    '12-2021' => '1.14',
    '12-2020' => '1.85',
    '12-2019' => '2.56',
    '12-2018' => '0.01',
  ),
  18 => 
  array (
    ' ' => 'EPS Diluted Before Non-Recurring Items',
    '12-2022' => '9.13',
    '12-2021' => '7.93',
    '12-2020' => '8.67',
    '12-2019' => '12.81',
    '12-2018' => '13.81',
  ),
  19 => 
  array (
    ' ' => 'EBITDA(a)',
    '12-2022' => '$12,314,000',
    '12-2021' => '$12,669,000',
    '12-2020' => '$10,737,000',
    '12-2019' => '$12,959,000',
    '12-2018' => '$16,672,000',
  ),
)
© www.soinside.com 2019 - 2024. All rights reserved.