我正在尝试抓取 https://www.barchart.com/stocks/quotes/IBM/income-statement/
上表格中的财务数据。使用检查元素,我没有看到任何 XHR/fetch 请求,但看起来数据是通过名为 global-MBHFEFVQ.js 的 js 文件生成的,而混淆后的代码很难读懂。
对于 barchart.com 网站上的其他一些数据,看起来可以通过 API 调用来抓取,如这篇帖子所述:《如何从 barchart.com 中抓取这些股票代码?》。但我不确定损益表数据是否也是如此。我是网络抓取新手,非常感谢任何帮助。
我将使用 php 来抓取数据,但其他语言也可以。
现在我只是获取整个页面,并以子字符串的方式提取我感兴趣的数据,但这并不理想:我必须从页面中所有其他无关内容里筛选出数据,而且必须循环访问每个 “reportPage” 网址才能获取各年的数据。
// Download one page of the quarterly income statement and cut out the raw
// HTML of the dates row: everything from the "report__row-dates" marker up to
// (but not including) the next closing </tr> tag.
$url = "https://www.barchart.com/stocks/quotes/IBM/income-statement/quarterly?reportPage=2";
$html = file_get_contents($url);
if ($html === false) {
    throw new RuntimeException("Failed to download {$url}");
}

// stripos() returns false when the needle is absent; without this check the
// false would be coerced to offset 0 and produce a wrong substring silently.
$date_start = stripos($html, "report__row-dates");
if ($date_start === false) {
    throw new RuntimeException('Marker "report__row-dates" not found in the page');
}

// Search for the end of the row starting at the marker position.
$date_end = stripos($html, "</tr>", $date_start);
if ($date_end === false) {
    throw new RuntimeException('Closing </tr> of the dates row not found');
}

$dates = substr($html, $date_start, $date_end - $date_start);
使用检查元素我没有看到任何 XHR/fetch 请求,但看起来数据是通过名为 global-MBHFEFVQ.js 的 js 文件生成的
不知道你是从哪里得出这个结论的,数据全部直接嵌入在 HTML 中:
<?php
declare(strict_types=1);

// Scrapes the annual income-statement table for IBM from barchart.com.
// The table is embedded directly in the page HTML, so it can be read with
// DOMDocument/DOMXPath. The result is a list of rows, each an associative
// array keyed by the header cells (first column label, then one key per period).

$url = 'https://www.barchart.com/stocks/quotes/IBM/income-statement/annual';
$html = file_get_contents($url);
// DOMDocument::loadHTML() rejects an empty string, so treat it like a failed download.
if ($html === false || $html === '') {
    throw new RuntimeException("Failed to download {$url}");
}

// Real-world HTML triggers many libxml warnings; collect them internally
// instead of silencing everything with the @ operator, then discard them and
// restore the previous error-handling mode.
$dom = new DOMDocument();
$previousErrorMode = libxml_use_internal_errors(true);
$loaded = $dom->loadHTML($html);
libxml_clear_errors();
libxml_use_internal_errors($previousErrorMode);
if ($loaded === false) {
    throw new RuntimeException('Failed to parse the downloaded HTML');
}

$xp = new DOMXPath($dom);

// Locate the <tbody> that owns the financial-report rows.
$tbody = $xp->query('//tr[contains(@class,"bc-financial-report")]/parent::tbody')->item(0);
if ($tbody === null) {
    throw new RuntimeException('Financial report table not found; the page layout may have changed');
}

$trs = $xp->query('./tr', $tbody);
$headerRow = $trs->item(0);
if ($headerRow === null) {
    throw new RuntimeException('Financial report table has no rows');
}

// The first row holds the column headers; they become the keys of every data row.
// Only direct <td> children are read, matching how the data rows are read below.
$data_keys = [];
foreach ($xp->query('./td', $headerRow) as $td) {
    $data_keys[] = trim($td->textContent);
}

$data = [];
for ($i = 1; $i < $trs->length; ++$i) {
    $row = [];
    foreach ($xp->query('./td', $trs->item($i)) as $td) {
        $row[] = trim($td->textContent);
    }
    // array_combine() throws a ValueError when the counts differ, so rows that
    // do not line up with the header (e.g. spacer or section rows) are skipped.
    if (count($row) !== count($data_keys)) {
        continue;
    }
    $data[] = array_combine($data_keys, $row);
}

var_export($data);
输出结果:
array (
0 =>
array (
' ' => 'Sales',
'12-2022' => '60,530,000',
'12-2021' => '57,350,000',
'12-2020' => '55,179,000',
'12-2019' => '57,714,000',
'12-2018' => '79,591,000',
),
1 =>
array (
' ' => 'Cost of Goods',
'12-2022' => '27,842,000',
'12-2021' => '25,865,000',
'12-2020' => '24,314,000',
'12-2019' => '26,180,000',
'12-2018' => '42,654,000',
),
2 =>
array (
' ' => 'Gross Profit',
'12-2022' => '32,687,000',
'12-2021' => '31,486,000',
'12-2020' => '30,865,000',
'12-2019' => '31,533,000',
'12-2018' => '36,936,000',
),
3 =>
array (
' ' => 'Operating Expenses',
'12-2022' => '25,176,000',
'12-2021' => '25,233,000',
'12-2020' => '26,823,000',
'12-2019' => '24,634,000',
'12-2018' => '24,745,000',
),
4 =>
array (
' ' => 'Operating Income',
'12-2022' => '7,512,000',
'12-2021' => '6,252,000',
'12-2020' => '4,042,000',
'12-2019' => '6,900,000',
'12-2018' => '12,192,000',
),
5 =>
array (
' ' => 'Interest Expense',
'12-2022' => '1,216,000',
'12-2021' => '1,155,000',
'12-2020' => '1,288,000',
'12-2019' => '1,344,000',
'12-2018' => '723,000',
),
6 =>
array (
' ' => 'Other Income',
'12-2022' => '-5,140,000',
'12-2021' => '-260,000',
'12-2020' => '-182,000',
'12-2019' => '1,650,000',
'12-2018' => '-127,000',
),
7 =>
array (
' ' => 'Pre-tax Income',
'12-2022' => '1,156,000',
'12-2021' => '4,837,000',
'12-2020' => '2,572,000',
'12-2019' => '7,206,000',
'12-2018' => '11,342,000',
),
8 =>
array (
' ' => 'Income Tax',
'12-2022' => '-626,000',
'12-2021' => '124,000',
'12-2020' => '-1,360,000',
'12-2019' => '60,000',
'12-2018' => '2,619,000',
),
9 =>
array (
' ' => 'Net Income Continuous',
'12-2022' => '1,783,000',
'12-2021' => '4,712,000',
'12-2020' => '3,932,000',
'12-2019' => '7,146,000',
'12-2018' => '8,723,000',
),
10 =>
array (
' ' => 'Net Income Discontinuous',
'12-2022' => '-143,000',
'12-2021' => '1,030,000',
'12-2020' => '1,658,000',
'12-2019' => '2,285,000',
'12-2018' => '5,000',
),
11 =>
array (
' ' => 'Net Income',
'12-2022' => '$1,640,000',
'12-2021' => '$5,742,000',
'12-2020' => '$5,590,000',
'12-2019' => '$9,431,000',
'12-2018' => '$8,728,000',
),
12 =>
array (
' ' => 'EPS Basic Total Ops',
'12-2022' => '1.82',
'12-2021' => '6.41',
'12-2020' => '6.28',
'12-2019' => '10.63',
'12-2018' => '9.57',
),
13 =>
array (
' ' => 'EPS Basic Continuous Ops',
'12-2022' => '1.97',
'12-2021' => '5.26',
'12-2020' => '4.42',
'12-2019' => '8.05',
'12-2018' => '9.56',
),
14 =>
array (
' ' => 'EPS Basic Discontinuous Ops',
'12-2022' => '-0.16',
'12-2021' => '1.15',
'12-2020' => '1.86',
'12-2019' => '2.58',
'12-2018' => '0.01',
),
15 =>
array (
' ' => 'EPS Diluted Total Ops',
'12-2022' => '1.80',
'12-2021' => '6.35',
'12-2020' => '6.23',
'12-2019' => '10.56',
'12-2018' => '9.52',
),
16 =>
array (
' ' => 'EPS Diluted Continuous Ops',
'12-2022' => '1.95',
'12-2021' => '5.21',
'12-2020' => '4.38',
'12-2019' => '8.00',
'12-2018' => '9.51',
),
17 =>
array (
' ' => 'EPS Diluted Discontinuous Ops',
'12-2022' => '-0.16',
'12-2021' => '1.14',
'12-2020' => '1.85',
'12-2019' => '2.56',
'12-2018' => '0.01',
),
18 =>
array (
' ' => 'EPS Diluted Before Non-Recurring Items',
'12-2022' => '9.13',
'12-2021' => '7.93',
'12-2020' => '8.67',
'12-2019' => '12.81',
'12-2018' => '13.81',
),
19 =>
array (
' ' => 'EBITDA(a)',
'12-2022' => '$12,314,000',
'12-2021' => '$12,669,000',
'12-2020' => '$10,737,000',
'12-2019' => '$12,959,000',
'12-2018' => '$16,672,000',
),
)