rvest - 删除不想要的文本段落

问题描述 投票:1回答:1

我想从以下 rvest 代码的输出中去除大量不需要的文本段落:

 library(rvest)

    # Page to scrape.
    link <- "https://www.duedil.com/company/gb/02666908/yorwaste-limited/financials"

    # Parse the page, select every <script> element whose text contains
    # 'js-financials-component', and keep the text content of those elements.
    doc <- html_text(
      html_nodes(
        read_html(link),
        "script:contains('js-financials-component')"
      )
    )

基本上,我只想保留下面这段文本,并将其输出为 JSON 格式。我不知道该从哪里入手,有人能指点一下吗?非常感谢!

{
          companyName: {"name":"Yorwaste Limited"},
          numAccounts: 1,
          accounts: [{"title":"Summary","rows":[{"label":"Reporting Period (Months)","description":null,"chartable":false,"date":"31 Mar 2018","currency":null,"percentage":false,"values":[{"date":"31 Mar 2018","timestamp":1522454400,"suffix":null,"value":12,"formattedValue":"12","delta":null}]},{"label":"Consolidated Accounts","description":"Financial statements of the parent (company) and its subsidiaries are presented as those of a single economic entity.","chartable":false,"date":"31 Mar 2018","currency":null,"percentage":false,"values":[{"date":"31 Mar 2018","timestamp":1522454400,"suffix":null,"value":"Y","formattedValue":"Y","delta":null}]},{"label":"Number of Employees","description":null,"chartable":false,"date":"31 Mar 2018","currency":null,"percentage":false,"values":[{"date":"31 Mar 2018","timestamp":1522454400,"suffix":null,"value":318,"formattedValue":"318","delta":64.77}]},{"label":"Turnover","description":"Revenue generated from business activities.","chartable":true,"date":"31 Mar 2018","currency":"GBP","percentage":false,"values":[{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"GBP","value":40328232,"formattedValue":"40,328,232","delta":3.67}]},{"label":"EBITDA","description":"Earnings before interest, tax, depreciation and amortization.","chartable":true,"date":"31 Mar 2018","currency":"GBP","percentage":false,"values":[{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"GBP","value":2834874,"formattedValue":"2,834,874","delta":62.78}]},{"label":"Post-tax Profit","description":"Profit generated after taxation.","chartable":true,"date":"31 Mar 2018","currency":"GBP","percentage":false,"values":[{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"GBP","value":885230,"formattedValue":"885,230","delta":52.17}]},{"label":"Total Assets","description":"The value of all assets on the Balance Sheet.","chartable":true,"date":"31 Mar 2018","currency":"GBP","percentage":false,"values":[{"date":"31 Mar 
2018","timestamp":1522454400,"suffix":"GBP","value":40264952,"formattedValue":"40,264,952","delta":25.58}]},{"label":"Net Assets","description":"Total Assets less Total Liabilities.","chartable":true,"date":"31 Mar 2018","currency":"GBP","percentage":false,"values":[{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"GBP","value":10613963,"formattedValue":"10,613,963","delta":9.1}]},{"label":"Return on Capital Employed (%)","description":"Operating Profit expressed as a percentage of average Capital Employed.","chartable":true,"date":"31 Mar 2018","currency":null,"percentage":true,"values":[{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"%","value":6.97,"formattedValue":"6.97","delta":null}]},{"label":"Debt to Capital (%)","description":"Total Liabilities expressed as a percentage of Total Assets.","chartable":true,"date":"31 Mar 2018","currency":null,"percentage":true,"values":[{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"%","value":73.64,"formattedValue":"73.64","delta":null}]}]}],
          growth: [{"title":"Summary","rows":[{"label":"Reporting Period (Months)","description":null,"chartable":false,"date":"31 Mar 2018","currency":null,"percentage":false,"latest":{"date":"31 Mar 2018","timestamp":1522454400,"suffix":null,"value":12,"formattedValue":"12"},"cagrs":{"year1":null,"cagr3":null,"cagr5":null,"cagr10":null}},{"label":"Consolidated Accounts","description":"Financial statements of the parent (company) and its subsidiaries are presented as those of a single economic entity.","chartable":false,"date":"31 Mar 2018","currency":null,"percentage":false,"latest":{"date":"31 Mar 2018","timestamp":1522454400,"suffix":null,"value":"Y","formattedValue":"Y"},"cagrs":{"year1":null,"cagr3":null,"cagr5":null,"cagr10":null}},{"label":"Number of Employees","description":null,"chartable":false,"date":"31 Mar 2018","currency":null,"percentage":false,"latest":{"date":"31 Mar 2018","timestamp":1522454400,"suffix":null,"value":318,"formattedValue":"318"},"cagrs":{"year1":null,"cagr3":null,"cagr5":null,"cagr10":null}},{"label":"Turnover","description":"Revenue generated from business activities.","chartable":true,"date":"31 Mar 2018","currency":"GBP","percentage":false,"latest":{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"GBP","value":40328232,"formattedValue":"40,328,232"},"cagrs":{"year1":null,"cagr3":null,"cagr5":null,"cagr10":null}},{"label":"EBITDA","description":"Earnings before interest, tax, depreciation and amortization.","chartable":true,"date":"31 Mar 2018","currency":"GBP","percentage":false,"latest":{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"GBP","value":2834874,"formattedValue":"2,834,874"},"cagrs":{"year1":null,"cagr3":null,"cagr5":null,"cagr10":null}},{"label":"Post-tax Profit","description":"Profit generated after taxation.","chartable":true,"date":"31 Mar 2018","currency":"GBP","percentage":false,"latest":{"date":"31 Mar 
2018","timestamp":1522454400,"suffix":"GBP","value":885230,"formattedValue":"885,230"},"cagrs":{"year1":null,"cagr3":null,"cagr5":null,"cagr10":null}},{"label":"Total Assets","description":"The value of all assets on the Balance Sheet.","chartable":true,"date":"31 Mar 2018","currency":"GBP","percentage":false,"latest":{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"GBP","value":40264952,"formattedValue":"40,264,952"},"cagrs":{"year1":null,"cagr3":null,"cagr5":null,"cagr10":null}},{"label":"Net Assets","description":"Total Assets less Total Liabilities.","chartable":true,"date":"31 Mar 2018","currency":"GBP","percentage":false,"latest":{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"GBP","value":10613963,"formattedValue":"10,613,963"},"cagrs":{"year1":null,"cagr3":null,"cagr5":null,"cagr10":null}},{"label":"Return on Capital Employed (%)","description":"Operating Profit expressed as a percentage of average Capital Employed.","chartable":true,"date":"31 Mar 2018","currency":null,"percentage":true,"latest":{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"%","value":6.97,"formattedValue":"6.97"},"cagrs":{"year1":null,"cagr3":null,"cagr5":null,"cagr10":null}},{"label":"Debt to Capital (%)","description":"Total Liabilities expressed as a percentage of Total Assets.","chartable":true,"date":"31 Mar 2018","currency":null,"percentage":true,"latest":{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"%","value":73.64,"formattedValue":"73.64"},"cagrs":{"year1":null,"cagr3":null,"cagr5":null,"cagr10":null}}]}]
r web-scraping rvest stringr httr
1个回答
1
投票

你可以直接下载网页的源内容,并使用正则表达式提取所需的片段:

library(httr)
library(stringr)

# Download the raw page source.
r <- GET('https://www.duedil.com/company/gb/02666908/yorwaste-limited/financials')

# Fail loudly on an HTTP error instead of running the regex on an error page.
stop_for_status(r)

# str_match() works on character data, so pull the response body out as text
# rather than handing it the httr response object itself.
page_source <- content(r, as = "text", encoding = "UTF-8")

# The pattern looks for `new Widget(` and lazily captures the object literal
# that starts with `{ companyName:` and ends at the first `})`.
q <- str_match(page_source, "new Widget\\([\\s\\S]*?(\\{\\s*companyName:[\\s\\S]*?\\})\\)")

# str_match() returns a matrix: column 1 is the whole match, column 2 the first
# capture group. Index row and column explicitly; the value is NA if nothing
# matched.
d <- q[1, 2]

免责声明qazxsw POI

© www.soinside.com 2019 - 2024. All rights reserved.