从非html数据表中提取数据

问题描述 投票:0回答:1

我正在尝试从图表中提取数据点……但一直没有成功,也不清楚问题出在哪里。

我总是得到警告“NAs introduced by coercion”(强制转换过程中引入了 NA)。

我的想法是读取该 URL 的页面源码,通过查找日期来定位列出图表数值的那一部分。然而,我始终无法把这些值提取出来。

url <- "https://www.transfermarkt.com/manuel-neuer/marktwertverlauf/spieler/17259"

# Download the page and collapse it into a single string, so that the match
# positions returned by gregexpr() are offsets into the whole document.
data <- readLines(con = url)

data <- paste0(data, collapse = "\n")


# Locate the opening of the embedded 'data':[ ... ] array and every "]".
start.index <- gregexpr(pattern = "'data':\\[", text = data)
end.index <- gregexpr(pattern = "\\]", text = data)

# gregexpr() signals "no match" with -1. Using that value as a position would
# silently cut a meaningless substring and only surface much later as
# "NAs introduced by coercion", so stop here with an explicit error instead.
if (start.index[[1]][1] == -1) {
    stop(
        "Marker 'data':[ not found in the page downloaded from ", url,
        call. = FALSE
    )
}

# Only the first occurrence of the marker is used; this keeps the comparison
# below scalar-vs-vector instead of relying on recycling.
first.start <- start.index[[1]][1]

# Closing brackets that come after the marker; the first one ends the array.
closing <- end.index[[1]][end.index[[1]] > first.start]
if (length(closing) == 0) {
    stop(
        "No closing ']' found after the 'data':[ marker in the page from ", url,
        call. = FALSE
    )
}

start.index[[1]] <- first.start
end.index[[1]] <- closing[1]

# Keep only the text from the marker up to and including the closing bracket.
data <- substring(
    text = data,
    first = start.index[[1]],
    last = end.index[[1]]
)




#date 
#market value 
#club 
#age 


# Extract every value that sits between a start marker and the next end marker
# in `text`.
#
# Arguments:
#   text          - single character string to search.
#   start_pattern - regular expression marking where a value begins; the value
#                   starts immediately after each match.
#   end_pattern   - regular expression marking where a value ends; the value
#                   stops immediately before the first match located at or
#                   after the value's first character.
#
# Returns a character vector with one element per start match. The result is
# character(0) when the start marker never occurs, and contains NA for a start
# match that has no end marker after it.
extract_field <- function(text, start_pattern, end_pattern) {
    starts <- gregexpr(pattern = start_pattern, text = text)[[1]]

    # gregexpr() reports "no match" as -1, which must not be used as a position.
    if (starts[1] == -1) {
        return(character(0))
    }

    # First character of each value: match position plus length of the marker.
    first <- as.integer(starts) + attr(starts, "match.length")

    ends <- as.integer(gregexpr(pattern = end_pattern, text = text)[[1]])

    # Last character of each value: one before the nearest following end marker.
    # A "no match" result (-1) can never satisfy `ends >= p`, so it is ignored.
    last <- vapply(
        first,
        function(p) {
            following <- ends[ends >= p]
            if (length(following) == 0) NA_integer_ else following[1] - 1L
        },
        integer(1)
    )

    substring(text = text, first = first, last = last)
}


# Get Dates
date.res <- extract_field(data, "'datum_mw':'", "',")
# Replace the literal four-character sequence \x20 with a space.
date.res <- gsub(pattern = "\\\\x20", replacement = " ", x = date.res)


# Get Values
value.res <- extract_field(data, "'y':", ",")


# Get Clubs
club.res <- extract_field(data, "'verein':'", "',")
# Replace the literal four-character sequence \x20 with a space.
club.res <- gsub(pattern = "\\\\x20", replacement = " ", x = club.res)


# Get Age
age.res <- extract_field(data, "'age':", ",")





# Convert the extracted value and age strings to numbers, then combine the
# four extracted vectors into one data frame, one row per data point.
market.value <- as.numeric(value.res)
age <- as.numeric(age.res)

res <- data.frame(Date = date.res, MarketValue = market.value, Club = club.res, Age = age)
r rvest
1个回答
0
投票

图表数据是从另一个 URL(`https://www.transfermarkt.com/ceapi/marketValueDevelopment/graph/17259`)加载的。由于返回的是 JSON,我们可以直接用 `jsonlite::read_json(url)` 来解析它。不过该站点似乎对用户代理(User-Agent)有点挑剔:`jsonlite` 的请求在交互式 RStudio 会话中可以正常工作,但通过 reprex 调用时会出现 `403` 错误。所以这里改用 `httr2`,它可以更方便地配置用户代理:

library(httr2)

# Build the request for the JSON endpoint and set an explicit browser-style
# user agent string on it.
graph_url <- "https://www.transfermarkt.com/ceapi/marketValueDevelopment/graph/17259"
graph_req <- req_user_agent(
  request(graph_url),
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
)

# Perform the request and parse the JSON body, simplifying to vectors.
graph_resp <- req_perform(graph_req)
graph_json <- resp_body_json(graph_resp, simplifyVector = TRUE)

# Take the first element of the parsed body and show it as a tibble.
tibble::as_tibble(getElement(graph_json, 1))
#> # A tibble: 43 × 7
#>                x        y mw      datum_mw     verein            age   wappen   
#>            <dbl>    <int> <chr>   <chr>        <chr>             <chr> <chr>    
#>  1 1111014000000    75000 €75k    Mar 17, 2005 FC Schalke 04 U19 18    "https:/…
#>  2 1126821600000   150000 €150k   Sep 16, 2005 FC Schalke 04     19    "https:/…
#>  3 1156111200000   350000 €350k   Aug 21, 2006 FC Schalke 04     20    ""       
#>  4 1168815600000  1500000 €1.50m  Jan 15, 2007 FC Schalke 04     20    ""       
#>  5 1182376800000  3000000 €3.00m  Jun 21, 2007 FC Schalke 04     21    ""       
#>  6 1199142000000  4500000 €4.50m  Jan 1, 2008  FC Schalke 04     21    ""       
#>  7 1212530400000  7000000 €7.00m  Jun 4, 2008  FC Schalke 04     22    ""       
#>  8 1232492400000  9000000 €9.00m  Jan 21, 2009 FC Schalke 04     22    ""       
#>  9 1244584800000 11000000 €11.00m Jun 10, 2009 FC Schalke 04     23    ""       
#> 10 1263682800000 15000000 €15.00m Jan 17, 2010 FC Schalke 04     23    ""       
#> # ℹ 33 more rows

创建于 2023-12-31,使用 reprex v2.0.2

© www.soinside.com 2019 - 2024. All rights reserved.