我正在尝试从图表中提取数据点……但一直没有成功,也不清楚原因是什么。
我总是得到警告“NAs introduced by coercion”(强制转换过程中引入了 NA)。
我的想法是读取该 URL 的页面源码,并通过日期来定位列出图表数值的那一部分。然而,我始终无法把这些值提取出来。
# Scrape a player's market-value history from the chart series embedded in the
# Transfermarkt profile page and collect it in the data frame `res`
# (columns: Date, MarketValue, Club, Age).
url <- "https://www.transfermarkt.com/manuel-neuer/marktwertverlauf/spieler/17259"

# Collect every value stored under `key` in `text`.
#
# text   - a single string holding the serialized chart series.
# key    - the field name as it appears between single quotes, e.g. "age".
# quoted - TRUE when the value itself is wrapped in single quotes (strings),
#          FALSE for bare values such as numbers.
#
# Returns a character vector with one entry per occurrence of the key, or
# character(0) when the key does not occur at all.
extract_field <- function(text, key, quoted = FALSE) {
  pattern <- if (quoted) {
    # Lazy match: the value ends at the first quote followed by "," or "}".
    paste0("'", key, "':'(.*?)'[,}]")
  } else {
    # Bare values run up to the next "," or the closing "}" of the object.
    paste0("'", key, "':([^,}]*)")
  }
  hits <- regmatches(text, gregexpr(pattern, text, perl = TRUE))[[1]]
  # Each hit is exactly one full match, so keeping group 1 strips the key
  # prefix and the terminator.
  sub(pattern, "\\1", hits, perl = TRUE)
}

data <- readLines(con = url)
data <- paste0(data, collapse = "\n")

# Isolate the chart series: everything from "'data':[" up to the next "]".
data <- regmatches(data, regexpr("'data':\\[[^]]*\\]", data))

# Without this guard a missing marker goes unnoticed: the search functions
# report "no match" as -1, and slicing with that offset returns unrelated text
# that as.numeric() later turns into NA ("NAs introduced by coercion").
if (length(data) == 0L) {
  stop(
    "No \"'data':[\" series found in the page source of ", url,
    "; nothing to extract.",
    call. = FALSE
  )
}

# Fields of interest: date, market value, club, age.
# Get Dates
date.res <- extract_field(data, "datum_mw", quoted = TRUE)
# Spaces are stored as the literal escape sequence \x20 in the page source.
date.res <- gsub(pattern = "\\\\x20", replacement = " ", x = date.res)

# Get Values
value.res <- extract_field(data, "y")

# Get Clubs
club.res <- extract_field(data, "verein", quoted = TRUE)
club.res <- gsub(pattern = "\\\\x20", replacement = " ", x = club.res)

# Get Age
age.res <- extract_field(data, "age")

# Every data point must contribute exactly one value per field; otherwise the
# columns would be misaligned (or silently recycled) in the data frame.
field.lengths <- lengths(list(date.res, value.res, club.res, age.res))
if (length(unique(field.lengths)) != 1L) {
  stop(
    "Extracted fields differ in length (date, value, club, age): ",
    paste(field.lengths, collapse = ", "),
    call. = FALSE
  )
}

res <- data.frame(
  Date = date.res,
  MarketValue = as.numeric(value.res),
  Club = club.res,
  Age = as.numeric(age.res)
)
图表数据实际上是从另一个 URL
https://www.transfermarkt.com/ceapi/marketValueDevelopment/graph/17259
加载的。由于返回的是 JSON,原则上可以直接用 jsonlite::read_json(url)
来解析。不过该站点似乎对用户代理(User-Agent)比较挑剔(jsonlite
请求在交互式 RStudio 会话中可以正常工作,但通过 reprex 调用时会返回 403
错误),所以这里改用 httr2,
以便更方便地设置用户代理:
library(httr2)
# Query the chart endpoint with a browser-like user agent, parse the JSON
# body (simplifying arrays to vectors / data frames), and display the first
# element of the parsed result as a tibble.
graph.url <-
  "https://www.transfermarkt.com/ceapi/marketValueDevelopment/graph/17259"
graph.request <- req_user_agent(
  request(graph.url),
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
)
graph.response <- req_perform(graph.request)
graph.payload <- resp_body_json(graph.response, simplifyVector = TRUE)
tibble::as_tibble(getElement(graph.payload, 1))
#> # A tibble: 43 × 7
#> x y mw datum_mw verein age wappen
#> <dbl> <int> <chr> <chr> <chr> <chr> <chr>
#> 1 1111014000000 75000 €75k Mar 17, 2005 FC Schalke 04 U19 18 "https:/…
#> 2 1126821600000 150000 €150k Sep 16, 2005 FC Schalke 04 19 "https:/…
#> 3 1156111200000 350000 €350k Aug 21, 2006 FC Schalke 04 20 ""
#> 4 1168815600000 1500000 €1.50m Jan 15, 2007 FC Schalke 04 20 ""
#> 5 1182376800000 3000000 €3.00m Jun 21, 2007 FC Schalke 04 21 ""
#> 6 1199142000000 4500000 €4.50m Jan 1, 2008 FC Schalke 04 21 ""
#> 7 1212530400000 7000000 €7.00m Jun 4, 2008 FC Schalke 04 22 ""
#> 8 1232492400000 9000000 €9.00m Jan 21, 2009 FC Schalke 04 22 ""
#> 9 1244584800000 11000000 €11.00m Jun 10, 2009 FC Schalke 04 23 ""
#> 10 1263682800000 15000000 €15.00m Jan 17, 2010 FC Schalke 04 23 ""
#> # ℹ 33 more rows
创建于 2023-12-31,使用 reprex v2.0.2