我对 R 很陌生,所以这可能是一个很基础的问题。
我有一个数据框,我想遍历其中某一列的每一行,并检查该行中的名称是否出现在另一个变量中。不过,每一行的值可能包含多个名称,以分号分隔,我需要逐个检查每个名称。我尝试了嵌套循环,但得到的是一个包含重复值的列表。我的代码如下:
# Column in df I want to modify:
company.tickers
----------
CARD3
CSAN3
CVCB3
ELET3;ELET5;ELET6
ENBR3
FESA3;FESA4
OIBR3;OIBR4
PETR3;PETR4
PTBL3
TUPY3
VLID3
# Stock names I would like to keep
stocks <- c(
  "CARD3", "TUPY3", "OIBR3", "FESA4", "PTBL3", "VLID3",
  "CNTO3", "CSAN3", "ELET3", "PETR4", "ENBR3"
)
# Flat list that collects every ticker found in `stocks`, in row order
result <- list()
# Cycle through rows; seq_along() yields an empty sequence for an empty column,
# whereas 1:length(x) would iterate over c(1, 0)
for (i in seq_along(df.statements$company.tickers)) {
  # as.character() so that strsplit() also works if the column is a factor
  cell <- as.character(df.statements$company.tickers[i])
  print(cell)
  # strsplit() returns a list with one element per input string; [[1]] extracts
  # the character vector of tickers that belong to this single cell
  stock.tickers <- strsplit(cell, ";", fixed = TRUE)[[1]]
  # Cycle through names in a cell
  for (j in seq_along(stock.tickers)) {
    if (stock.tickers[j] %in% stocks) {
      print(stock.tickers[j])
      result <- c(result, stock.tickers[j])
    }
  }
}
# My expected result is the following column:
company.tickers
----------
CARD3
CSAN3
CVCB3
ELET3
ENBR3
FESA4
OIBR3
PETR4
PTBL3
TUPY3
VLID3
也许是这样?
# Build a regex that matches any wanted ticker as a whole ";"-delimited token,
# so that e.g. "PETR4" cannot match inside a longer name such as "PETR40"
stocks.regex <- paste0("(^|;)(", paste0(stocks, collapse = "|"), ")(;|$)")
# Subset using grepl and the new regex: keep every row whose cell holds at
# least one wanted ticker (the cell itself is returned unchanged)
subset(df, grepl(stocks.regex, df$company.tickers))
样本数据
library(data.table)
# Parse the inline one-column sample (a header line followed by one cell per
# line) with fread(), then turn the result into a plain data.frame with setDF()
df <- data.table::setDF(
  data.table::fread(
    input = "company.tickers
CARD3
CSAN3
CVCB3
ELET3;ELET5;ELET6
ENBR3
FESA3;FESA4
OIBR3;OIBR4
PETR3;PETR4
PTBL3
TUPY3
VLID3",
    sep = ","
  )
)
# Tickers to keep
stocks <- c(
  "CARD3", "TUPY3", "OIBR3", "FESA4", "PTBL3", "VLID3",
  "CNTO3", "CSAN3", "ELET3", "PETR4", "ENBR3"
)
tidyverse
作为 Wimpel 那个非常巧妙的答案的替代方案:
suppressPackageStartupMessages(library(dplyr))
# Ticker column from the question; a cell may hold several ";"-separated names
company.tickers <- c(
  "CARD3", "CSAN3", "CVCB3",
  "ELET3;ELET5;ELET6",
  "ENBR3",
  "FESA3;FESA4", "OIBR3;OIBR4", "PETR3;PETR4",
  "PTBL3", "TUPY3", "VLID3"
)
# Tickers to keep
stocks <- c(
  "CARD3", "TUPY3", "OIBR3", "FESA4", "PTBL3", "VLID3",
  "CNTO3", "CSAN3", "ELET3", "PETR4", "ENBR3"
)
# One-column tibble wrapping the ticker vector
df <- dplyr::tibble(company.tickers = company.tickers)
#' Keep the rows whose ticker cell contains a given ticker
#'
#' A cell may hold several tickers separated by ";". The pattern is anchored to
#' whole ";"-delimited tokens, so "PETR4" matches "PETR3;PETR4" but not
#' "PETR40".
#'
#' @param x A single ticker. It is inserted into a regular expression, so
#'   regex syntax inside `x` is still interpreted.
#' @param df A data frame or tibble.
#' @param col Name (string) of the character column in `df` to search.
#' @return The rows of `df` whose `col` value contains `x` as a whole token.
filter_df <- function(x, df, col) {
  # (^|;) and (;|$) pin the match to token boundaries; the inner non-capturing
  # group keeps any alternation inside `x` from escaping the anchors
  whole_ticker <- paste0("(?:^|;)(?:", x, ")(?:;|$)")
  dplyr::filter(df, stringr::str_detect(.data[[col]], whole_ticker))
}
# Run filter_df() once per wanted ticker, stack the per-ticker matches into a
# single table, then drop the duplicate rows (a row is returned once for each
# wanted ticker it contains)
stocks %>%
  purrr::map_dfr(filter_df, df = df, col = "company.tickers") %>%
  dplyr::distinct()
#> # A tibble: 10 x 1
#> company.tickers
#> <chr>
#> 1 CARD3
#> 2 TUPY3
#> 3 OIBR3;OIBR4
#> 4 FESA3;FESA4
#> 5 PTBL3
#> 6 VLID3
#> 7 CSAN3
#> 8 ELET3;ELET5;ELET6
#> 9 PETR3;PETR4
#> 10 ENBR3
由reprex package(v0.3.0)在2020-03-03创建
下面是使用 tidyr::separate 的一种尝试:先把股票代码列临时拆分为多个单独的列,再将数据转换为长格式(tidy 数据),最后进行过滤以得到所需的结果。
我非常感谢您提供有助于改进这项技术的评论。
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(purrr))
# Ticker column from the question; a cell may hold several ";"-separated names
company.tickers <- c(
  "CARD3", "CSAN3", "CVCB3",
  "ELET3;ELET5;ELET6",
  "ENBR3",
  "FESA3;FESA4", "OIBR3;OIBR4", "PETR3;PETR4",
  "PTBL3", "TUPY3", "VLID3"
)
# One uniform random draw per row, used as a second column of the data frame
random.data <- runif(n = length(company.tickers))
# Tickers to keep
stocks <- c(
  "CARD3", "TUPY3", "OIBR3", "FESA4", "PTBL3", "VLID3",
  "CNTO3", "CSAN3", "ELET3", "PETR4", "ENBR3"
)
# Construct a data frame
df <- dplyr::tibble(
  company.tickers = company.tickers,
  random.data = random.data
)
# `separate` needs one target column name per piece, so count the pieces in the
# longest cell and build that many column names. The "zqzcol" prefix is
# deliberately unusual so the temporary columns can be selected by prefix later.
# lengths() counts the pieces of every cell at once; seq_len() together with
# the 0L floor avoids the descending 1:0 / 1:-Inf sequences that 1:max(...)
# produces when there are no rows.
new_cols <- paste0(
  "zqzcol",
  seq_len(max(lengths(strsplit(df$company.tickers, ";", fixed = TRUE)), 0L))
)
# Steps of the pipeline:
#   1. `separate` spreads each ";"-delimited cell over the temporary columns
#   2. `pivot_longer` stacks those columns back into one ticker per row,
#      dropping the NA padding of the shorter cells
#   3. `filter` keeps only the tickers listed in `stocks`
#   4. `select` moves the ticker column to the front and drops the helper
#      `name` column created by `pivot_longer` (optional tidy-up)
df %>%
  tidyr::separate(col = company.tickers, into = new_cols, sep = ";") %>%
  tidyr::pivot_longer(
    cols = dplyr::starts_with("zqzcol"),
    values_to = "company.tickers",
    values_drop_na = TRUE
  ) %>%
  dplyr::filter(company.tickers %in% stocks) %>%
  dplyr::select(company.tickers, dplyr::everything(), -name)
#> Warning: Expected 3 pieces. Missing pieces filled with `NA` in 10 rows [1, 2, 3,
#> 5, 6, 7, 8, 9, 10, 11].
#> # A tibble: 10 x 2
#> company.tickers random.data
#> <chr> <dbl>
#> 1 CARD3 0.568
#> 2 CSAN3 0.0370
#> 3 ELET3 0.119
#> 4 ENBR3 0.276
#> 5 FESA4 0.196
#> 6 OIBR3 0.301
#> 7 PETR4 0.504
#> 8 PTBL3 0.712
#> 9 TUPY3 0.790
#> 10 VLID3 0.956
由reprex package(v0.3.0)在2020-03-05创建