这是我的数据框:
或者您可以复制它:
structure(list(A_levels = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
College_Uni = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), CSEs = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), NVQ_HND_HNC = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), O_levels_GCSEs = c(1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1), Other_prof_qual = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0), Prefer_not_to_answer = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0), None_of_the_above = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0), id = 5000:5010), row.names = c(NA, -11L), class = c("tbl_df",
"tbl", "data. Frame")) -> df
我想改变一个 edu_level 的新变量,它的值是这样的:
#my vairabe edu_level
#College_Uni high
#A_levels medium
#O_levels_GCSEs medium
#CSEs) low
#NVQ_HND_HNC low
#Other_prof_qual low
#Prefer_not_to_answer NA
#None_of_the_above NA
我想可能因为是选择题,所以有些人不止两个级别,所以我不知道如何指定分类。
我想先选择它的edu_level的优先级是highest degree。比如它同时有College_Uni和A_levels,那么edu_level的值就高
我创建了一个具有不同教育水平的数据框来展示解决方案;
library(dplyr)
library(tidyr)
myvars %>%
mutate(num_level = case_when(is.na(edu_level) ~ 0,
edu_level == "low" ~ 1,
edu_level == "medium" ~ 2,
edu_level == "high" ~ 3)) -> myvars
df1 %>%
pivot_longer(-id, names_to = "degree") %>%
filter(value != 0) %>%
right_join(., myvars, join_by(degree == variable)) %>%
slice(which.max(num_level), .by = id) %>%
select(-value, -num_level) %>% ## you can also drop "degree"
left_join(., df1)
#> Joining with `by = join_by(id)`
#> # A tibble: 11 x 11
#> id degree edu_level A_levels College_Uni CSEs NVQ_HND_HNC O_levels_GCSEs
#> <int> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 5000 Colleg~ high 0 1 0 0 1
#> 2 5001 A_leve~ medium 1 0 0 0 1
#> 3 5002 A_leve~ medium 1 0 0 0 1
#> 4 5003 O_leve~ medium 0 0 0 1 1
#> 5 5004 Colleg~ high 0 1 0 0 1
#> 6 5005 Other_~ low 0 0 0 0 0
#> 7 5006 Colleg~ high 0 1 0 1 1
#> 8 5007 CSEs low 0 0 1 0 0
#> 9 5008 Colleg~ high 0 1 0 0 0
#> 10 5009 Prefer~ <NA> 0 0 0 0 0
#> 11 5010 None_o~ <NA> 0 0 0 0 0
#> # i 3 more variables: Other_prof_qual <dbl>, Prefer_not_to_answer <dbl>,
#> # None_of_the_above <dbl>
data.frame(A_levels = c(0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0),
College_Uni = c(1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0),
CSEs = c(0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0),
NVQ_HND_HNC = c(0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0),
O_levels_GCSEs = c(1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0),
Other_prof_qual = c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0),
Prefer_not_to_answer = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0),
None_of_the_above = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1),
id = 5000:5010) -> df1
read.table(text = "variable edu_level
College_Uni high
A_levels medium
O_levels_GCSEs medium
CSEs low
NVQ_HND_HNC low
Other_prof_qual low
Prefer_not_to_answer NA
None_of_the_above NA", header = T, stringsAsFactor = F) -> myvars
这是一种方法
library(dplyr)
keyval$edu_level <- ordered(keyval$edu_level, levels = c("low", "medium", "high"))
df1 %>%
mutate(edu_level = do.call(pmax, c(across(any_of(keyval$variable),
~ case_when(as.logical(.x) ~ keyval$edu_level[match(cur_column(),
keyval$variable)])), na.rm = TRUE))
)
-输出
A_levels College_Uni CSEs NVQ_HND_HNC O_levels_GCSEs Other_prof_qual Prefer_not_to_answer None_of_the_above id edu_level
1 0 1 0 0 1 0 0 0 5000 high
2 1 0 0 0 1 0 0 0 5001 medium
3 1 0 0 0 1 0 0 0 5002 medium
4 0 0 0 1 1 0 0 0 5003 medium
5 0 1 0 0 1 0 0 0 5004 high
6 0 0 0 0 0 1 0 0 5005 low
7 0 1 0 1 1 0 0 0 5006 high
8 0 0 1 0 0 0 0 0 5007 low
9 0 1 0 0 0 0 0 0 5008 high
10 0 0 0 0 0 0 1 0 5009 <NA>
11 0 0 0 0 0 0 0 1 5010 <NA>
试试这个,如果有帮助请告诉我。我将您的调查从 0 和 1 转换为排名数字,然后从每一行中找到最大数字。
可重现的数据
dat <- data.frame(A_levels = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
College_Uni = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
CSEs = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
NVQ_HND_HNC = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
O_levels_GCSEs = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
Other_prof_qual = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
Prefer_not_to_answer = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
None_of_the_above = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
id = 5000:5010)
scoring_df <- data.frame(variable = c("A_levels", "College_Uni", "CSEs", "NVQ_HND_HNC", "O_levels_GCSEs", "Other_prof_qual", "Prefer_not_to_answer","None_of_the_above"),
edu_level = c("medium", "high", "low","low","medium","low",NA, NA),
edu_score = c(2,3,1,1,2,1,NA,NA))
# I added the "edu_score"
作业
# remove the id column
dat_scores_only <- dat[, -which(colnames(dat) == "id")]
# fetch the numeric scores for each column
scoring_ordered <- scoring_df$edu_score[match(scoring_df$variable, colnames(dat) )]
# replace 0s & 1s with scores using multiplication, https://stackoverflow.com/questions/36111444/multiply-columns-in-a-data-frame-by-a-vector
results <- t(t(dat_scores_only)*scoring_ordered)
# now check for the maximum level per row
# using apply(matrix, by row = 1 and by column = 2, function is max, argument is to ignore NAs)
max_values_by_row <- apply(results, 1, max, na.rm = T)
# and look back up the score in your scoring df to get the words you wanted per row as a new variable "edu_level"
dat$edu_level <- scoring_df[match(max_values_by_row,scoring_df$edu_score), "edu_level"]
结果(见最右栏)
dat
# A_levels College_Uni CSEs NVQ_HND_HNC O_levels_GCSEs Other_prof_qual Prefer_not_to_answer None_of_the_above id edu_score
#1 0 1 0 0 1 0 0 0 5000 high
#2 0 1 0 0 1 0 0 0 5001 high
#3 0 1 0 0 1 0 0 0 5002 high
#4 0 1 0 0 1 0 0 0 5003 high
#5 0 1 0 0 1 0 0 0 5004 high
#6 0 1 0 0 1 0 0 0 5005 high
#7 0 1 0 0 1 0 0 0 5006 high
#8 0 1 0 0 1 0 0 0 5007 high
#9 0 1 0 0 1 0 0 0 5008 high
#10 0 1 0 0 1 0 0 0 5009 high
#11 0 1 0 0 1 0 0 0 5010 high