扫描一行数据,并根据该行的值创建一个新的长数据集,其中有一列包含二进制值

问题描述 投票:0回答:2

这是我的初始数据

# Example input: one row per respondent, holding the alternative chosen
# ("1", "2" or "3", stored as character) in each of the tasks t1..t4.
long5 <- structure(
  list(
    id = c(
      "R_88j7lG37gLfxk22", "R_6DK8lERVf8lSQf4", "R_eG8g4wMm8JsqNlI",
      "R_9TCgsW0sLA4xHOm", "R_6J5Obu2AvpCeu9w"
    ),
    .choice.t1 = c("2", "3", "3", "2", "1"),
    .choice.t2 = c("1", "3", "1", "3", "1"),
    .choice.t3 = c("1", "2", "2", "2", "3"),
    .choice.t4 = c("2", "1", "3", "2", "1")
  ),
  row.names = c(NA, -5L),
  class = c("tbl_df", "tbl", "data.frame")
)

它的第一行是这样的:

我想将其转换为:

正如您所看到的,对于每个 id,我都会检查 t1 到 t4 选择(从 1 到 3),然后创建一个名为

choice
的新列:如果选择为 1,则每种颜色的第 1 行为 1;如果选择为 2,则每种颜色的第 2 行为 1;如果选择为 3,则每种颜色的第 3 行为 1。我的实际数据集有更多行。

我尝试这样做,但它不起作用:

# Asker's attempt (reported in the question as not working).
# Rename every column except the first: split each name on "_", join the
# remaining pieces (reversed) with "_", then append the first piece after ".".
# NOTE(review): the `.choice.t#` names shown for long5 contain no "_", so this
# step only prepends "." to each name -- confirm that is intended.
names(long5)[-1] <- 
  strsplit(names(long5)[-1], '_') |>
  sapply(\(x) paste(paste(c(rev(x[-1])), collapse='_'), x[1], sep='.'))
# Two successive reshape() calls (direction = 'l' partially matches "long"),
# followed by type.convert(as.is = FALSE), which turns character columns that
# do not parse as logical/numeric into factors.
res <- reshape(as.data.frame(long5), varying=-1, direction='l') |>
  reshape(direction='l', varying=-(1:3), new.row.names=1:1e9, sep='_') |>
  type.convert(as.is=FALSE)
# Bind the first three columns of res to a design matrix without intercept,
# using full indicator (non-reference) coding for eth and pri.
# NOTE(review): env, eth and pri do not appear among the long5 columns shown
# here -- presumably left over from a different dataset; verify.
all<-cbind(res[1:3], model.matrix(~ 0 + env + eth + pri, res, 
                                  contrasts.arg = list(eth=contrasts(res$eth, contrasts=FALSE),
                                                       pri=contrasts(res$pri, contrasts=FALSE))))
r
2个回答
0
投票

我建议保留

.choice.t#
名称,并在每个
c(id, name)
分组中添加序列,这样,如果您的数据被重新排列,它将是明确的。

dplyr

library(dplyr)
library(tidyr) # pivot_longer
# Stack the choice columns into (name, value) pairs, then emit three rows per
# (id, name) group -- one per alternative -- flagging the chosen one with 1.
long5 %>%
  pivot_longer(cols = -id) %>%
  reframe(
    choices = 1:3,
    choice = as.integer(choices == value),
    .by = c("id", "name")
  )
# # A tibble: 60 × 4
#    id                name       choices choice
#    <chr>             <chr>        <int>  <int>
#  1 R_88j7lG37gLfxk22 .choice.t1       1      0
#  2 R_88j7lG37gLfxk22 .choice.t1       2      1
#  3 R_88j7lG37gLfxk22 .choice.t1       3      0
#  4 R_88j7lG37gLfxk22 .choice.t2       1      1
#  5 R_88j7lG37gLfxk22 .choice.t2       2      0
#  6 R_88j7lG37gLfxk22 .choice.t2       3      0
#  7 R_88j7lG37gLfxk22 .choice.t3       1      1
#  8 R_88j7lG37gLfxk22 .choice.t3       2      0
#  9 R_88j7lG37gLfxk22 .choice.t3       3      0
# 10 R_88j7lG37gLfxk22 .choice.t4       1      0
# # ℹ 50 more rows
# # ℹ Use `print(n = ...)` to see more rows

基础 R + reshape2

# Stack the choice columns into long form: one row per (id, variable) pair,
# with the chosen alternative in `value`.
out <- reshape2::melt(long5, id.vars = "id")
# Expand every row into three rows (alternatives 1..3) and flag the chosen
# alternative with 1. seq_len() keeps the iteration empty for zero-row input,
# whereas 1:nrow(out) would iterate over c(1, 0).
out <- do.call(
  rbind,
  lapply(seq_len(nrow(out)), function(rn) {
    transform(out[rn, ], choices = 1:3, row.names = NULL) |>
      transform(choice = +(choices == value))
  })
)
head(out)
#                  id   variable value choices choice
# 1 R_88j7lG37gLfxk22 .choice.t1     2       1      0
# 2 R_88j7lG37gLfxk22 .choice.t1     2       2      1
# 3 R_88j7lG37gLfxk22 .choice.t1     2       3      0
# 4 R_6DK8lERVf8lSQf4 .choice.t1     3       1      0
# 5 R_6DK8lERVf8lSQf4 .choice.t1     3       2      0
# 6 R_6DK8lERVf8lSQf4 .choice.t1     3       3      1

0
投票
library(tidyverse)

# Dummy data: a single respondent with the alternative chosen in each of the
# four tasks, stored as character.
df <- structure(
  list(
    id = "R_88j7lG37gLfxk22",
    t1_choice = "2",
    t2_choice = "1",
    t3_choice = "1",
    t4_choice = "2"
  ),
  row.names = c(NA, -1L),
  class = c("tbl_df", "tbl", "data.frame")
)

# Reshape to long form: one row per (id, task). The task label is the run of
# digits captured after the leading "t" of each column name.
df_long <- pivot_longer(
  df,
  cols = !id,
  names_to = "task",
  names_pattern = "t(\\d+).*",
  values_to = "choice"
)

# Expand a chosen alternative into a 0/1 indicator vector.
#
# x          Chosen alternative, numeric or a string holding a number
#            (e.g. "2"). NA values are ignored, giving an all-zero vector.
# n_choices  Number of alternatives; the length of the result. Defaults to 3.
#
# Returns a numeric vector of length `n_choices` with 1 at the chosen
# position(s) and 0 elsewhere. Stops if a choice lies outside 1..n_choices:
# an index that is too large would silently lengthen the vector, and a
# negative index would set every *other* position to 1.
expand_dummy <- function(x, n_choices = 3L) {
  idx <- as.integer(x)
  idx <- idx[!is.na(idx)]
  if (any(idx < 1L | idx > n_choices)) {
    stop("choice must be between 1 and ", n_choices, call. = FALSE)
  }
  out <- numeric(n_choices)
  out[idx] <- 1
  out
}

# Replace each (id, task) row with the indicator rows produced by
# expand_dummy(), one row per alternative.
group_by(df_long, id, task) |>
  reframe(choice = expand_dummy(choice))
#> # A tibble: 12 × 3
#>    id                task  choice
#>    <chr>             <chr>  <dbl>
#>  1 R_88j7lG37gLfxk22 1          0
#>  2 R_88j7lG37gLfxk22 1          1
#>  3 R_88j7lG37gLfxk22 1          0
#>  4 R_88j7lG37gLfxk22 2          1
#>  5 R_88j7lG37gLfxk22 2          0
#>  6 R_88j7lG37gLfxk22 2          0
#>  7 R_88j7lG37gLfxk22 3          1
#>  8 R_88j7lG37gLfxk22 3          0
#>  9 R_88j7lG37gLfxk22 3          0
#> 10 R_88j7lG37gLfxk22 4          0
#> 11 R_88j7lG37gLfxk22 4          1
#> 12 R_88j7lG37gLfxk22 4          0

创建于 2024-01-08,使用 reprex v2.0.2

© www.soinside.com 2019 - 2024. All rights reserved.