我有一组居民人口统计数据,需要对其进行清理,因为录入人员在系统中填错了一些信息。
在第一次互动时,某人可能不知道该人的种族或民族,但在与同一个人第二次或第三次互动时,他们可能会获得信息。
所以,我希望将后来获得的信息回填到之前的行中。每次互动都有一个事件编号(Incident.Number);如果是与同一个人互动,则会附带相同的 Resident.ID。您会注意到,同一个人在各条记录中的回答并不一致。
这是示例数据集:
# Sample data: one row per incident, with columns Incident.Number, Resident.ID,
# Sex, Ethnicity and Race (all character). Row names run from 553 to 583.
# The first five rows have Resident.ID = NA; the remaining rows share IDs across
# several incidents. "Unknown" and "Preferred not to Answer" appear as
# placeholder answers in the demographic columns.
df <- structure(list(Incident.Number = c("2024-00087969", "2024-00086500",
"2024-00084279", "2024-00083770", "2024-00082302", "2023-00436775",
"2023-00336547", "2023-00163396", "2023-00072504", "2023-00072378",
"2023-00071167", "2023-00097209", "2023-00079243", "2023-00077822",
"2024-00036963", "2023-00204364", "2023-00183672", "2023-00023113",
"2023-00266448", "2023-00258926", "2023-00257665", "2023-00269442",
"2023-00266719", "2023-00146792", "2023-00210787", "2024-00067990",
"2024-00103119", "2023-00446192", "2023-00403647", "2023-00158407",
"2023-00080653"), Resident.ID = c(NA, NA, NA, NA, NA, "653367039",
"653367039", "653367039", "653367039", "653367039", "653367039",
"393246263", "393246263", "393246263", "393246263", "223171241",
"223171241", "223171241", "404653450", "404653450", "404653450",
"488298040", "488298040", "488298040", "573993743", "573993743",
"573993743", "621872041", "621872041", "621872041", "747325855"
), Sex = c("Unknown", "Female", "Female", "Female", "Female",
"Female", "Female", "Unknown", "Female", "Female", "Female",
"Female", "Female", "Female", "Female", "Female", "Male", "Female",
"Female", "Other", "Other", "Female", "Female", "Female", "Male",
"Male", "Male", "Male", "Male", "Male", "Female"), Ethnicity = c("Non-Hispanic",
"Non-Hispanic", "Unknown", "Unknown", "Unknown", "Hispanic",
"Hispanic", "Hispanic", "Hispanic", "Hispanic", "Hispanic", "Non-Hispanic",
"Hispanic", "Hispanic", "Unknown", "Non-Hispanic", "Non-Hispanic",
"Non-Hispanic", "Unknown", "Non-Hispanic", "Non-Hispanic", "Non-Hispanic",
"Unknown", "Unknown", "Non-Hispanic", "Unknown", "Unknown", "Non-Hispanic",
"Preferred not to Answer", "Unknown", "Preferred not to Answer"
), Race = c("Black or African American", "Black or African American",
"Black or African American", "White", "White", "Preferred not to Answer",
"White", "White", "White", "Preferred not to Answer", "White",
"White", "White", "White", "White", "White", "White", "White",
"White", "White", "White", "White", "White", "White", "Black or African American",
"Black or African American", "Black or African American", "Black or African American",
"Black or African American", "Black or African American", "White"
)), row.names = 553:583, class = "data.frame")
对于 ID 653367039,我希望所有行都显示性别:女性、种族:白人、民族:西班牙裔
对于 ID 393246263,某些人口统计数据存在冲突,因此应采用出现次数最多的值。应返回性别:女性/民族:西班牙裔/种族:白人
如果居民 ID 为 NA,则人口统计列中不应发生任何变化。
我尝试了以下代码:
但它开始用 "Unknown" 或 "Preferred not to Answer" 覆盖实际的人口统计数据,而我想要的恰恰相反。
# Replace placeholder answers ("Unknown" / "Preferred not to Answer") with the
# most common valid (non-placeholder, non-NA) value found in `x`.
#
# Args:
#   x: character vector of answers, e.g. one demographic column for a single
#      resident.
#
# Returns:
#   A character vector the same length as `x`. Placeholder entries are replaced
#   by the most frequent valid value; valid entries and NA are left as they
#   are. If `x` contains no valid value at all, `x` is returned unchanged.
#   Ties are broken by `table()` order (alphabetical), via `which.max()`.
replace_unknown <- function(x) {
  placeholders <- c("Unknown", "Preferred not to Answer")
  is_placeholder <- x %in% placeholders

  # Only real answers may vote; NA is dropped explicitly so an all-NA input
  # does not yield an empty replacement value.
  valid <- x[!is_placeholder & !is.na(x)]
  if (length(valid) == 0L) {
    # Nothing to copy from: keep the placeholders exactly as recorded.
    return(x)
  }

  most_common <- names(which.max(table(valid)))
  x[is_placeholder] <- most_common
  x
}
# Apply the logic to the dataframe: within each resident, run
# replace_unknown() over Race, Ethnicity and Sex.
# Groups whose Resident.ID is NA cannot be linked to one person, so their
# demographic columns are returned untouched.
df_cleaned <- df %>%
  group_by(Resident.ID) %>%
  mutate(
    across(
      c(Race, Ethnicity, Sex),
      # Resident.ID is constant within a group, so a scalar if/else is enough;
      # no element-wise ifelse() is needed.
      function(x) if (all(is.na(Resident.ID))) x else replace_unknown(x)
    )
  ) %>%
  # Drop the grouping so later steps do not silently operate per resident.
  ungroup()
我一直在尝试不同的版本,但它不起作用。我期待着一些帮助。 TT.TT
按 `Resident.ID` 分组,将各列替换为组内出现次数最多的有效值:
library(dplyr)
# Per resident, overwrite every value in the columns Sex through Race with the
# most frequent answer that is neither "Unknown" nor "Preferred not to Answer".
# A column is left as is when the group's Resident.ID is all NA or when the
# group has no such answer to copy from.
df %>%
  mutate(
    across(Sex:Race, function(values) {
      placeholders <- c("Unknown", "Preferred not to Answer")
      known <- values[!values %in% placeholders]
      if (all(is.na(Resident.ID)) || length(known) == 0) {
        return(values)
      }
      names(which.max(table(known)))
    }),
    .by = Resident.ID
  )
结果:
Incident.Number Resident.ID Sex Ethnicity Race
553 2024-00087969 <NA> Unknown Non-Hispanic Black or African American
554 2024-00086500 <NA> Female Non-Hispanic Black or African American
555 2024-00084279 <NA> Female Unknown Black or African American
556 2024-00083770 <NA> Female Unknown White
557 2024-00082302 <NA> Female Unknown White
558 2023-00436775 653367039 Female Hispanic White
559 2023-00336547 653367039 Female Hispanic White
560 2023-00163396 653367039 Female Hispanic White
561 2023-00072504 653367039 Female Hispanic White
562 2023-00072378 653367039 Female Hispanic White
563 2023-00071167 653367039 Female Hispanic White
564 2023-00097209 393246263 Female Hispanic White
565 2023-00079243 393246263 Female Hispanic White
566 2023-00077822 393246263 Female Hispanic White
567 2024-00036963 393246263 Female Hispanic White
568 2023-00204364 223171241 Female Non-Hispanic White
569 2023-00183672 223171241 Female Non-Hispanic White
570 2023-00023113 223171241 Female Non-Hispanic White
571 2023-00266448 404653450 Other Non-Hispanic White
572 2023-00258926 404653450 Other Non-Hispanic White
573 2023-00257665 404653450 Other Non-Hispanic White
574 2023-00269442 488298040 Female Non-Hispanic White
575 2023-00266719 488298040 Female Non-Hispanic White
576 2023-00146792 488298040 Female Non-Hispanic White
577 2023-00210787 573993743 Male Non-Hispanic Black or African American
578 2024-00067990 573993743 Male Non-Hispanic Black or African American
579 2024-00103119 573993743 Male Non-Hispanic Black or African American
580 2023-00446192 621872041 Male Non-Hispanic Black or African American
581 2023-00403647 621872041 Male Non-Hispanic Black or African American
582 2023-00158407 621872041 Male Non-Hispanic Black or African American
583 2023-00080653 747325855 Female Preferred not to Answer White