计算患病患者食用最多的食物

问题描述 投票:-2回答:2

我有一个如下的数据集,其中有很多列。其中一些列的标题为:

baked_hamburgur,spinach,mashed_potato,cabbages,jello,rolls,brown,milk,coffee,water,cakes,vanilla,chocolate,fruitsalad

数据集中还有其他列,但目前我只对以上这些列感兴趣。

这些列中每一行的值为 yes 或 no。

此数据的屏幕截图如下,因为我无法在问题本身中附加/共享此文件。

My Dataset with many columns for Data Analysis of Ill Patients

dput(head())输出如下:

> dput(head(illness_data))

structure(list(Age = structure(c(18L, 26L, 22L, 25L, 29L, 13L
), .Label = c("10", "106", "11", "12", "14", "15", "16", "17", 
"18", "19", "2", "20", "22", "23", "24", "25", "26", "27", "28", 
"30", "31", "32", "33", "34", "36", "38", "39", "4", "42", "43", 
"44", "45", "46", "48", "5", "7", "8", "9", "seven"), class = "factor"), 
    sex = structure(c(3L, 2L, 3L, 3L, 2L, 3L), .Label = c("-1", 
    "Female", "Male"), class = "factor"), timesupper = c(2000L, 
    1830L, 1830L, 1930L, 1930L, 1930L), ill = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = "yes", class = "factor"), onsetdate = structure(c(4L, 
    4L, 4L, 1L, 1L, 4L), .Label = c("18-Apr", "18-Jun", "18/4", 
    "19-Apr"), class = "factor"), onsettime = c(30L, 30L, 30L, 
    2230L, 2230L, 200L), baked_hamburgur = structure(c(2L, 2L, 
    2L, 2L, 2L, 1L), .Label = c("no", "yes"), class = "factor"), 
    spinach = structure(c(2L, 2L, 2L, 2L, 2L, 1L), .Label = c("no", 
    "yes"), class = "factor"), mashed_potato = structure(c(2L, 
    2L, 1L, 1L, 2L, 1L), .Label = c("no", "yes"), class = "factor"), 
    cabbages = structure(c(1L, 2L, 1L, 2L, 1L, 1L), .Label = c("no", 
    "yes"), class = "factor"), jello = structure(c(1L, 1L, 1L, 
    2L, 2L, 1L), .Label = c("no", "yes"), class = "factor"), 
    rolls = structure(c(2L, 1L, 1L, 1L, 2L, 1L), .Label = c("no", 
    "yes"), class = "factor"), brown = structure(c(1L, 1L, 1L, 
    1L, 2L, 1L), .Label = c("no", "yes"), class = "factor"), 
    milk = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("no", 
    "yes"), class = "factor"), coffee = structure(c(2L, 2L, 2L, 
    1L, 2L, 1L), .Label = c("no", "yes"), class = "factor"), 
    water = structure(c(1L, 1L, 1L, 2L, 2L, 1L), .Label = c("no", 
    "yes"), class = "factor"), cakes = structure(c(1L, 1L, 2L, 
    1L, 1L, 1L), .Label = c("no", "yes"), class = "factor"), 
    vanilla = structure(c(2L, 2L, 2L, 2L, 2L, 2L), .Label = c("no", 
    "yes"), class = "factor"), chocolate = structure(c(1L, 2L, 
    2L, 1L, 1L, 2L), .Label = c("no", "yes"), class = "factor"), 
    fruitsalad = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("no", 
    "yes", "yes</pre></body></html>Ztext/plain\b\v\035(F]l~Ó_Ý\026R\002\001"
    ), class = "factor")), .Names = c("Age", "sex", "timesupper", 
"ill", "onsetdate", "onsettime", "baked_hamburgur", "spinach", 
"mashed_potato", "cabbages", "jello", "rolls", "brown", "milk", 
"coffee", "water", "cakes", "vanilla", "chocolate", "fruitsalad"
), row.names = c(NA, 6L), class = "data.frame")

完整的dput命令输出如下:

> dput(illness_data)

structure(list(Age = structure(c(18L, 26L, 22L, 25L, 29L, 13L, 
36L, 8L, 11L, 7L, 24L, 10L, 8L, 35L, 34L, 6L, 22L, 39L, 12L, 
9L, 36L, 17L, 9L, 20L, 37L, 27L, 32L, 30L, 21L, 24L, 3L, 18L, 
33L, 16L, 5L, 31L, 28L, 14L, 19L, 38L, 2L, 4L, 23L, 1L, 18L, 
15L), .Label = c("10", "106", "11", "12", "14", "15", "16", "17", 
"18", "19", "2", "20", "22", "23", "24", "25", "26", "27", "28", 
"30", "31", "32", "33", "34", "36", "38", "39", "4", "42", "43", 
"44", "45", "46", "48", "5", "7", "8", "9", "seven"), class = "factor"), 
    sex = structure(c(3L, 2L, 3L, 3L, 2L, 3L, 3L, 3L, 2L, 3L, 
    3L, 2L, 3L, 3L, 2L, 2L, 3L, 3L, 3L, 2L, 2L, 1L, 3L, 3L, 3L, 
    2L, 2L, 3L, 2L, 3L, 3L, 3L, 2L, 3L, 2L, 2L, 3L, 3L, 3L, 3L, 
    3L, 3L, 2L, 3L, 2L, 3L), .Label = c("-1", "Female", "Male"
    ), class = "factor"), timesupper = c(2000L, 1830L, 1830L, 
    1930L, 1930L, 1930L, 2200L, 1900L, 1930L, NA, NA, NA, NA, 
    2200L, NA, NA, NA, 2200L, NA, NA, 2200L, 2200L, NA, NA, 2200L, 
    NA, NA, NA, NA, NA, 1900L, NA, 1100L, NA, NA, NA, 2200L, 
    1930L, 1930L, 2200L, NA, NA, 1930L, 1930L, NA, NA), ill = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
    ), .Label = "yes", class = "factor"), onsetdate = structure(c(4L, 
    4L, 4L, 1L, 1L, 4L, 4L, 2L, 4L, 4L, 4L, 1L, 1L, 4L, 1L, 3L, 
    1L, 4L, 1L, 1L, 4L, 4L, 1L, 1L, 4L, 1L, 4L, 4L, 1L, 4L, 4L, 
    1L, 1L, 1L, 1L, 1L, 4L, 4L, 1L, 4L, 4L, 4L, 4L, 1L, 4L, 1L
    ), .Label = c("18-Apr", "18-Jun", "18/4", "19-Apr"), class = "factor"), 
    onsettime = c(30L, 30L, 30L, 2230L, 2230L, 200L, 100L, 2300L, 
    200L, 1030L, 30L, 2215L, 2200L, 100L, 2300L, 2145L, 2145L, 
    100L, 2300L, 2100L, 100L, 100L, 2115L, 2330L, 100L, 2130L, 
    230L, 200L, 2130L, 30L, 100L, 2230L, 1500L, 2400L, 2300L, 
    2230L, 100L, 230L, 2330L, 100L, 30L, 30L, 100L, 2400L, 215L, 
    2300L), baked_hamburgur = structure(c(2L, 2L, 2L, 2L, 2L, 
    1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 
    1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 
    2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L), .Label = c("no", 
    "yes"), class = "factor"), spinach = structure(c(2L, 2L, 
    2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 
    1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 
    1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L), .Label = c("no", 
    "yes"), class = "factor"), mashed_potato = structure(c(2L, 
    2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 
    2L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 
    2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L
    ), .Label = c("no", "yes"), class = "factor"), cabbages = structure(c(1L, 
    2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 
    2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 
    2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L
    ), .Label = c("no", "yes"), class = "factor"), jello = structure(c(1L, 
    1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 
    1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 
    1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L
    ), .Label = c("no", "yes"), class = "factor"), rolls = structure(c(2L, 
    1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 
    2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 
    2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L
    ), .Label = c("no", "yes"), class = "factor"), brown = structure(c(1L, 
    1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 
    2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 
    1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L
    ), .Label = c("no", "yes"), class = "factor"), milk = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 
    1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
    ), .Label = c("no", "yes"), class = "factor"), coffee = structure(c(2L, 
    2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 
    1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 
    2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L
    ), .Label = c("no", "yes"), class = "factor"), water = structure(c(1L, 
    1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 
    2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 
    1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L
    ), .Label = c("no", "yes"), class = "factor"), cakes = structure(c(1L, 
    1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 
    1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L
    ), .Label = c("no", "yes"), class = "factor"), vanilla = structure(c(2L, 
    2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
    ), .Label = c("no", "yes"), class = "factor"), chocolate = structure(c(1L, 
    2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 
    1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, NA, 1L, 1L, 
    2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L
    ), .Label = c("no", "yes"), class = "factor"), fruitsalad = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L
    ), .Label = c("no", "yes", "yes</pre></body></html>Ztext/plain\b\v\035(F]l~Ó_Ý\026R\002\001"
    ), class = "factor")), .Names = c("Age", "sex", "timesupper", 
"ill", "onsetdate", "onsettime", "baked_hamburgur", "spinach", 
"mashed_potato", "cabbages", "jello", "rolls", "brown", "milk", 
"coffee", "water", "cakes", "vanilla", "chocolate", "fruitsalad"
), class = "data.frame", row.names = c(NA, -46L))

R已将这些列正确读取为因子类型变量。

这些列中的每一列都对应医院中患病患者食用过的一种食物。

我想用R找出患病患者食用最多的食物。

请建议这样做的好方法。谢谢!

注意,除了下面这个URL中提到的选项,我没有尝试过其他任何选项。但是,我无法使其工作。

Count Factor Columns Using R

r data-science analysis eda
2个回答
1
投票

由于所有行都具有ill = 'yes',因此我们可以在每一列中计算带有yes的值的数量。基本的R方法可能是:

# Count the "yes" answers in each food column (columns 7 to 20) and show the
# five most frequently eaten foods.
# na.rm = TRUE is required: `chocolate` contains an NA, so without it
# colSums() returns NA for that column and sort() silently drops it from
# the ranking.
# NOTE(review): one `fruitsalad` value is a corrupted "yes..." string in the
# posted data, so it is not counted as "yes" here.
yes_counts <- colSums(illness_data[7:20] == "yes", na.rm = TRUE)
head(sort(yes_counts, decreasing = TRUE), 5)

#        vanilla baked_hamburgur           cakes         spinach       chocolate 
#             43              29              27              26              25 

我选择了第7到20列,因为只有这些列是食物列。另外,我只取了前5个值;更改head命令中的数字5即可取任意数量的值。


1
投票

我不确定您具体想要什么,但下面的代码会计算每种食物被食用的次数(使用tidyverse包):

library(tidyverse)

# Columns that are not foods; everything else is a yes/no food column.
# Keeping the list in one place guarantees both steps below exclude the same
# columns.
non_food <- c("Age", "sex", "timesupper", "ill", "onsetdate", "onsettime")

# Step 1: recode each food column to 1 ("yes") / 0 (anything else); NA stays NA.
# Step 2: sum each recoded column, ignoring NA, to get the number of patients
# who ate that food. The result is a one-row data frame, one column per food.
illness_data_summed <- illness_data %>%
  mutate(across(-all_of(non_food), ~ ifelse(. == "yes", 1, 0))) %>%
  summarise(across(-all_of(non_food), ~ sum(., na.rm = TRUE)))

# Keep only the food(s) whose count equals the maximum (ties are all kept).
illness_data_summed[which(illness_data_summed == max(illness_data_summed))]

首先,我将yes转换为1,将no转换为0,这样每一列的总和就表示该食物被食用的次数。我对所有列都执行此操作,但不感兴趣的列除外(在列选择中用-表示)。如果需要,也可以反过来直接列出要转换的列(例如,当要转换的列比不需要转换的列少时)。

最后一行的输出为:

  vanilla
1      43
© www.soinside.com 2019 - 2024. All rights reserved.