
问题描述 投票:2回答:1

我正在处理分类学(taxonomic)数据,在以图形方式展示之前,需要先对数据做进一步处理。但是,我需要按条件匹配各行并进行计算,这正是我卡住的地方——我不想手动逐行处理。我的数据:

# Example taxonomy table: every row belongs to phylum Chordata and each
# successive row fills in one more rank. Unresolved ranks are stored as the
# literal string "NA" (not a real missing value).
x <- data.frame(
  Phylum = rep("Chordata", 6),
  Class = c("NA", rep("Actinopterygii", 5)),
  Order = c(rep("NA", 2), rep("Gadiformes", 4)),
  Family = c(rep("NA", 3), rep("Moridae", 3)),
  Genus = c(rep("NA", 4), rep("Notophycis", 2)),
  Species = c(rep("NA", 5), "Notophycis marginata"),
  Number = c(21616, 12123, 1497, 730, 730, 730)
)


# Second example table with the same columns as the first: four rows, each one
# resolving the taxonomy further (Family, Genus and Species are all filled in
# together on the last row). Unresolved ranks are the literal string "NA".
y <- data.frame(
  Phylum = rep("Chordata", 4),
  Class = c("NA", rep("Actinopterygii", 3)),
  Order = c(rep("NA", 2), rep("Gadiformes", 2)),
  Family = c(rep("NA", 3), "Moridae"),
  Genus = c(rep("NA", 3), "Notophycis"),
  Species = c(rep("NA", 3), "Notophycis marginata"),
  Number = c(9493, 10626, 767, 730)
)


  • 在门(Phylum)相同的前提下:sum(Number[Phylum == "P1" & Class == "NA"]) - sum(Number[Class == "C1" & Order == "NA"]),结果即为 P1 的新数值
  • 在纲(Class)相同的前提下:sum(Number[Class == "C1" & Order == "NA"]) - sum(Number[Order == "O1" & Family == "NA"]),结果即为 C1 的新数值,依此类推……






Phylum  Class   Order   Family  Genus   Species Reads_sum
Chordata    Elasmobranchii  Carcharhiniformes   NA  NA  NA  31
Chordata    Actinopterygii  Perciformes Scombridae  NA  NA  589
Chordata    Elasmobranchii  Carcharhiniformes   Pentanchidae    NA  NA  31
Chordata    Actinopterygii  Myctophiformes  Myctophidae Notoscopelus    NA  208
Chordata    Actinopterygii  Perciformes Scombridae  Katsuwonus  NA  589
Chordata    Actinopterygii  Myctophiformes  Myctophidae Notoscopelus    Notoscopelus caudispinosus  178
Chordata    Actinopterygii  Perciformes Scombridae  Katsuwonus  Katsuwonus pelamis  589
Cnidaria    Hydrozoa    Leptothecata    Plumulariidae   NA  NA  69
Cnidaria    Hydrozoa    Leptothecata    Plumulariidae   Plumularia  NA  69
Echinodermata   Ophiuroidea NA  NA  NA  NA  146
Echinodermata   Ophiuroidea Ophiurida   NA  NA  NA  137
Echinodermata   Ophiuroidea Ophiurida   Ophiuridae  NA  NA  137
Echinodermata   Ophiuroidea Ophiurida   Ophiuridae  Ophioplinthus   NA  137
Echinodermata   Ophiuroidea Ophiurida   Ophiuridae  Ophioplinthus   Ophioplinthus accomodata    137
Mollusca    Cephalopoda Oegopsida   Ommastrephidae  NA  NA  34311
Ochrophyta  Phaeophyceae    Ectocarpales    Acinetosporaceae    NA  NA  29


# Manual calculation for a single class: reads recorded at class level for
# Ophiuroidea (Order is "NA") minus the total reads recorded at order level
# for the same class (Order filled in, Family still "NA").
with(
  Tester,
  Reads_sum[Class == "Ophiuroidea" & Order == "NA"] -
    sum(Reads_sum[Class == "Ophiuroidea" & Order != "NA" & Family == "NA"])
)


# For every class, compute the reads that belong to the class-level row itself:
# the class-level count (Order is "NA") minus the total of the order-level rows
# of that class (Order filled in, Family still "NA").
#
# The result is written only into the class-level rows of the class being
# processed; every other row keeps 0. Assigning the whole column with ifelse()
# on each pass would overwrite the results of earlier classes and recycle a
# short `yes` vector across unrelated rows.
Tester$Test.1 <- 0
for (i in setdiff(unique(Tester$Class), "NA")) {
  at_class_level <- Tester$Class == i & Tester$Order == "NA"
  at_order_level <- Tester$Class == i & Tester$Order != "NA" &
    Tester$Family == "NA"
  Tester$Test.1[at_class_level] <- Tester$Reads_sum[at_class_level] -
    sum(Tester$Reads_sum[at_order_level])
}



Phylum  Class   Order   Family  Genus   Species Reads_sum
Chordata    Elasmobranchii  Carcharhiniformes   Pentanchidae    NA  NA  31
Chordata    Actinopterygii  Myctophiformes  Myctophidae Notoscopelus    NA  30
Chordata    Actinopterygii  Myctophiformes  Myctophidae Notoscopelus    Notoscopelus caudispinosus  178
Chordata    Actinopterygii  Perciformes Scombridae  Katsuwonus  Katsuwonus pelamis  589
Cnidaria    Hydrozoa    Leptothecata    Plumulariidae   Plumularia  NA  69
Echinodermata   Ophiuroidea NA  NA  NA  NA  9
Echinodermata   Ophiuroidea Ophiurida   Ophiuridae  Ophioplinthus   Ophioplinthus accomodata    137
Mollusca    Cephalopoda Oegopsida   Ommastrephidae  NA  NA  34311
Ochrophyta  Phaeophyceae    Ectocarpales    Acinetosporaceae    NA  NA  29
r function loops for-loop split


我这样理解对吗:您的数据是一棵按 c("Phylum", "Class", "Order", "Family", "Genus", "Species") 顺序排列的树,并且对于树的每一层,您都想从该层的数值中减去其下一层的数值?



# This answer uses the data.table package, which is quick and convenient for
# complex problems. Uncomment the install line below if you do not have it.
# install.packages("data.table")
# The package must be attached, otherwise as.data.table() and `:=` below are
# not available.
library(data.table)

# Convert to a data.table (similar to a data.frame, with some differences).
dt <- as.data.table(Tester)
# Add a row id. Rows sit at different depths of the tree, so the data is
# split up later and the id is used to join the pieces back to these rows.
# seq_len(.N) also behaves correctly for a table with zero rows.
dt[, id := seq_len(.N)]

# Rank columns listed from the top of the tree to the bottom (this ordering
# is the answer's assumption about the data).
col_structure <- c("Phylum", "Class", "Order", "Family", "Genus", "Species")

# Reshape from wide to long (one row per id/rank pair) so the depth of each
# original row can be found by counting rows per id.
melt_dt <- melt(dt, measure.vars = col_structure, id.vars = "id")
# Tip: prefer a real NA over the string "NA"; built-in tools such as is.na()
# only recognise the former. Convert the placeholders, then drop them.
melt_dt[value == "NA", value := NA]
melt_dt <- na.omit(melt_dt, cols = "value")
# Number of filled-in ranks per id, i.e. how deep in the tree the row sits.
group_ids <- melt_dt[, .(N = .N), by = id]

# One table of ids per depth found in the data.
split_ids <- split(group_ids, group_ids[["N"]])
# Positions of those depth groups, for convenient iteration.
levels <- seq_len(length(split_ids))

# Merge the original data back in, so we hold the same rows as at the start
# but split into one table per tree depth. Each table is keyed by the rank
# columns that are filled in at its depth, so later lookups can be done by
# key instead of spelling out Phylum == "a" & Class == "b".
split_dt <- lapply(levels, function(x) {
  out <- merge(split_ids[[x]], dt, by = "id")
  # The list names of split_ids carry the depth (number of filled-in ranks).
  N <- as.numeric(names(split_ids)[x])
  setkeyv(out, col_structure[seq_len(N)])
  # Return the keyed table; without this the closure was left unterminated.
  out
})

# Add the new value. For each depth group except the last, look at the next
# group and subtract the reads recorded there from Reads_sum. Try it by hand
# with x = 1.
# The last group is left out here because there is nothing below it to
# subtract; it is appended unchanged when the results are combined.
# head(levels, -1) yields an empty set when there is only one group, whereas
# levels[1:(length(levels) - 1)] would index with c(1, 0).
split_dt_with_value <- lapply(head(levels, -1), function(x) {
  out <- split_dt[[x]]
  below <- split_dt[[x + 1]]
  # For every row, use its key columns to pull the matching rows from the
  # next group and total their reads. Rows with no match contribute NA,
  # which na.rm = TRUE turns into 0.
  reads_below <- vapply(seq_len(nrow(out)), function(i) {
    sum(below[J(out[i, key(out), with = FALSE])]$Reads_sum,
        na.rm = TRUE)
  }, numeric(1))
  out$Test.1 <- out$Reads_sum - reads_below
  out
})

# Combine the adjusted groups with the untouched last group. fill = TRUE is
# needed because the last group has no Test.1 column; it is filled with NA.
combined <- rbindlist(c(split_dt_with_value, split_dt[length(levels)]),
                      fill = TRUE)
# Turn it back into data frame form.
combined <- as.data.frame(combined)



© www.soinside.com 2019 - 2024. All rights reserved.