我在尝试使用 Sparklyr 根据条件计算 Spark DataFrame 列中不同值的数量时遇到问题。这是我正在使用的代码:
library(sparklyr)
library(dplyr)

# Sample data ----
df <- data.frame(
  appl = c("Apple", "Microsoft", "Google", "Amazon", "Facebook", "Samsung", "IBM"),
  appl_y = c("y", "n", "y", "n", "y", "n", "y"),
  manu = c("USA", "USA", "USA", "China", "USA", "South Korea", "USA"),
  alternate_flag = c("y", "n", "y", "y", "n", "y", "n")
)

# Connect to Spark
sc <- spark_connect(master = "local")

# Create the Spark DataFrame
df_spark <- copy_to(sc, df, "df_spark")

# Group by 'manu' and summarize.
# NOTE: this conditional-subset form of n_distinct() does NOT translate to
# valid Spark SQL -- dbplyr wraps the CASE WHEN in ARRAY(), so the distinct
# counts come out wrong (this is the problem being asked about).
result <- df_spark %>%
  group_by(manu) %>%
  summarize(num_appl_y = n_distinct(appl[appl_y == 'y']),
            num_appl_flag = n_distinct(appl[alternate_flag == 'y'])
  )

# Show the result
collect(result)
目的是按 manu 列对数据进行分组,然后统计每组内相应的 appl_y 或 alternate_flag 列为 "y" 的行中 appl 列不同值的数量。然而这在 sparklyr 中不起作用:对 n_distinct() 使用条件子集写法时,翻译出的 SQL 给出了错误的计数。
这实际上是 dbplyr 的 SQL 翻译问题(参见 dbplyr issue #3253)。show_query() 显示生成的 SQL 如下:
SELECT
`manu`,
COUNT(DISTINCT(ARRAY(CASE WHEN (`appl_y` = "y") THEN (`appl`) END))) AS `num_appl_y`,
COUNT(DISTINCT(ARRAY(CASE WHEN (`alternate_flag` = "y") THEN (`appl`) END))) AS `num_appl_flag`
FROM `df_spark`
GROUP BY `manu`
可以看到,CASE WHEN 不匹配时产生的 NULL 被包进了 ARRAY() 中,这些数组中的空值会扰乱不同值的计数。
也许我遗漏了什么,但不如直接用 sum(appl_y == "y") 和 sum(alternate_flag == "y") 来统计每组中标记为 "y" 的行数?
library(sparklyr)
library(dplyr)
# Same sample data as the question
df <- data.frame(
appl = c("Apple", "Microsoft", "Google", "Amazon", "Facebook", "Samsung", "IBM"),
appl_y = c("y", "n", "y", "n", "y", "n", "y"),
manu = c("USA", "USA", "USA", "China", "USA", "South Korea", "USA"),
alternate_flag = c("y", "n", "y", "y", "n", "y", "n")
)
# Connect to Spark (2.4.3)
sc <- spark_connect(master = "local")
# Create the Spark DataFrame
df_spark <- copy_to(sc, df, "df_spark")
# Spark summary
# sum(as.integer(flag == "y")) translates cleanly to
# SUM(CAST(... AS INT)) in Spark SQL, avoiding the broken
# ARRAY(CASE WHEN ...) translation that n_distinct() produces.
# NOTE(review): this counts *rows* with "y", not distinct appl values --
# equivalent here only because each appl value appears once per group; confirm
# against real data before relying on it.
result <- df_spark %>%
group_by(manu) %>%
summarise(
num_appl_y = sum(as.integer(appl_y == "y")),
num_appl_flag = sum(as.integer(alternate_flag == "y"))
)
# Show the result
collect(result)
#> Warning: Missing values are always removed in SQL aggregation functions.
#> Use `na.rm = TRUE` to silence this warning
#> This warning is displayed once every 8 hours.
#> # A tibble: 3 × 3
#> manu num_appl_y num_appl_flag
#> <chr> <dbl> <dbl>
#> 1 South Korea 0 1
#> 2 China 0 1
#> 3 USA 4 2
show_query(result)
#> <SQL>
#> SELECT
#> `manu`,
#> SUM(CAST(`appl_y` = "y" AS INT)) AS `num_appl_y`,
#> SUM(CAST(`alternate_flag` = "y" AS INT)) AS `num_appl_flag`
#> FROM `df_spark`
#> GROUP BY `manu`
下面是在本地 df 上用 dplyr 计算的参考结果,供对照:
# reference summary
# Reference summary computed locally with dplyr (no Spark),
# showing the expected distinct counts per manufacturer.
df %>%
  group_by(manu) %>%
  summarise(
    num_appl_y    = n_distinct(appl[appl_y == "y"]),
    num_appl_flag = n_distinct(appl[alternate_flag == "y"])
  )
#> # A tibble: 3 × 3
#> manu num_appl_y num_appl_flag
#> <chr> <int> <int>
#> 1 China 0 1
#> 2 South Korea 0 1
#> 3 USA 4 2