我正在尝试执行逻辑回归，以预测 user_type 为“客户”（Customer）的用户转换为“订阅者”（Subscriber）的概率，所用的预测变量为“性别”（gender）、“出生年份”（birthyear）和“星期几”（day_of_week）。
这里有我的数据集的样本:
dput(head(dataset2,10))
structure(list(trip_id = 21742443:21742452, start_time = c("2019-01-01 0:04:37",
"2019-01-01 0:08:13", "2019-01-01 0:13:23", "2019-01-01 0:13:45",
"2019-01-01 0:14:52", "2019-01-01 0:15:33", "2019-01-01 0:16:06",
"2019-01-01 0:18:41", "2019-01-01 0:18:43", "2019-01-01 0:19:18"
), end_time = c("2019-01-01 0:11:07", "2019-01-01 0:15:34", "2019-01-01 0:27:12",
"2019-01-01 0:43:28", "2019-01-01 0:20:56", "2019-01-01 0:19:09",
"2019-01-01 0:19:03", "2019-01-01 0:20:21", "2019-01-01 0:47:30",
"2019-01-01 0:24:54"), bikeid = c(2167L, 4386L, 1524L, 252L,
1170L, 2437L, 2708L, 2796L, 6205L, 3939L), tripduration = c("390",
"441", "829", "1,783.00", "364", "216", "177", "100", "1,727.00",
"336"), from_station_id = c(199L, 44L, 15L, 123L, 173L, 98L,
98L, 211L, 150L, 268L), from_station_name = c("Wabash Ave & Grand Ave",
"State St & Randolph St", "Racine Ave & 18th St", "California Ave & Milwaukee Ave",
"Mies van der Rohe Way & Chicago Ave", "LaSalle St & Washington St",
"LaSalle St & Washington St", "St. Clair St & Erie St", "Fort Dearborn Dr & 31st St",
"Lake Shore Dr & North Blvd"), to_station_id = c(84L, 624L, 644L,
176L, 35L, 49L, 49L, 142L, 148L, 141L), to_station_name = c("Milwaukee Ave & Grand Ave",
"Dearborn St & Van Buren St (*)", "Western Ave & Fillmore St (*)",
"Clark St & Elm St", "Streeter Dr & Grand Ave", "Dearborn St & Monroe St",
"Dearborn St & Monroe St", "McClurg Ct & Erie St", "State St & 33rd St",
"Clark St & Lincoln Ave"), user_type = c("Subscriber", "Subscriber",
"Subscriber", "Subscriber", "Subscriber", "Subscriber", "Subscriber",
"Subscriber", "Subscriber", "Subscriber"), gender = c("Male",
"Female", "Female", "Male", "Male", "Female", "Male", "Male",
"Male", "Male"), birthyear = c(1989L, 1990L, 1994L, 1993L, 1994L,
1983L, 1984L, 1990L, 1995L, 1996L), ride_length = c("0:06:30",
"0:07:21", "0:13:49", "0:29:43", "0:06:04", "0:03:36", "0:02:57",
"0:01:40", "0:28:47", "0:05:36"), day_of_week = c(3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L)), row.names = c(NA, 10L), class = "data.frame")`
我正在尝试把数据拆分为训练集和测试集，但在此之前，我需要转换几列的数据类型，因为它们被错误地存储为字符型而不是数值型；特别是 ride_length 列，它需要转换为 hms 类型。
为了转换这些列，我执行了以下代码：
# Packages: tidyverse (parse_number, %>%), tidymodels (initial_split,
# training, testing) and hms (as_hms).
library(tidyverse)
library(tidymodels)
library(hms)

# Read the raw trip records; the columns converted below arrive as
# character or integer.
dataset2 <- read.csv("Bike_Trips_2019.csv")

# Outcome variable: recode "Customer"/"Subscriber" as a factor with
# labels "no"/"yes".
dataset2$user_type <- factor(
  dataset2$user_type,
  levels = c("Customer", "Subscriber"),
  labels = c("no", "yes")
)

# trip_id is an identifier rather than a quantity, so store it as text.
dataset2$trip_id <- as.character(dataset2$trip_id)

# Trip start/end timestamps: character -> POSIXct.
dataset2$start_time <- as.POSIXct(dataset2$start_time)
dataset2$end_time <- as.POSIXct(dataset2$end_time)

# tripduration is text with thousands separators (e.g. "1,783.00");
# parse_number() strips the grouping mark and returns a numeric vector.
dataset2$tripduration <- parse_number(dataset2$tripduration)

# NOTE: this is the character -> hms cast that raises the "Lossy cast"
# error reported below. Presumably some ride_length entries are not in a
# form as_hms() can parse -- TODO inspect the reported positions.
dataset2$ride_length <- as_hms(dataset2$ride_length)

# Reproducible 80/20 split, stratified on the outcome.
set.seed(421)
split <- initial_split(dataset2, prop = 0.8, strata = user_type)
train <- split %>% training()
test <- split %>% testing()
错误信息：
Error in abort_lossy_cast():
! Lossy cast from <character> to <hms> at position(s) 101, 146, 854, 1405, 7935, ... (and 187 more)
使用 {lubridate} 的 period 类型有什么问题吗？或者，如果它必须是 hms 类型，那么类似下面的代码应该可行：
# Sample durations written as "H:MM:SS" text.
ride_length <- c(
  "0:06:30", "0:07:21", "0:13:49", "0:29:43", "0:06:04",
  "0:03:36", "0:02:57", "0:01:40", "0:28:47", "0:05:36"
)

# Step 1: parse the text with lubridate::hms() and show the result.
ride_period <- lubridate::hms(ride_length)
ride_period
#> [1] "6M 30S"  "7M 21S"  "13M 49S" "29M 43S" "6M 4S"   "3M 36S"  "2M 57S"
#> [8] "1M 40S"  "28M 47S" "5M 36S"

# Step 2: pass the parsed values to hms::hms() to obtain an hms vector.
hms::hms(ride_period)
#> 00:06:30
#> 00:07:21
#> 00:13:49
#> 00:29:43
#> 00:06:04
#> 00:03:36
#> 00:02:57
#> 00:01:40
#> 00:28:47
#> 00:05:36
创建于 2024-03-17,使用 reprex v2.1.0