这里是我的火车样本的 dput
dput(head(train,10))
train=structure(list(d_event_date = structure(c(19702, 19702, 19702,
19702, 19702, 19702, 19702, 19702, 19702, 19702), class = "Date"),
id_placement = c(20199549L, 20396189L, 20420889L, 16751495L,
17220762L, 15387714L, 20355507L, 20385438L, 17752382L, 21253771L
), id_zone = c(1632853L, 2836041L, 2919363L, 1709628L, 9937L,
1274157L, 1400727L, 2907891L, 1948324L, 1863050L), id_publisher = c(302501L,
480683L, 1204726L, 4468L, 5515L, 163955L, 176741L, 421345L,
441862L, 467632L), id_advertiser = c(914L, 914L, 914L, 914L,
914L, 914L, 914L, 914L, 914L, 914L), id_campaign = c(687314L,
687314L, 687314L, 723900L, 723900L, 723900L, 723900L, 723900L,
723900L, 723900L), id_banner = c(2032222L, 2032222L, 2032222L,
2136674L, 2136674L, 2136674L, 2136674L, 2136674L, 2136674L,
2136674L), id_landing = c(4772102L, 4772102L, 4699256L, 4821730L,
4821514L, 4821513L, 4821514L, 4823676L, 4821514L, 4821730L
), id_ad_unit = c(28L, 28L, 28L, 28L, 28L, 28L, 28L, 28L,
28L, 28L), n_landing_pricing_type = c(3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L), n_impression_qd = c(1L, 8L, 1L, 1L,
25L, 4L, 35L, 1L, 5L, 1L), n_click_qd = structure(c(0, 0,
0, 0, 0, 0, 0, 0, 0, 0), class = "integer64"), n_conversion_qd = c(1L,
1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L), n_gross = c(0.0024,
0.0024, 0.0024, 0.008, 0.032499998, 0.0054, 0.064999996,
0.0076, 0.0129, 0.0239), n_net = c(0.001632, 0.00036, 0.001632,
0.0052, 0, 0.0027, 0.05135, 0.0038, 0.00774, 0.016252)), row.names = c(NA,
10L), class = "data.frame")
我使用 Arima 创建时间序列模型。 这是我的代码
# Function for data normalization
normalize_data <- function(data, method) {
normalized_data <- NULL
if (method == "none") {
normalized_data <- data
} else if (method == "min-max") {
min_val <- min(data)
max_val <- max(data)
normalized_data <- (data - min_val) / (max_val - min_val)
} else if (method == "z-score") {
mean_val <- mean(data)
std_val <- sd(data)
normalized_data <- (data - mean_val) / std_val
} else if (method == "cubic-root") {
normalized_data <- sign(data) * abs(data)^(1/3)
} else if (method == "logarithmic") {
normalized_data <- log(1 + data)
}
return(normalized_data)
}
# Function for calculating MAPE
calculate_mape <- function(actual, forecasted) {
mape <- mean(abs((actual - forecasted) / actual)) * 100
return(map)
}
# Normalization and model building for each variable
variables <- c("n_impression_qd", "n_conversion_qd", "n_gross")
normalization_methods <- c("none", "min-max", "z-score", "cubic-root", "logarithmic")
mape_results <- data.frame(variable = character(), normalization = character(), mape = numeric(), stringsAsFactors = FALSE)
for (variable in variables) {
for (method in normalization_methods) {
normalized_data <- normalize_data(data_filtered[[variable]], method)
# Convert to time series object
ts_data <- ts(normalized_data)
# Building an ARIMA model
model <- auto.arima(ts_data)
#Forecasting
forecast <- forecast(model, h = 10)$mean
# MAPE calculation
actual <- tail(data_filtered[[variable]], 10)
mape <- calculate_mape(actual, forecast)
# Saving results
result <- data.frame(variable = variable, normalization = method, mape = mape)
mape_results <- rbind(mape_results, result)
}
}
# Output results
print(map_results)
它有效,但是当我尝试在我的测试样本上应用模型时 例如这样
# Convert 'd_event_date' column to date format
test_data$d_event_date <- as.Date(test_data$d_event_date)
# Function to denormalize data
denormalize_data <- function(data, method, min_val, max_val, mean_val, std_val) {
denormalized_data <- NULL
if (method == "none") {
denormalized_data <- data
} else if (method == "min-max") {
denormalized_data <- data * (max_val - min_val) + min_val
} else if (method == "z-score") {
denormalized_data <- data * std_val + mean_val
} else if (method == "cubic-root") {
denormalized_data <- sign(data) * abs(data)^3
} else if (method == "logarithmic") {
denormalized_data <- exp(data) - 1
}
return(denormalized_data)
}
# Variables and normalization methods
variables <- c("n_impression_qd", "n_conversion_qd", "n_gross")
normalization_methods <- c("none", "min-max", "z-score", "cubic-root", "logarithmic")
# Result storage
prediction_results <- data.frame(variable = character(), actual = numeric(), predicted = numeric(), stringsAsFactors = FALSE)
# Predict and calculate MAPE for each variable and normalization method
for (variable in variables) {
for (method in normalization_methods) {
# Retrieving data and normalization parameters for a variable
data <- test_data[[variable]]
min_val <- min(data)
max_val <- max(data)
mean_val <- mean(data)
std_val <- sd(data)
# Data normalization
normalized_data <- normalize_data(data, method)
# Convert to time series object
ts_data <- ts(normalized_data)
#Forecasting
forecast <- forecast(model, h = length(ts_data))$mean
# Denormalization of predicted values
denormalized_forecast <- denormalize_data(forecast, method, min_val, max_val, mean_val, std_val)
# MAPE calculation
actual <- data
mape <- calculate_mape(actual, denormalized_forecast)
# Saving results
result <- data.frame(variable = variable, actual = actual, predicted = denormalized_forecast)
prediction_results <- rbind(prediction_results, result)
}
}
i get the error
Error in `[<-.ts`(`*tmp*`, ri, value = c(135.945603813953, 177.486351819609, :
Only element replacements are allowed
示例测试样本
dput(test_data)
structure(list(d_event_date = structure(c(19702L, 19702L, 19702L,
19702L, 19702L, 19702L, 19702L, 19702L, 19702L, 19702L), class = c("IDate",
"Date")), id_placement = c(20794491L, 20794491L, 17788422L, 20654348L,
21496171L, 18582636L, 19840341L, 17218292L, 18423615L, 10973945L
), id_zone = c(2859002L, 2859002L, 1781319L, 1733870L, 2096492L,
1933910L, 2717419L, 1850302L, 2245073L, 46564L), id_publisher = c(352465L,
352465L, 143258L, 304470L, 627080L, 59862L, 1052120L, 425742L,
38L, 34624L), id_advertiser = c(914L, 914L, 914L, 914L, 914L,
2834L, 2834L, 8579L, 15862L, 15862L), id_campaign = c(689656L,
689656L, 723900L, 723900L, 723900L, 873762L, 882336L, 360584L,
872210L, 872213L), id_banner = c(2038567L, 2038567L, 2136674L,
2136674L, 2136674L, 2546849L, 2568794L, 1181453L, 2543069L, 2543072L
), id_landing = c(3814050L, 4590592L, 4823675L, 4821514L, 4823676L,
4601283L, 4811884L, 4801168L, 4595695L, 4595698L), id_ad_unit = c(5L,
5L, 28L, 28L, 28L, 27L, 29L, 28L, 28L, 28L), n_landing_pricing_type = c(3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L), n_impression_qd = c(13L,
11L, 2L, 1L, 8L, 5L, 349L, 2L, 132L, 7L), n_click_qd = structure(c(0,
0, 0, 0, 0, 0, 1.15611361126852e-321, 0, 0, 0), class = "integer64"),
n_conversion_qd = c(1L, 2L, 1L, 1L, 3L, 2L, 5L, 1L, 3L, 1L
), n_gross = c(0.003, 0.0065, 0.0124, 0.0167, 0.090299998,
0.00054504, 0.03256, 0.0263, 0.08052, 0.00404), n_net = c(9e-04,
0.00195, 0.00868, 0.012024, 0.0686280016, 0.0003706272, 0.0221408,
0.017884, 0.032208, 0.00202)), row.names = c(NA, 10L), class = "data.frame")
为什么会出现这个错误,我该如何修复它? 感谢您的帮助。 预测后,我将数据非规范化回绝对值。 对于测试样本中的每个变量,所需的结果是使用一种或另一种标准化方法构建的每个模型的实际值和预测值
structure(list(n_impression_qd_actual = 12:16, n_impression_qd_pred = 12:16,
model.normalize_n_impression_qd = c("none", "min-max", "z-score",
"cubic-root", "logarithmic"), n_conversion_qd_actual = 12:16,
n_conversion_qd_pred = 12:16, model.normalize_n_n_conversion_qd = c("none",
"min-max", "z-score", "cubic-root", "logarithmic"), n_gross_actual = 12:16,
n_gross_pred = 12:16, n_gross_model.normalize = c("none",
"min-max", "z-score", "cubic-root", "logarithmic")), class = "data.frame", row.names = c(NA,
-5L))
这可能是一条评论,但我对规范化/非规范化函数有一个建议,并将其作为答案发布。
问题的解决方案是在使用结果之前将
"ts"
对象强制转换为数字。rbind
数据标准化for (variable in variables) {
for (method in normalization_methods) {
# [rest of code omitted]
# Saving results
result <- data.frame(variable = variable, actual = actual, predicted = denormalized_forecast)
# include this line
result$predicted <- as.numeric(result$predicted)
#
prediction_results <- rbind(prediction_results, result)
}
}
现在调用对数据进行非规范化
# New function for data normalization
normalize_data <- function(data, method) {
normalized_data <- NULL
params <- NULL
if (method == "none") {
normalized_data <- data
} else if (method == "min-max") {
min_val <- min(data)
max_val <- max(data)
normalized_data <- (data - min_val) / (max_val - min_val)
params <- c(min = min_val, max = max_val)
} else if (method == "z-score") {
mean_val <- mean(data)
std_val <- sd(data)
normalized_data <- (data - mean_val) / std_val
params <- c(mean = mean_val, sd = std_val)
} else if (method == "cubic-root") {
normalized_data <- sign(data) * abs(data)^(1/3)
} else if (method == "logarithmic") {
normalized_data <- log1p(data)
}
attr(normalized_data, "method") <- method
if(!is.null(params))
attr(normalized_data, "params") <- params
normalized_data
}
# New denormalization function
denormalize_data <- function(data, attrib) {
method <- attrib$method
denormalized_data <- NULL
if (method == "none") {
denormalized_data <- data
} else if (method == "min-max") {
min_val <- attrib$params["min"]
max_val <- attrib$params["max"]
denormalized_data <- data * (max_val - min_val) + min_val
} else if (method == "z-score") {
mean_val <- attrib$params["mean"]
std_val <- attrib$params["sd"]
denormalized_data <- data * std_val + mean_val
} else if (method == "cubic-root") {
denormalized_data <- sign(data) * abs(data)^3
} else if (method == "logarithmic") {
denormalized_data <- expm1(data)
}
denormalized_data
}