'Problem with `mutate()` input `index_date` when performing time series xgboost in R
here my reproducible example
mydat=structure(list(supplier = c("TKP", "TKP", "TKP", "TKP", "TKP",
"TKP", "TKP", "TKP", "TKP", "TKP", "TKP", "TKP", "TKP", "TKP",
"TKP", "TKP", "TKP", "TKP", "TKP", "TKP", "TKP", "TKP", "TKP",
"TKP", "TKP", "TKP", "TKP", "TKP", "TKP", "TKP", "TKP", "TKP",
"TKP", "TKP", "TKP", "TKP", "TKP", "TKP", "TKP", "TKP", "TKP",
"TKP", "TKP", "TKP", "TKP", "TKP", "TKP", "TKP", "TKP"), date = c("8/31/2020",
"8/31/2020", "8/31/2020", "8/31/2020", "8/31/2020", "8/31/2020",
"8/31/2020", "8/31/2020", "8/31/2020", "8/31/2020", "8/31/2020",
"8/31/2020", "8/31/2020", "8/31/2020", "8/31/2020", "8/31/2020",
"8/31/2020", "8/31/2020", "8/31/2020", "8/31/2020", "8/31/2020",
"8/31/2020", "8/31/2020", "8/31/2020", "09.01.2020", "09.01.2020",
"09.01.2020", "09.01.2020", "09.01.2020", "09.01.2020", "09.01.2020",
"09.01.2020", "09.01.2020", "09.01.2020", "09.01.2020", "09.01.2020",
"09.01.2020", "09.01.2020", "09.01.2020", "09.01.2020", "09.01.2020",
"09.01.2020", "09.01.2020", "09.01.2020", "09.01.2020", "09.01.2020",
"09.01.2020", "09.01.2020", "09.02.2020"), hour = c(0L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L,
17L, 18L, 19L, 20L, 21L, 22L, 23L, 0L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L,
20L, 21L, 22L, 23L, 0L), weekday = c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L), base_price = c(26388L,
4754L, 0L, 17184L, 11685L, 0L, 0L, 3834L, 14087L, 0L, 21266L,
36267L, 20964L, 13001L, 69088L, 126908L, 190456L, 36736L, 71203L,
36919L, 42835L, 24273L, 30320L, 49526L, 25857L, 0L, 11614L, 0L,
0L, 0L, 0L, 53307L, 89831L, 35634L, 61375L, 16868L, 84491L, 79636L,
30288L, 150268L, 13804L, 70390L, 14510L, 15590L, 49066L, 89043L,
29668L, 32762L, 13740L)), class = "data.frame", row.names = c(NA,
-49L))
I want perform forecast using Xgboost method. I need for each supplier
create forecast by base_price
variable on one week ahead, with taking account weekday as an auxiliary variable.
this is how I tried to solve my problem
The time format mm-dd-yyyy
library(xgboost)
library(dplyr)
library(lubridate)
extended_data_mod <- mydat %>%
dplyr::mutate(.,
index_date = as.Date(paste0(lubridate::year(date), "-", lubridate::month(date), "-01")),
months = lubridate::month(index_date),
years = lubridate::year(index_date))
mydat <- extended_data_mod[1:length(ts), ] # initial data
pred <- extended_data_mod[(length(ts) + 1):nrow(extended_data), ] # extended time index
x_train <- xgboost::xgb.DMatrix(as.matrix(mydat %>%
dplyr::select(supplier, date, hour, weekday,
base_price)))
x_pred <- xgboost::xgb.DMatrix(as.matrix(pred %>%
dplyr::select(supplier, date, hour, weekday,
base_price)))
y_train <- mydat$base_price
#learn the model
xgb_trcontrol <- caret::trainControl(
method = "cv",
number = 5,
allowParallel = TRUE,
verboseIter = FALSE,
returnData = FALSE
)
xgb_grid <- base::expand.grid(
list(
nrounds = c(100, 200),
max_depth = c(10, 15, 20), # maximum depth of a tree
colsample_bytree = seq(0.5), # subsample ratio of columns when construction each tree
eta = 0.1, # learning rate
gamma = 0, # minimum loss reduction
min_child_weight = 1, # minimum sum of instance weight (hessian) needed ina child
subsample = 1 # subsample ratio of the training instances
))
xgb_model <- caret::train(
x_train, y_train,
trControl = xgb_trcontrol,
tuneGrid = xgb_grid,
method = "xgbTree",
nthread = 1
)
xgb_model$bestTune
But when i try run my script i get some errors.
Error: Problem with `mutate ()` input ʻindex_date`.
x character string is not in a standard unambiguous format
i Input ʻindex_date` is ʻas.Date (...) `.
Run `rlang :: last_error ()` to see where the error occurred.
> mydat <- extended_data_mod [1: length (mydat),] # initial data
Error: Object 'extended_data_mod' not found
>
> pred <- extended_data_mod [(length (mydat) + 1): nrow (extended_data),] # extended time index
Error: Object 'extended_data_mod' not found
>
> x_train <- xgboost :: xgb.DMatrix (as.matrix (mydat%>%
+ dplyr :: select (supplier, date, hour, weekday,
+ base_price)))
Error: Can't subset columns that don't exist.
x Column `supplier` doesn't exist.
Run `rlang :: last_error ()` to see where the error occurred.
> x_pred <- xgboost :: xgb.DMatrix (as.matrix (pred%>%
+ dplyr :: select (supplier, date, hour, weekday,
+ base_price)))
Error in eval (lhs, parent, parent): object 'pred' not found
how can i fix these errors to get desired output , for example
supplier date hour weekday base_price
1 TKP 09.07.2020 1 2 1000
2 TKP 09.07.2020 2 2 2000
3 TKP 09.07.2020 3 2 3000
4 TKP 09.07.2020 4 2 4000
5 TKP 09.07.2020 5 2 5000
6 TKP 09.07.2020 6 2 6000
7 TKP 09.07.2020 7 2 7000
8 TKP 09.07.2020 8 2 8000
9 TKP 09.07.2020 9 2 9000
10 TKP 09.07.2020 10 2 10000
11 TKP 09.07.2020 11 2 11000
12 TKP 09.07.2020 12 2 12000
Solution 1:[1]
You don't have data of date class so your first line lubridate::year(date)
fails. Turn the data to date class using lubridate::mdy
then you can get year, month, day from the data.
library(dplyr)
extended_data_mod <- mydat %>%
mutate(index_date = lubridate::mdy(date),
months = lubridate::month(index_date),
years = lubridate::year(index_date))
Solution 2:[2]
In base R
we can use
mydat$index_date = as.Date(mydat$date, "%m.%d.%Y"))
mydat$months <- format(mydat$index_date, "%m")
mydat$years <- format(mydat$index_date, "%Y")
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
Solution | Source |
---|---|
Solution 1 | Ronak Shah |
Solution 2 | akrun |