'Problem with `mutate()` input `index_date` when performing time series xgboost in R

here my reproducible example

  mydat=structure(list(supplier = c("TKP", "TKP", "TKP", "TKP", "TKP", 
    "TKP", "TKP", "TKP", "TKP", "TKP", "TKP", "TKP", "TKP", "TKP", 
    "TKP", "TKP", "TKP", "TKP", "TKP", "TKP", "TKP", "TKP", "TKP", 
    "TKP", "TKP", "TKP", "TKP", "TKP", "TKP", "TKP", "TKP", "TKP", 
    "TKP", "TKP", "TKP", "TKP", "TKP", "TKP", "TKP", "TKP", "TKP", 
    "TKP", "TKP", "TKP", "TKP", "TKP", "TKP", "TKP", "TKP"), date = c("8/31/2020", 
    "8/31/2020", "8/31/2020", "8/31/2020", "8/31/2020", "8/31/2020", 
    "8/31/2020", "8/31/2020", "8/31/2020", "8/31/2020", "8/31/2020", 
    "8/31/2020", "8/31/2020", "8/31/2020", "8/31/2020", "8/31/2020", 
    "8/31/2020", "8/31/2020", "8/31/2020", "8/31/2020", "8/31/2020", 
    "8/31/2020", "8/31/2020", "8/31/2020", "09.01.2020", "09.01.2020", 
    "09.01.2020", "09.01.2020", "09.01.2020", "09.01.2020", "09.01.2020", 
    "09.01.2020", "09.01.2020", "09.01.2020", "09.01.2020", "09.01.2020", 
    "09.01.2020", "09.01.2020", "09.01.2020", "09.01.2020", "09.01.2020", 
    "09.01.2020", "09.01.2020", "09.01.2020", "09.01.2020", "09.01.2020", 
    "09.01.2020", "09.01.2020", "09.02.2020"), hour = c(0L, 1L, 2L, 
    3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 
    17L, 18L, 19L, 20L, 21L, 22L, 23L, 0L, 1L, 2L, 3L, 4L, 5L, 6L, 
    7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 
    20L, 21L, 22L, 23L, 0L), weekday = c(1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L), base_price = c(26388L, 
    4754L, 0L, 17184L, 11685L, 0L, 0L, 3834L, 14087L, 0L, 21266L, 
    36267L, 20964L, 13001L, 69088L, 126908L, 190456L, 36736L, 71203L, 
    36919L, 42835L, 24273L, 30320L, 49526L, 25857L, 0L, 11614L, 0L, 
    0L, 0L, 0L, 53307L, 89831L, 35634L, 61375L, 16868L, 84491L, 79636L, 
    30288L, 150268L, 13804L, 70390L, 14510L, 15590L, 49066L, 89043L, 
    29668L, 32762L, 13740L)), class = "data.frame", row.names = c(NA, 
    -49L))

I want perform forecast using Xgboost method. I need for each supplier create forecast by base_price variable on one week ahead, with taking account weekday as an auxiliary variable. this is how I tried to solve my problem The time format mm-dd-yyyy

library(xgboost)
library(dplyr)
library(lubridate)

extended_data_mod <- mydat %>%
  dplyr::mutate(., 
                index_date = as.Date(paste0(lubridate::year(date), "-", lubridate::month(date), "-01")),
                months = lubridate::month(index_date),
                years = lubridate::year(index_date))
mydat <- extended_data_mod[1:length(ts), ] # initial data

pred <- extended_data_mod[(length(ts) + 1):nrow(extended_data), ] # extended time index

x_train <- xgboost::xgb.DMatrix(as.matrix(mydat %>%
                                            dplyr::select(supplier, date,   hour,   weekday,    
                                                          base_price)))
x_pred <- xgboost::xgb.DMatrix(as.matrix(pred %>% 
                                           dplyr::select(supplier,  date,   hour,   weekday,    
                                                         base_price)))

y_train <- mydat$base_price
#learn the model
xgb_trcontrol <- caret::trainControl(
   method = "cv", 
   number = 5,
   allowParallel = TRUE, 
   verboseIter = FALSE, 
   returnData = FALSE
)

xgb_grid <- base::expand.grid(
   list(
    nrounds = c(100, 200),
    max_depth = c(10, 15, 20), # maximum depth of a tree
    colsample_bytree = seq(0.5), # subsample ratio of columns when construction each tree
    eta = 0.1, # learning rate
    gamma = 0, # minimum loss reduction
    min_child_weight = 1,  # minimum sum of instance weight (hessian) needed ina child
    subsample = 1 # subsample ratio of the training instances
))

xgb_model <- caret::train(
   x_train, y_train,
   trControl = xgb_trcontrol,
   tuneGrid = xgb_grid,
   method = "xgbTree",
   nthread = 1
)


xgb_model$bestTune

But when i try run my script i get some errors.

Error: Problem with `mutate ()` input ʻindex_date`.
x character string is not in a standard unambiguous format
i Input ʻindex_date` is ʻas.Date (...) `.
Run `rlang :: last_error ()` to see where the error occurred.
> mydat <- extended_data_mod [1: length (mydat),] # initial data
Error: Object 'extended_data_mod' not found
>
> pred <- extended_data_mod [(length (mydat) + 1): nrow (extended_data),] # extended time index
Error: Object 'extended_data_mod' not found
>
> x_train <- xgboost :: xgb.DMatrix (as.matrix (mydat%>%
+ dplyr :: select (supplier, date, hour, weekday,
+ base_price)))
Error: Can't subset columns that don't exist.
x Column `supplier` doesn't exist.
Run `rlang :: last_error ()` to see where the error occurred.
> x_pred <- xgboost :: xgb.DMatrix (as.matrix (pred%>%
+ dplyr :: select (supplier, date, hour, weekday,
+ base_price)))
Error in eval (lhs, parent, parent): object 'pred' not found

how can i fix these errors to get desired output , for example

   supplier       date hour weekday base_price
1       TKP 09.07.2020    1       2       1000
2       TKP 09.07.2020    2       2       2000
3       TKP 09.07.2020    3       2       3000
4       TKP 09.07.2020    4       2       4000
5       TKP 09.07.2020    5       2       5000
6       TKP 09.07.2020    6       2       6000
7       TKP 09.07.2020    7       2       7000
8       TKP 09.07.2020    8       2       8000
9       TKP 09.07.2020    9       2       9000
10      TKP 09.07.2020   10       2      10000
11      TKP 09.07.2020   11       2      11000
12      TKP 09.07.2020   12       2      12000


Solution 1:[1]

You don't have data of date class so your first line lubridate::year(date) fails. Turn the data to date class using lubridate::mdy then you can get year, month, day from the data.

library(dplyr)

extended_data_mod <- mydat %>%
         mutate(index_date = lubridate::mdy(date),
                months = lubridate::month(index_date),
                years = lubridate::year(index_date))

Solution 2:[2]

In base R we can use

mydat$index_date = as.Date(mydat$date, "%m.%d.%Y"))
mydat$months <- format(mydat$index_date, "%m")
mydat$years <- format(mydat$index_date, "%Y")

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1 Ronak Shah
Solution 2 akrun