This vignette demonstrates how to use the DALEX package to explain models created with the xgboost package.

1 Regression

In this example we use the wine dataset from the breakDown package. Wine quality will be predicted based on the other features.

library("breakDown")
head(wine)
#>   fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
#> 1           7.0             0.27        0.36           20.7     0.045
#> 2           6.3             0.30        0.34            1.6     0.049
#> 3           8.1             0.28        0.40            6.9     0.050
#> 4           7.2             0.23        0.32            8.5     0.058
#> 5           7.2             0.23        0.32            8.5     0.058
#> 6           8.1             0.28        0.40            6.9     0.050
#>   free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
#> 1                  45                  170  1.0010 3.00      0.45     8.8
#> 2                  14                  132  0.9940 3.30      0.49     9.5
#> 3                  30                   97  0.9951 3.26      0.44    10.1
#> 4                  47                  186  0.9956 3.19      0.40     9.9
#> 5                  47                  186  0.9956 3.19      0.40     9.9
#> 6                  30                   97  0.9951 3.26      0.44    10.1
#>   quality
#> 1       6
#> 2       6
#> 3       6
#> 4       6
#> 5       6
#> 6       6

1.1 Model building

Let’s build a model. First we need to prepare the data as an xgb.DMatrix.

library("xgboost")

# one-hot encode the predictors (no intercept) and build an xgb.DMatrix
model_matrix_train <- model.matrix(quality ~ . - 1, wine)
data_train <- xgb.DMatrix(model_matrix_train, label = wine$quality)
param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2,
              objective = "reg:linear")  # deprecated alias of "reg:squarederror" in newer xgboost

wine_xgb_model <- xgb.train(param, data_train, nrounds = 50)
wine_xgb_model
#> ##### xgb.Booster
#> raw: 20.1 Kb 
#> call:
#>   xgb.train(params = param, data = data_train, nrounds = 50)
#> params (as set within xgb.train):
#>   max_depth = "2", eta = "1", silent = "1", nthread = "2", objective = "reg:linear", silent = "1"
#> xgb.attributes:
#>   niter
#> callbacks:
#>   cb.print.evaluation(period = print_every_n)
#> # of features: 11 
#> niter: 50
#> nfeatures : 11
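
As a side note, xgboost's own gain-based variable importance is available via xgb.importance(); comparing it later with the model-agnostic importance computed by DALEX in section 1.5 can be instructive. A minimal sketch:

imp_xgb <- xgb.importance(model = wine_xgb_model)
head(imp_xgb)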

1.2 Explainer

Now we can create an explainer.

library("DALEX")

explainer_xgb <- explain(wine_xgb_model, 
                         data = model_matrix_train, 
                         y = wine$quality, 
                         label = "xgboost",
                         colorize = FALSE)
#> Preparation of a new explainer is initiated
#>   -> model label       :  xgboost 
#>   -> data              :  4898  rows  11  cols 
#>   -> target variable   :  4898  values 
#>   -> predict function  :  yhat.default will be used (  default  )
#>   -> predicted values  :  numerical, min =  2.869188 , mean =  5.878132 , max =  8.078749  
#>   -> model_info        :  package Model of class: xgb.Booster package unrecognized , ver. Unknown , task regression (  default  ) 
#>   -> residual function :  difference between y and yhat (  default  )
#>   -> residuals         :  numerical, min =  -3.251447 , mean =  -0.0002230403 , max =  3.005342  
#>   A new explainer has been created!
explainer_xgb
#> Model label:  xgboost 
#> Model class:  xgb.Booster 
#> Data head  :
#>   fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
#> 1           7.0             0.27        0.36           20.7     0.045
#> 2           6.3             0.30        0.34            1.6     0.049
#>   free.sulfur.dioxide total.sulfur.dioxide density  pH sulphates alcohol
#> 1                  45                  170   1.001 3.0      0.45     8.8
#> 2                  14                  132   0.994 3.3      0.49     9.5
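
The explainer exposes a uniform predict() method, so a quick sanity check is to compare it with raw xgboost predictions on a few rows; the two calls below should return identical values:

predict(explainer_xgb, model_matrix_train[1:5, ])
predict(wine_xgb_model, model_matrix_train[1:5, ])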

1.3 Single variable

For a continuous variable:

sv_xgb_alcohol <- model_profile(explainer_xgb, 
                                variables = "alcohol", 
                                type = "partial")

plot(sv_xgb_alcohol)
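
Partial dependence is only one of the profile types implemented in model_profile(); when features are correlated, accumulated local effects (ALE) profiles are often more reliable. A minimal sketch:

sv_xgb_alcohol_ale <- model_profile(explainer_xgb, 
                                    variables = "alcohol", 
                                    type = "accumulated")

plot(sv_xgb_alcohol_ale)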

1.4 Single prediction

nobs <- model_matrix_train[1, , drop = FALSE]
sp_xgb  <- predict_parts(explainer_xgb, 
                         new_observation = nobs,
                         type = "break_down")
head(sp_xgb)
#>                                   contribution
#> xgboost: intercept                       5.878
#> xgboost: residual.sugar = 20.7           0.332
#> xgboost: alcohol = 8.8                  -0.045
#> xgboost: density = 1.001                -0.429
#> xgboost: volatile.acidity = 0.27        -0.297
#> xgboost: free.sulfur.dioxide = 45       -0.040

plot(sp_xgb)
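
Break-down attributions depend on the order in which variables are considered; predict_parts() also implements SHAP values, which average contributions over many variable orderings. A minimal sketch:

sp_xgb_shap <- predict_parts(explainer_xgb, 
                             new_observation = nobs,
                             type = "shap")

plot(sp_xgb_shap)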

1.5 Variable importance

vd_xgb <- model_parts(explainer_xgb, type = "raw")
head(vd_xgb)
#>               variable mean_dropout_loss   label
#> 1         _full_model_         0.6295067 xgboost
#> 2        fixed.acidity         0.6391484 xgboost
#> 3            sulphates         0.6471640 xgboost
#> 4          citric.acid         0.6538835 xgboost
#> 5 total.sulfur.dioxide         0.6552513 xgboost
#> 6            chlorides         0.6691735 xgboost

plot(vd_xgb)
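
For a regression explainer model_parts() defaults to root-mean-square loss; the loss can also be set explicitly, and type = "ratio" reports each drop-out loss relative to the full model. A minimal sketch:

vd_xgb_ratio <- model_parts(explainer_xgb, 
                            type = "ratio",
                            loss_function = loss_root_mean_square)

plot(vd_xgb_ratio)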

2 Classification

In this example we use the HR_data dataset from the breakDown package. The model will predict the probability that an employee leaves the company.

library("breakDown")
head(HR_data)
#>   satisfaction_level last_evaluation number_project average_montly_hours
#> 1               0.38            0.53              2                  157
#> 2               0.80            0.86              5                  262
#> 3               0.11            0.88              7                  272
#> 4               0.72            0.87              5                  223
#> 5               0.37            0.52              2                  159
#> 6               0.41            0.50              2                  153
#>   time_spend_company Work_accident left promotion_last_5years sales salary
#> 1                  3             0    1                     0 sales    low
#> 2                  6             0    1                     0 sales medium
#> 3                  4             0    1                     0 sales medium
#> 4                  5             0    1                     0 sales    low
#> 5                  3             0    1                     0 sales    low
#> 6                  3             0    1                     0 sales    low

2.1 Model building

Let’s build a model. First we need to prepare the data as an xgb.DMatrix.

library("xgboost")

model_matrix_train <- model.matrix(left ~ . - 1, HR_data)
data_train <- xgb.DMatrix(model_matrix_train, label = HR_data$left)
param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2,
              objective = "binary:logistic", eval_metric = "auc")

HR_xgb_model <- xgb.train(param, data_train, nrounds = 50)
HR_xgb_model
#> ##### xgb.Booster
#> raw: 19.5 Kb 
#> call:
#>   xgb.train(params = param, data = data_train, nrounds = 50)
#> params (as set within xgb.train):
#>   max_depth = "2", eta = "1", silent = "1", nthread = "2", objective = "binary:logistic", eval_metric = "auc", silent = "1"
#> xgb.attributes:
#>   niter
#> callbacks:
#>   cb.print.evaluation(period = print_every_n)
#> # of features: 19 
#> niter: 50
#> nfeatures : 19

2.2 Explainer

Now we can create an explainer.

library("DALEX")
# despite the names, both helpers compute the inverse logit (sigmoid)
predict_logit <- function(model, x) {
  raw_x <- predict(model, x)
  exp(raw_x) / (1 + exp(raw_x))
}
logit <- function(x) exp(x) / (1 + exp(x))
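
Note that with the binary:logistic objective, predict() on an xgb.Booster already returns probabilities, so predict_logit() applies the sigmoid a second time; this is why the predicted values reported below fall into the [0.5, 0.73] range. A minimal sketch of a predict function that keeps the raw probabilities (the name predict_proba is illustrative):

predict_proba <- function(model, x) predict(model, x)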

explainer_xgb <- explain(HR_xgb_model, 
                         data = model_matrix_train, 
                         y = HR_data$left, 
                         predict_function = predict_logit,
                         link = logit,
                         label = "xgboost",
                         colorize = FALSE)
#> Preparation of a new explainer is initiated
#>   -> model label       :  xgboost 
#>   -> data              :  14999  rows  19  cols 
#>   -> target variable   :  14999  values 
#>   -> predict function  :  predict_logit 
#>   -> predicted values  :  numerical, min =  0.5 , mean =  0.5555972 , max =  0.7310584  
#>   -> model_info        :  package Model of class: xgb.Booster package unrecognized , ver. Unknown , task regression (  default  ) 
#>   -> residual function :  difference between y and yhat (  default  )
#>   -> residuals         :  numerical, min =  -0.7296657 , mean =  -0.3175147 , max =  0.4997965  
#>   A new explainer has been created!
explainer_xgb
#> Model label:  xgboost 
#> Model class:  xgb.Booster 
#> Data head  :
#>   satisfaction_level last_evaluation number_project average_montly_hours
#> 1               0.38            0.53              2                  157
#> 2               0.80            0.86              5                  262
#>   time_spend_company Work_accident promotion_last_5years salesaccounting
#> 1                  3             0                     0               0
#> 2                  6             0                     0               0
#>   saleshr salesIT salesmanagement salesmarketing salesproduct_mng salesRandD
#> 1       0       0               0              0                0          0
#> 2       0       0               0              0                0          0
#>   salessales salessupport salestechnical salarylow salarymedium
#> 1          1            0              0         1            0
#> 2          1            0              0         0            1
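
Overall fit can be summarized with model_performance(), which computes residuals on the data supplied to the explainer. A minimal sketch:

mp_xgb <- model_performance(explainer_xgb)
mp_xgb

plot(mp_xgb)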

2.3 Single variable

For a continuous variable:

sv_xgb_satisfaction_level <- model_profile(explainer_xgb, 
                                           variables = "satisfaction_level",
                                           type = "partial")

plot(sv_xgb_satisfaction_level)
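
model_profile() also accepts several variables in one call, which makes side-by-side comparison easy. A minimal sketch:

sv_xgb_two <- model_profile(explainer_xgb, 
                            variables = c("satisfaction_level", "last_evaluation"),
                            type = "partial")

plot(sv_xgb_two)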

2.4 Single prediction

nobs <- model_matrix_train[1, , drop = FALSE]
sp_xgb  <- predict_parts(explainer_xgb, 
                         new_observation = nobs,
                         type = "break_down")
head(sp_xgb)
#>                                     contribution
#> xgboost: intercept                         0.556
#> xgboost: time_spend_company = 3           -0.013
#> xgboost: satisfaction_level = 0.38         0.012
#> xgboost: last_evaluation = 0.53            0.020
#> xgboost: average_montly_hours = 157        0.061
#> xgboost: salarylow = 1                     0.019

plot(sp_xgb)
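
Boosted trees often contain interactions; predict_parts() can account for pairwise interactions in the attribution with the break-down-with-interactions method. A minimal sketch:

sp_xgb_int <- predict_parts(explainer_xgb, 
                            new_observation = nobs,
                            type = "break_down_interactions")

plot(sp_xgb_int)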

2.5 Variable importance

vd_xgb <- model_parts(explainer_xgb, type = "raw")
head(vd_xgb)
#>          variable mean_dropout_loss   label
#> 1    _full_model_         0.4641699 xgboost
#> 2    salarymedium         0.4640913 xgboost
#> 3 salesaccounting         0.4641699 xgboost
#> 4         salesIT         0.4641699 xgboost
#> 5 salesmanagement         0.4641699 xgboost
#> 6  salesmarketing         0.4641699 xgboost

plot(vd_xgb)
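
Because this explainer was auto-detected as a regression task, the default loss is RMSE-based; for a binary target, a 1-AUC loss is often more informative. A minimal sketch using DALEX's loss_one_minus_auc:

vd_xgb_auc <- model_parts(explainer_xgb, 
                          type = "raw",
                          loss_function = loss_one_minus_auc)

plot(vd_xgb_auc)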

3 Session info

sessionInfo()
#> R version 3.6.3 (2020-02-29)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 18363)
#> 
#> Matrix products: default
#> 
#> locale:
#> [1] LC_COLLATE=Polish_Poland.1250  LC_CTYPE=Polish_Poland.1250   
#> [3] LC_MONETARY=Polish_Poland.1250 LC_NUMERIC=C                  
#> [5] LC_TIME=Polish_Poland.1250    
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] DALEX_2.0.1     xgboost_1.0.0.2 breakDown_0.2.0
#> 
#> loaded via a namespace (and not attached):
#>  [1] Rcpp_1.0.4        pillar_1.4.3      compiler_3.6.3    ingredients_2.0  
#>  [5] tools_3.6.3       digest_0.6.25     evaluate_0.14     lifecycle_0.2.0  
#>  [9] tibble_2.1.3      gtable_0.3.0      lattice_0.20-38   pkgconfig_2.0.3  
#> [13] rlang_0.4.6       Matrix_1.2-18     yaml_2.2.1        xfun_0.12        
#> [17] stringr_1.4.0     dplyr_1.0.0       knitr_1.28        generics_0.0.2   
#> [21] vctrs_0.3.1       grid_3.6.3        tidyselect_1.1.0  glue_1.3.2       
#> [25] data.table_1.12.8 R6_2.4.1          iBreakDown_1.3.1  rmarkdown_2.1    
#> [29] farver_2.0.3      ggplot2_3.3.0     purrr_0.3.3       magrittr_1.5     
#> [33] scales_1.1.0      htmltools_0.4.0   colorspace_1.4-1  labeling_0.3     
#> [37] stringi_1.4.6     munsell_0.5.0     crayon_1.3.4