1 change: 1 addition & 0 deletions DESCRIPTION
@@ -30,6 +30,7 @@ Suggests:
gbm,
jsonlite,
lightgbm,
gpboost,
randomForest,
ranger,
scales,
2 changes: 2 additions & 0 deletions NAMESPACE
@@ -10,6 +10,7 @@ S3method(treeshap,model_unified_multioutput)
S3method(unify,default)
S3method(unify,gbm)
S3method(unify,lgb.Booster)
S3method(unify,gpb.Booster)
S3method(unify,randomForest)
S3method(unify,ranger)
S3method(unify,xgb.Booster)
@@ -19,6 +20,7 @@ export(gbm.unify)
export(is.model_unified)
export(is.treeshap)
export(lightgbm.unify)
export(gpboost.unify)
export(plot_contribution)
export(plot_feature_dependence)
export(plot_feature_importance)
1 change: 1 addition & 0 deletions R/treeshap.R
@@ -20,6 +20,7 @@
#' @seealso
#' \code{\link{xgboost.unify}} for \code{XGBoost models}
#' \code{\link{lightgbm.unify}} for \code{LightGBM models}
#' \code{\link{gpboost.unify}} for \code{GPBoost models}
#' \code{\link{gbm.unify}} for \code{GBM models}
#' \code{\link{randomForest.unify}} for \code{randomForest models}
#' \code{\link{ranger.unify}} for \code{ranger models}
7 changes: 7 additions & 0 deletions R/unify.R
@@ -13,6 +13,8 @@
#' @seealso
#' \code{\link{lightgbm.unify}} for \code{\link[lightgbm:lightgbm]{LightGBM models}}
#'
#' \code{\link{gpboost.unify}} for \code{\link[gpboost:gpboost]{GPBoost models}}
#'
#' \code{\link{gbm.unify}} for \code{\link[gbm:gbm]{GBM models}}
#'
#' \code{\link{xgboost.unify}} for \code{\link[xgboost:xgboost]{XGBoost models}}
@@ -54,6 +56,11 @@ unify.lgb.Booster <- function(model, data, recalculate = FALSE, ...){
lightgbm.unify(model, data, recalculate)
}

#' @export
unify.gpb.Booster <- function(model, data, recalculate = FALSE, ...){
gpboost.unify(model, data, recalculate)
}

#' @export
unify.randomForest <- function(model, data, ...){
randomForest.unify(model, data)
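With these methods registered, a fitted `gpb.Booster` reaches the new unifier through the generic `unify()` entry point via ordinary S3 dispatch. A minimal sketch, assuming `gpb_model` is a trained GPBoost booster and `train_matrix` is the feature matrix it was trained on (both names are illustrative):

library(treeshap)
# S3 dispatch on the class vector of gpb_model selects unify.gpb.Booster,
# which forwards to gpboost.unify() with the same arguments:
unified <- unify(gpb_model, train_matrix)
# equivalent to calling the format-specific unifier directly:
unified_direct <- gpboost.unify(gpb_model, train_matrix)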
114 changes: 114 additions & 0 deletions R/unify_gpboost.R
@@ -0,0 +1,114 @@
# operates on the tree table produced by gpboost::gpb.model.dt.tree
#' Unify GPBoost model
#'
#' Convert your GPBoost model into a standardized representation.
#' The returned representation is easy for the user to interpret and ready to use as an argument in the \code{treeshap()} function.
#'
#' @param gpb_model A GPBoost model; an object of class \code{gpb.Booster}
#' @param data Reference dataset. A \code{data.frame} or \code{matrix} with the same columns as in the training set of the model. Usually the dataset used to train the model.
#' @param recalculate Logical indicating whether covers should be recalculated according to the dataset given in \code{data}. Keep it \code{FALSE} if the training data are used.
#'
#' @return a unified model representation - a \code{\link{model_unified.object}} object
#'
#' @export
#'
#' @import data.table
#'
#' @seealso
#'
#' \code{\link{gbm.unify}} for \code{\link[gbm:gbm]{GBM models}}
#'
#' \code{\link{xgboost.unify}} for \code{\link[xgboost:xgboost]{XGBoost models}}
#'
#' \code{\link{lightgbm.unify}} for \code{\link[lightgbm:lightgbm]{LightGBM models}}
#'
#' \code{\link{ranger.unify}} for \code{\link[ranger:ranger]{ranger models}}
#'
#' \code{\link{randomForest.unify}} for \code{\link[randomForest:randomForest]{randomForest models}}
#'
#' @examples
#' \donttest{
#' library(gpboost)
#' param_gpb <- list(objective = "regression", max_depth = 2,
#' force_row_wise = TRUE, num_iterations = 20)
#' data_fifa <- fifa20$data[!colnames(fifa20$data) %in%
#' c('work_rate', 'value_eur', 'gk_diving', 'gk_handling',
#' 'gk_kicking', 'gk_reflexes', 'gk_speed', 'gk_positioning')]
#' data <- na.omit(cbind(data_fifa, fifa20$target))
#' sparse_data <- as.matrix(data[,-ncol(data)])
#' x <- gpboost::gpb.Dataset(sparse_data, label = as.matrix(data[,ncol(data)]))
#' gpb_data <- gpboost::gpb.Dataset.construct(x)
#' gpb_model <- gpboost::gpboost(data = gpb_data, params = param_gpb,
#' verbose = -1, num_threads = 0)
#' unified_model <- gpboost.unify(gpb_model, sparse_data)
#' shaps <- treeshap(unified_model, data[1:2, ])
#' plot_contribution(shaps, obs = 1)
#' }
gpboost.unify <- function(gpb_model, data, recalculate = FALSE) {
if (!requireNamespace("gpboost", quietly = TRUE)) {
stop("Package \"gpboost\" needed for this function to work. Please install it.",
call. = FALSE)
}
df <- gpboost::gpb.model.dt.tree(gpb_model)
stopifnot(c("split_index", "split_feature", "node_parent", "leaf_index", "leaf_parent", "internal_value",
"internal_count", "leaf_value", "leaf_count", "decision_type") %in% colnames(df))
df <- data.table::as.data.table(df)
#convert node_parent and leaf_parent into one parent column
df[is.na(df$node_parent), "node_parent"] <- df[is.na(df$node_parent), "leaf_parent"]
#convert values into one column...
df[is.na(df$internal_value), "internal_value"] <- df[!is.na(df$leaf_value), "leaf_value"]
#...and counts
df[is.na(df$internal_count), "internal_count"] <- df[!is.na(df$leaf_count), "leaf_count"]
df[["internal_count"]] <- as.numeric(df[["internal_count"]])
#convert split_index and leaf_index into one column:
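# (each leaf gets id max(split_index) of its tree + leaf_index + 1, keeping node ids unique within a tree)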
max_split_index <- df[, max(split_index, na.rm = TRUE), tree_index]
rep_max_split <- rep(max_split_index$V1, times = as.numeric(table(df$tree_index)))
new_leaf_index <- rep_max_split + df[, "leaf_index"] + 1
df[is.na(df$split_index), "split_index"] <- new_leaf_index[!is.na(new_leaf_index[["leaf_index"]]), 'leaf_index']
df[is.na(df$split_gain), "split_gain"] <- df[is.na(df$split_gain), "leaf_value"]
# On the basis of column 'Parent', create columns with childs: 'Yes', 'No' and 'Missing' like in the xgboost df:
ret.first <- function(x) x[1]
ret.second <- function(x) x[2]
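# the merge below pairs each parent with its two children; within every
# (tree_index, node_parent) group the left child is assumed to be listed first
# (as in the lgb.model.dt.tree analogue), so ret.first/ret.second pick the
# left and right branch respectively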
tmp <- data.table::merge.data.table(df[, .(node_parent, tree_index, split_index)], df[, .(tree_index, split_index, default_left, decision_type)],
by.x = c("tree_index", "node_parent"), by.y = c("tree_index", "split_index"))
y_n_m <- unique(tmp[, .(Yes = ifelse(decision_type %in% c("<=", "<"), ret.first(split_index),
ifelse(decision_type %in% c(">=", ">"), ret.second(split_index), stop("Unknown decision_type"))),
No = ifelse(decision_type %in% c(">=", ">"), ret.first(split_index),
ifelse(decision_type %in% c("<=", "<"), ret.second(split_index), stop("Unknown decision_type"))),
Missing = ifelse(default_left, ret.first(split_index), ret.second(split_index)),
decision_type = decision_type),
.(tree_index, node_parent)])
df <- data.table::merge.data.table(df[, c("tree_index", "depth", "split_index", "split_feature", "node_parent", "split_gain",
"threshold", "internal_value", "internal_count")],
y_n_m, by.x = c("tree_index", "split_index"),
by.y = c("tree_index", "node_parent"), all.x = TRUE)
df[decision_type == ">=", decision_type := "<"]
df[decision_type == ">", decision_type := "<="]
df$Decision.type <- factor(x = df$decision_type, levels = c("<=", "<"))
df[is.na(split_index), Decision.type := NA]
df <- df[, c("tree_index", "split_index", "split_feature", "Decision.type", "threshold", "Yes", "No", "Missing", "split_gain", "internal_count")]
colnames(df) <- c("Tree", "Node", "Feature", "Decision.type", "Split", "Yes", "No", "Missing", "Prediction", "Cover")
attr(df, "sorted") <- NULL

ID <- paste0(df$Node, "-", df$Tree)
df$Yes <- match(paste0(df$Yes, "-", df$Tree), ID)
df$No <- match(paste0(df$No, "-", df$Tree), ID)
df$Missing <- match(paste0(df$Missing, "-", df$Tree), ID)
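# Yes/No/Missing now hold global row indices of df rather than per-tree node
# ids, since the unified representation addresses children by row position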

# Here we lose "Quality" information
df$Prediction[!is.na(df$Feature)] <- NA

feature_names <- jsonlite::fromJSON(gpb_model$dump_model())$feature_names
data <- data[,colnames(data) %in% feature_names]

ret <- list(model = as.data.frame(df), data = as.data.frame(data), feature_names = feature_names)
class(ret) <- "model_unified"
attr(ret, "missing_support") <- TRUE
attr(ret, "model") <- "gpboost"

if (recalculate) {
ret <- set_reference_dataset(ret, as.data.frame(data))
}

return(ret)
}
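The `recalculate` branch above delegates to `set_reference_dataset()`, which can also be applied after unification. A brief sketch, reusing `unified_model` from the roxygen example above with an illustrative background sample `new_ref`:

# covers stored in the unified model describe the original reference data;
# to explain predictions against a different background sample, recompute them:
unified_new <- set_reference_dataset(unified_model, as.data.frame(new_ref))
shaps_new <- treeshap(unified_new, as.data.frame(new_ref)[1:5, ])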
2 changes: 1 addition & 1 deletion README.Rmd
@@ -22,7 +22,7 @@ set.seed(21)
[![CRAN status](https://www.r-pkg.org/badges/version/treeshap)](https://CRAN.R-project.org/package=treeshap)
<!-- badges: end -->

In the era of complicated classifiers conquering the market, sometimes even the authors of algorithms do not know the exact manner in which a tree ensemble model is built. The complexity of these models' structures is one of the reasons why most users treat them simply as black boxes. But how can they know whether the prediction made by the model is reasonable? `treeshap` is an efficient answer to this question. Because it implements an optimized algorithm for tree ensemble models (called TreeSHAP), it calculates the SHAP values in polynomial (instead of exponential) time. Currently, `treeshap` supports models produced with `xgboost`, `lightgbm`, `gbm`, `ranger`, and `randomForest` packages. Support for `catboost` is available only in the [`catboost` branch](https://github.com/ModelOriented/treeshap/tree/catboost) (see why [here](#catboost)).
In the era of complicated classifiers conquering the market, sometimes even the authors of algorithms do not know the exact manner in which a tree ensemble model is built. The complexity of these models' structures is one of the reasons why most users treat them simply as black boxes. But how can they know whether the prediction made by the model is reasonable? `treeshap` is an efficient answer to this question. Because it implements an optimized algorithm for tree ensemble models (called TreeSHAP), it calculates the SHAP values in polynomial (instead of exponential) time. Currently, `treeshap` supports models produced with `xgboost`, `lightgbm`, `gpboost`, `gbm`, `ranger`, and `randomForest` packages. Support for `catboost` is available only in the [`catboost` branch](https://github.com/ModelOriented/treeshap/tree/catboost) (see why [here](#catboost)).

## Installation

2 changes: 1 addition & 1 deletion README.md
@@ -19,7 +19,7 @@ can they know whether the prediction made by the model is reasonable?
an optimized algorithm for tree ensemble models (called TreeSHAP), it
calculates the SHAP values in polynomial (instead of exponential) time.
Currently, `treeshap` supports models produced with `xgboost`,
`lightgbm`, `gbm`, `ranger`, and `randomForest` packages. Support for
`lightgbm`, `gpboost`, `gbm`, `ranger`, and `randomForest` packages. Support for
`catboost` is available only in [`catboost`
branch](https://github.com/ModelOriented/treeshap/tree/catboost) (see
why [here](#catboost)).
52 changes: 52 additions & 0 deletions man/gpboost.unify.Rd

Some generated files are not rendered by default.

2 changes: 2 additions & 0 deletions man/unify.Rd

Some generated files are not rendered by default.
