R-package/R/xgb.model.dt.tree.R

#' Convert tree model dump to data.table
#' 
#' Read a tree model text dump and return a data.table.
#' 
#' @importFrom data.table data.table
#' @importFrom data.table set
#' @importFrom data.table rbindlist
#' @importFrom data.table copy
#' @importFrom data.table :=
#' @importFrom magrittr %>%
#' @importFrom magrittr not
#' @importFrom magrittr add
#' @importFrom stringr str_extract
#' @importFrom stringr str_split
#' @importFrom stringr str_extract
#' @importFrom stringr str_trim
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
#' @param model dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.
#' @param text dump generated by the \code{xgb.dump} function. Avoid the creation of a dump file. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
#' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.
#'
#' @return A \code{data.table} of the features used in the model with their gain, cover and few other thing.
#'
#' @details 
#' General function to convert a text dump of tree model to a Matrix. The purpose is to help user to explore the model and get a better understanding of it.
#' 
#' The content of the \code{data.table} is organised that way:
#' 
#' \itemize{
#' \item \code{ID}: unique identifier of a node ;
#'  \item \code{Feature}: feature used in the tree to operate a split. When Leaf is indicated, it is the end of a branch ;
#'  \item \code{Split}: value of the chosen feature where is operated the split ;
#'  \item \code{Yes}: ID of the feature for the next node in the branch when the split condition is met ;
#'  \item \code{No}: ID of the feature for the next node in the branch when the split condition is not met ;
#'  \item \code{Missing}: ID of the feature for the next node in the branch for observation where the feature used for the split are not provided ;
#'  \item \code{Quality}: it's the gain related to the split in this specific node ;
#'  \item \code{Cover}: metric to measure the number of observation affected by the split ;
#'  \item \code{Tree}: ID of the tree. It is included in the main ID ;
#'  \item \code{Yes.X} or \code{No.X}: data related to the pointer in \code{Yes} or \code{No} column ;
#' } 
#'   
#' @examples
#' data(agaricus.train, package='xgboost')
#' 
#' #Both dataset are list with two items, a sparse matrix and labels 
#' #(labels = outcome column which will be learned). 
#' #Each column of the sparse Matrix is a feature in one hot encoding format.
#' train <- agaricus.train
#' 
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2, 
#'                eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
#' 
#' #agaricus.test$data@@Dimnames[[2]] represents the column names of the sparse matrix.
#' xgb.model.dt.tree(agaricus.train$data@@Dimnames[[2]], model = bst)
#' 
#' @export
xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, text = NULL, n_first_tree = NULL){
  
  if (!class(feature_names) %in% c("character", "NULL")) {     
    stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.")
  }
  if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) {
    stop("filename_dump: Has to be a character vector of size 1 representing the path to the model dump file.")
  } else if (!is.null(filename_dump) && !file.exists(filename_dump)) {
    stop("filename_dump: path to the model doesn't exist.")
  } else if(is.null(filename_dump) && is.null(model) && is.null(text)){
    stop("filename_dump & model & text: no path to dump model, no model, no text dump, have been provided.")
  }
  
  if (!class(model) %in% c("xgb.Booster", "NULL")) {
    stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
  }
  
  if (!class(text) %in% c("character", "NULL")) { 
    stop("text: Has to be a vector of character or NULL if a path to the model dump has already been provided.")
  }
  
  if (!class(n_first_tree) %in% c("numeric", "NULL") | length(n_first_tree) > 1) {
    stop("n_first_tree: Has to be a numeric vector of size 1.")
  }
  
  if(!is.null(model)){
    text = xgb.dump(model = model, with.stats = T)
  } else if(!is.null(filename_dump)){
    text <- readLines(filename_dump) %>% str_trim(side = "both")  
  }
  
  position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text)+1)
  
  extract <- function(x, pattern)  str_extract(x, pattern) %>% str_split("=") %>% lapply(function(x) x[2] %>% as.numeric) %>% unlist
  
  n_round <- min(length(position) - 1, n_first_tree)
  
  addTreeId <- function(x, i) paste(i,x,sep = "-")
  
  allTrees <- data.table()
 
  anynumber_regex<-"[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?" 
  for(i in 1:n_round){
    
    tree <- text[(position[i]+1):(position[i+1]-1)]
    
    # avoid tree made of a leaf only (no split)
    if(length(tree) <2) next
    
    treeID <- i-1
    
    notLeaf <- str_match(tree, "leaf") %>% is.na
    leaf <- notLeaf %>% not %>% tree[.]
    branch <- notLeaf %>% tree[.]
    idBranch <- str_extract(branch, "\\d*:") %>% str_replace(":", "") %>% addTreeId(treeID)
    idLeaf <- str_extract(leaf, "\\d*:") %>% str_replace(":", "") %>% addTreeId(treeID)
    featureBranch <- str_extract(branch, "f\\d*<") %>% str_replace("<", "") %>% str_replace("f", "") %>% as.numeric 
    if(!is.null(feature_names)){
      featureBranch <- feature_names[featureBranch + 1]
    }
    featureLeaf <- rep("Leaf", length(leaf))
    splitBranch <- str_extract(branch, paste0("<",anynumber_regex,"\\]")) %>% str_replace("<", "") %>% str_replace("\\]", "") 
    splitLeaf <- rep(NA, length(leaf)) 
    yesBranch <- extract(branch, "yes=\\d*") %>% addTreeId(treeID)
    yesLeaf <- rep(NA, length(leaf)) 
    noBranch <- extract(branch, "no=\\d*") %>% addTreeId(treeID)
    noLeaf <- rep(NA, length(leaf))
    missingBranch <- extract(branch, "missing=\\d+") %>% addTreeId(treeID)
    missingLeaf <- rep(NA, length(leaf))
    qualityBranch <- extract(branch, paste0("gain=",anynumber_regex))
    qualityLeaf <- extract(leaf, paste0("leaf=",anynumber_regex))
    coverBranch <- extract(branch, "cover=\\d*\\.*\\d*")
    coverLeaf <- extract(leaf, "cover=\\d*\\.*\\d*")
    dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree:=treeID]
    
    allTrees <- rbindlist(list(allTrees, dt), use.names = T, fill = F)
  }
  
  yes <- allTrees[!is.na(Yes), Yes]
  
  set(allTrees, i = which(allTrees[, Feature] != "Leaf"), 
      j = "Yes.Feature", 
      value = allTrees[ID %in% yes, Feature])
  
  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
      j = "Yes.Cover", 
      value = allTrees[ID %in% yes, Cover])
  
  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
      j = "Yes.Quality", 
      value = allTrees[ID %in% yes, Quality])
  no <- allTrees[!is.na(No), No]
  
  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
      j = "No.Feature", 
      value = allTrees[ID %in% no, Feature])
  
  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
      j = "No.Cover", 
      value = allTrees[ID %in% no, Cover])
  
  set(allTrees, i = which(allTrees[, Feature] != "Leaf"), 
      j = "No.Quality", 
      value = allTrees[ID %in% no, Quality])
  
  allTrees
}

# Avoid error messages during CRAN check.
# The reason is that these variables are never declared
# They are mainly column names inferred by Data.table...
globalVariables(c("ID", "Tree", "Yes", ".", ".N", "Feature", "Cover", "Quality", "No", "Gain", "Frequence"))