# Source code for synthyverse.generators.nrgboost_generator.nrg_boost

from ..base import TabularBaseGenerator
import pandas as pd
from typing import Optional
from nrgboost import Dataset, NRGBooster



# [docs]
class NRGBoostGenerator(TabularBaseGenerator):
    """ENeRgy-based Generative Boosting (NRGBoost).

    Turns gradient-boosted decision trees into energy-based generative models.

    Uses the nrgboost pypi package implementation.

    Paper: "NRGBoost: Energy-Based Generative Boosted Trees" by J. Bravo (2024).

    Args:
        num_trees (int): Number of trees in the boosted ensemble. Default: 200.
        shrinkage (float): Shrinkage parameter for boosting. Default: 0.15.
        line_search (bool): Whether to use line search for step size optimization. Default: True.
        max_leaves (int): Maximum number of leaves per tree. Default: 256.
        max_ratio_in_leaf (float): Maximum ratio of data / model data per leaf. Default: 2.
        min_data_in_leaf (float): Minimum data points per leaf. Default: 0.
        initial_uniform_mixture (float): Mixture coefficient for the starting point of boosting: 0 means starting from the product of training marginals, 1 means starting from a uniform distribution. Default: 0.1.
        categorical_split_one_vs_all (bool): Whether to use one-vs-all splitting for categorical features. Default: False.
        feature_frac (float): Fraction of features to randomly consider for splitting each node. Default: 1.
        splitter (str): Determines how trees are grown. "best" is best first and "depth" is breadth first. Default: "best".
        num_steps (int): Number of Gibbs sampling steps. Default: 100.
        num_sampling_rounds (Optional[int]): Include only first n trees when sampling. Default: None.
        temperature (float): Temperature parameter for sampling. Default: 1.0.
        num_sampling_threads (int): Number of threads for parallel sampling (0 for openmp default). Default: 0.
        random_state (int): Random seed for reproducibility. Default: 0.
        **kwargs: Additional arguments passed to TabularBaseGenerator.

    Example:
        >>> import pandas as pd
        >>> from synthyverse.generators import NRGBoostGenerator
        >>>
        >>> # Load data
        >>> X = pd.read_csv("data.csv")
        >>> discrete_features = ["category_col"]
        >>>
        >>> # Create generator
        >>> generator = NRGBoostGenerator(
        ...     num_trees=200,
        ...     shrinkage=0.15,
        ...     num_steps=100,
        ...     random_state=42
        ... )
        >>>
        >>> # Fit and generate
        >>> generator.fit(X, discrete_features)
        >>> X_syn = generator.generate(1000)
    """

    name = "nrgboost"

    def __init__(
        self,
        num_trees: int = 200,
        shrinkage: float = 0.15,
        line_search: bool = True,
        max_leaves: int = 256,
        max_ratio_in_leaf: float = 2,
        min_data_in_leaf: float = 0,
        initial_uniform_mixture: float = 0.1,
        categorical_split_one_vs_all: bool = False,
        feature_frac: float = 1,
        splitter: str = "best",
        num_steps: int = 100,
        num_sampling_rounds: Optional[int] = None,
        temperature: float = 1.0,
        num_sampling_threads: int = 0,
        random_state: int = 0,
        **kwargs,
    ):
        """Store the training and sampling hyperparameters.

        The arguments are split into two dictionaries: ``training_params`` is
        handed to ``NRGBooster.fit`` and ``sampling_params`` is unpacked into
        ``model.sample``. See the class docstring for the meaning of each
        argument.
        """
        super().__init__(random_state=random_state, **kwargs)

        # Every tree-growing option exposed by the constructor must be
        # forwarded here; an option left out of this dict is silently ignored
        # during training.
        self.training_params = {
            "num_trees": num_trees,
            "shrinkage": shrinkage,
            "line_search": line_search,
            "max_leaves": max_leaves,
            "max_ratio_in_leaf": max_ratio_in_leaf,
            "min_data_in_leaf": min_data_in_leaf,
            "initial_uniform_mixture": initial_uniform_mixture,
            "categorical_split_one_vs_all": categorical_split_one_vs_all,
            "feature_frac": feature_frac,
            "splitter": splitter,
        }

        # Keys are the keyword names passed to ``model.sample``, which differ
        # from some of the constructor argument names.
        self.sampling_params = {
            "seed": random_state,
            "num_steps": num_steps,
            "num_rounds": num_sampling_rounds,
            "temperature": temperature,
            "num_threads": num_sampling_threads,
        }

        self.random_state = random_state

    def _fit_model(
        self,
        X: pd.DataFrame,
        discrete_features: list,
        X_val: Optional[pd.DataFrame] = None,
    ):
        """Train an NRGBooster on ``X`` and store it in ``self.model``.

        Args:
            X: Training data. It is copied before any dtype change, so the
                caller's frame is left untouched.
            discrete_features: Names of the columns to cast to the pandas
                ``category`` dtype before building the nrgboost ``Dataset``.
                An empty list (or ``None``) leaves all dtypes as they are.
            X_val: Accepted for interface compatibility; not used by this
                generator.
        """
        train_df = X.copy()

        # Skip the cast entirely when there is nothing to convert, rather than
        # assigning through an empty column selection.
        discrete = [] if discrete_features is None else list(discrete_features)
        if discrete:
            train_df[discrete] = train_df[discrete].astype("category")

        self.model = NRGBooster.fit(
            Dataset(train_df), params=self.training_params, seed=self.random_state
        )

    def _generate_data(self, n: int):
        """Draw ``n`` samples from the fitted model.

        Args:
            n: Number of samples requested from ``self.model.sample``.

        Returns:
            Whatever ``self.model.sample`` returns for the stored
            ``sampling_params``.
        """
        return self.model.sample(n, **self.sampling_params)