Source code for synthyverse.generators.forestdiffusion_generator.forestdiffusion

from ..base import TabularBaseGenerator
import pandas as pd
from .fd_dir.fd import ForestDiffusionModel
import numpy as np



[docs]
class ForestDiffusionGenerator(TabularBaseGenerator):
    """Forest Diffusion.

    Diffusion model leveraging XGBoost models to estimate the score function.

    Uses the ForestDiffusion pypi package implementation. Can be a costly method for large datasets.

    Paper: "Generating and imputing tabular data via diffusion and flow-based gradient-boosted trees" by Jolicoeur-Martineau et al. (2024).

    When ``target_column`` is listed among the discrete features passed to
    fitting, it is removed from the feature matrix and handed to the
    underlying model as the label vector; otherwise it is treated like any
    other feature column.

    Args:
        target_column (str): Name of the target column.
        duplicate_K (int): Number of different noise samples drawn per real data sample. Default: 100.
        noise_level (int): Number of noise levels (forwarded to the model as ``n_t``). Default: 50.
        n_batch (int): Number of batches to use for XGBoost's data iterator (the iterator is used when > 0). Default: 1.
        diffusion_type (str): Type of diffusion. Options: "flow", "vp". Default: "flow".
        n_jobs (int): Number of parallel jobs (-1 for all cores). Default: -1.
        max_depth (int): Maximum depth of trees. Default: 7.
        n_estimators (int): Number of tree estimators. Default: 100.
        eta (float): Learning rate. Default: 0.3.
        tree_method (str): Tree construction method. Options: "hist", "approx", "exact". Default: "hist".
        reg_alpha (float): L1 regularization. Default: 0.0.
        reg_lambda (float): L2 regularization. Default: 0.0.
        subsample (float): Subsample ratio. Default: 1.0.
        num_leaves (int): Number of leaves in trees. This is a LightGBM
            hyperparameter; it is forwarded to the model, but this class
            always selects the "xgboost" backend. Default: 31.
        eps (float): Epsilon parameter. Default: 1e-3.
        beta_min (float): Minimum beta for diffusion. Default: 0.1.
        beta_max (float): Maximum beta for diffusion. Default: 8.
        n_z (int): Number of noise samples to use in zero-shot classification. Default: 10.
        gpu_hist (bool): Whether to use GPU histogram. Default: False.
        random_state (int): Random seed for reproducibility. Default: 0.
        **kwargs: Additional arguments passed to TabularBaseGenerator.

    Example:
        >>> import pandas as pd
        >>> from synthyverse.generators import ForestDiffusionGenerator
        >>>
        >>> # Load data
        >>> X = pd.read_csv("data.csv")
        >>> discrete_features = ["category_col"]
        >>>
        >>> # Create generator (requires target column)
        >>> generator = ForestDiffusionGenerator(
        ...     target_column="target",
        ...     diffusion_type="flow",
        ...     n_jobs=-1,
        ...     random_state=42
        ... )
        >>>
        >>> # Fit and generate
        >>> generator.fit(X, discrete_features)
        >>> X_syn = generator.generate(1000)
    """

    # Class-level descriptors; presumably consumed by the base class or the
    # surrounding framework -- TODO confirm where they are read.
    name = "forestdiffusion"
    needs_target_column = True
    handles_missingness = True

    def __init__(
        self,
        target_column: str,
        duplicate_K: int = 100,
        noise_level: int = 50,
        diffusion_type: str = "flow",
        n_jobs: int = -1,
        max_depth: int = 7,
        n_estimators: int = 100,
        eta: float = 0.3,
        tree_method: str = "hist",
        reg_alpha: float = 0.0,
        reg_lambda: float = 0.0,
        subsample: float = 1.0,
        num_leaves: int = 31,
        eps: float = 1e-3,
        beta_min: float = 0.1,
        beta_max: float = 8,
        n_z: int = 10,
        gpu_hist: bool = False,
        random_state: int = 0,
        n_batch: int = 1,
        **kwargs,
    ):
        """Record the hyperparameters; the model itself is built when fitting.

        Each named argument is stored on the instance under the same name.
        ``random_state`` is additionally forwarded, together with
        ``**kwargs``, to the base-class initializer.
        """
        super().__init__(random_state=random_state, **kwargs)

        # Data and reproducibility.
        self.target_column = target_column
        self.random_state = random_state

        # Diffusion process settings.
        self.diffusion_type = diffusion_type
        self.noise_level = noise_level
        self.duplicate_K = duplicate_K
        self.eps = eps
        self.beta_min = beta_min
        self.beta_max = beta_max
        self.n_z = n_z

        # Tree-learner hyperparameters.
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.eta = eta
        self.tree_method = tree_method
        self.reg_alpha = reg_alpha
        self.reg_lambda = reg_lambda
        self.subsample = subsample
        self.num_leaves = num_leaves

        # Execution / resource settings.
        self.gpu_hist = gpu_hist
        self.n_jobs = n_jobs
        self.n_batch = n_batch

    def _fit_model(
        self, X: pd.DataFrame, discrete_features: list, X_val: pd.DataFrame = None
    ):
        """Train a ``ForestDiffusionModel`` on ``X`` and keep it as ``self.model``.

        Args:
            X: Training data. A copy is taken before any column is dropped.
            discrete_features: Names of the discrete columns; a copy of the
                list is stored on ``self.discrete_features``.
            X_val: Accepted but not used by this method.
        """
        self.ori_col_order = X.columns
        self.discrete_features = discrete_features.copy()
        features = X.copy()

        # A discrete target is pulled out of the feature matrix and given to
        # the model as labels; otherwise it stays a regular feature column
        # and no labels are passed.
        labels = None
        if self.target_column in self.discrete_features:
            labels = features[self.target_column].to_numpy()
            features = features.drop(columns=[self.target_column])

        # Distinct-value count of every discrete feature, in column order.
        cardinality = {
            col: features[col].nunique()
            for col in features.columns
            if col in self.discrete_features
        }
        # Exactly two distinct values -> binary; anything else -> categorical.
        bin_indexes = [
            features.columns.get_loc(col)
            for col, n_unique in cardinality.items()
            if n_unique == 2
        ]
        cat_indexes = [
            features.columns.get_loc(col)
            for col, n_unique in cardinality.items()
            if n_unique != 2
        ]
        int_indexes = []  # already handled by basegenerator

        model_kwargs = dict(
            # Optional extra covariates for sampling X | X_covs; the model
            # does not transform them, so scale them yourself if desired.
            X_covs=None,
            # Must be categorical/binary; when given, separate models are
            # learned for each label value.
            label_y=labels,
            n_t=self.noise_level,  # number of noise levels
            model="xgboost",  # alternatives: random_forest, lgbm, catboost
            # vp or flow (flow is better, but only vp supports imputation)
            diffusion_type=self.diffusion_type,
            # xgboost hyperparameters
            max_depth=self.max_depth,
            n_estimators=self.n_estimators,
            eta=self.eta,
            tree_method=self.tree_method,
            reg_alpha=self.reg_alpha,
            reg_lambda=self.reg_lambda,
            subsample=self.subsample,
            num_leaves=self.num_leaves,  # lgbm hyperparameter
            # number of different noise samples per real data sample
            duplicate_K=self.duplicate_K,
            bin_indexes=bin_indexes,  # positions of binary columns
            cat_indexes=cat_indexes,  # positions of categorical columns (>=3 categories)
            int_indexes=int_indexes,  # positions of integer (ordinal) columns
            # If True, missing values are removed so one XGBoost model can
            # serve all predictors; otherwise that is not possible.
            remove_miss=False,
            # When there are no missing values, train one model for all
            # predictors where possible.
            p_in_one=True,
            # None, or [[min_x, min_y], [max_x, max_y]] to use as the
            # per-variable min/max when clipping.
            true_min_max_values=None,
            gpu_hist=self.gpu_hist,  # whether xgboost uses the GPU
            n_z=self.n_z,  # number of noise draws for zero-shot classification
            eps=self.eps,
            beta_min=self.beta_min,
            beta_max=self.beta_max,
            # CPUs used; a small value leaves more CPUs per model (lgbm
            # needs n_jobs=1, otherwise it will never finish).
            n_jobs=self.n_jobs,
            # If >0, use the data iterator with this many batches.
            n_batch=self.n_batch,
            seed=self.random_state,
        )

        self.model = ForestDiffusionModel(features.to_numpy(), **model_kwargs)

    def _generate_data(self, n: int):
        syn = self.model.generate(batch_size=n)

        # put back in original column order
        if self.target_column in self.discrete_features:
            syn_X = pd.DataFrame(
                syn[:, :-1],
                columns=[x for x in self.ori_col_order if x != self.target_column],
            )
            syn_y = pd.DataFrame(
                np.expand_dims(syn[:, -1], axis=1), columns=[self.target_column]
            )
            syn = pd.concat([syn_X, syn_y], axis=1)
            syn = syn[self.ori_col_order]
        else:
            syn = pd.DataFrame(syn, columns=self.ori_col_order)

        return syn

    def _cleanup_additional_state_for_save(self) -> None:
        if not hasattr(self, "model"):
            return

        # These are only needed for imputation workflows, not generation.
        for attr in ("X1", "X_covs"):
            if hasattr(self.model, attr):
                setattr(self.model, attr, None)