Source code for synthyverse.generators.forestdiffusion_generator.forestdiffusion

from ..base import TabularBaseGenerator
import pandas as pd
from .fd_dir.fd import ForestDiffusionModel
import numpy as np


class ForestDiffusionGenerator(TabularBaseGenerator):
    """Tabular generator backed by Forest Diffusion.

    Forest Diffusion is a diffusion model that relies on XGBoost models to
    estimate the score function. This class wraps the implementation shipped
    with the ForestDiffusion pypi package. Training can be costly on large
    datasets.

    Paper: "Generating and imputing tabular data via diffusion and flow-based
    gradient-boosted trees" by Jolicoeur-Martineau et al. (2024).

    Args:
        target_column (str): Name of the target column.
        duplicate_K (int): Number of duplicates for each sample. Default: 100.
        noise_level (int): Noise level for diffusion. Default: 50.
        diffusion_type (str): Type of diffusion. Options: "flow", "vp".
            Default: "flow".
        n_jobs (int): Number of parallel jobs (-1 for all cores). Default: -1.
        max_depth (int): Maximum depth of trees. Default: 7.
        n_estimators (int): Number of tree estimators. Default: 100.
        eta (float): Learning rate. Default: 0.3.
        tree_method (str): Tree construction method. Options: "hist",
            "approx", "exact". Default: "hist".
        reg_alpha (float): L1 regularization. Default: 0.0.
        reg_lambda (float): L2 regularization. Default: 0.0.
        subsample (float): Subsample ratio. Default: 1.0.
        num_leaves (int): Number of leaves in trees. Default: 31.
        eps (float): Epsilon parameter. Default: 1e-3.
        beta_min (float): Minimum beta for diffusion. Default: 0.1.
        beta_max (float): Maximum beta for diffusion. Default: 8.
        n_z (int): Dimension of latent space. Default: 10.
        gpu_hist (bool): Whether to use GPU histogram. Default: False.
        random_state (int): Random seed for reproducibility. Default: 0.
        n_batch (int): Number of batches to use for XGBoost's data iterator.
            Default: 1.
        **kwargs: Additional arguments passed to TabularBaseGenerator.

    Example:
        >>> import pandas as pd
        >>> from synthyverse.generators import ForestDiffusionGenerator
        >>>
        >>> # Load data
        >>> X = pd.read_csv("data.csv")
        >>> discrete_features = ["category_col"]
        >>>
        >>> # Create generator (requires target column)
        >>> generator = ForestDiffusionGenerator(
        ...     target_column="target",
        ...     diffusion_type="flow",
        ...     n_jobs=-1,
        ...     random_state=42
        ... )
        >>>
        >>> # Fit and generate
        >>> generator.fit(X, discrete_features)
        >>> X_syn = generator.generate(1000)
    """

    name = "forestdiffusion"
    needs_target_column = True
    handles_missingness = True

    def __init__(
        self,
        target_column: str,
        duplicate_K: int = 100,
        noise_level: int = 50,
        diffusion_type: str = "flow",
        n_jobs: int = -1,
        max_depth: int = 7,
        n_estimators: int = 100,
        eta: float = 0.3,
        tree_method: str = "hist",
        reg_alpha: float = 0.0,
        reg_lambda: float = 0.0,
        subsample: float = 1.0,
        num_leaves: int = 31,
        eps: float = 1e-3,
        beta_min: float = 0.1,
        beta_max: float = 8,
        n_z: int = 10,
        gpu_hist: bool = False,
        random_state: int = 0,
        n_batch: int = 1,
        **kwargs,
    ):
        """Store the hyperparameters; no model is built until fitting."""
        super().__init__(random_state=random_state, **kwargs)

        # Data / bookkeeping settings.
        self.target_column = target_column
        self.random_state = random_state

        # Tree-booster hyperparameters.
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.eta = eta
        self.gpu_hist = gpu_hist
        self.tree_method = tree_method
        self.reg_alpha = reg_alpha
        self.reg_lambda = reg_lambda
        self.subsample = subsample
        self.num_leaves = num_leaves

        # Diffusion hyperparameters.
        self.eps = eps
        self.beta_min = beta_min
        self.beta_max = beta_max
        self.n_z = n_z
        self.duplicate_K = duplicate_K
        self.diffusion_type = diffusion_type
        self.noise_level = noise_level

        # Execution settings.
        self.n_jobs = n_jobs
        self.n_batch = n_batch

    def _fit_model(
        self, X: pd.DataFrame, discrete_features: list, X_val: pd.DataFrame = None
    ):
        """Build and train the underlying ``ForestDiffusionModel`` on ``X``.

        Args:
            X: Training data. Its column order is remembered so generated
                data can be returned in the same layout.
            discrete_features: Names of the discrete columns of ``X``. A copy
                is stored on the instance.
            X_val: Accepted for interface compatibility; not used here.
        """
        self.ori_col_order = X.columns
        self.discrete_features = discrete_features.copy()

        features = X.copy()

        # A discrete target means classification: the target is split off and
        # handed to the model as labels instead of as a regular feature.
        if self.target_column in self.discrete_features:
            labels = features[self.target_column].to_numpy()
            features = features.drop(columns=[self.target_column])
        else:
            labels = None

        # Positional indexes of discrete columns, split by cardinality:
        # exactly two distinct values -> binary, anything else -> categorical.
        bin_indexes = []
        cat_indexes = []
        for position, col in enumerate(features.columns):
            if col not in self.discrete_features:
                continue
            if features[col].nunique() == 2:
                bin_indexes.append(position)
            else:
                cat_indexes.append(position)

        # Integer columns are already handled by the base generator.
        int_indexes = []

        model_kwargs = {
            # Optional additional covariates to sample X | X_covs; unused.
            "X_covs": None,
            # Categorical/binary labels; when given, the model learns
            # separate models for each label value.
            "label_y": labels,
            # Number of noise levels.
            "n_t": self.noise_level,
            # Backend: xgboost, random_forest, lgbm, catboost.
            "model": "xgboost",
            # vp or flow (only vp can be used for imputation).
            "diffusion_type": self.diffusion_type,
            # xgboost hyperparameters.
            "max_depth": self.max_depth,
            "n_estimators": self.n_estimators,
            "eta": self.eta,
            "tree_method": self.tree_method,
            "reg_alpha": self.reg_alpha,
            "reg_lambda": self.reg_lambda,
            "subsample": self.subsample,
            # lgbm hyperparameter.
            "num_leaves": self.num_leaves,
            # Number of different noise samples per real data sample.
            "duplicate_K": self.duplicate_K,
            # Column-type index vectors.
            "bin_indexes": bin_indexes,
            "cat_indexes": cat_indexes,
            "int_indexes": int_indexes,
            # Keep rows with missing values.
            "remove_miss": False,
            # When possible (no missing values), train one model for all
            # predictors.
            "p_in_one": True,
            # No user-supplied min/max values for clipping.
            "true_min_max_values": None,
            # Whether xgboost runs on GPU.
            "gpu_hist": self.gpu_hist,
            # Number of noise draws used in zero-shot classification.
            "n_z": self.n_z,
            "eps": self.eps,
            "beta_min": self.beta_min,
            "beta_max": self.beta_max,
            # CPUs used.
            "n_jobs": self.n_jobs,
            # If >0, use the data iterator with this number of batches.
            "n_batch": self.n_batch,
            "seed": self.random_state,
        }
        self.model = ForestDiffusionModel(features.to_numpy(), **model_kwargs)

    def _generate_data(self, n: int):
        """Sample ``n`` rows and return them in the original column order.

        Args:
            n: Number of rows requested from the fitted model.

        Returns:
            pd.DataFrame: Generated rows with the columns seen during fitting.
        """
        samples = self.model.generate(batch_size=n)

        # Target was kept as a regular feature: columns already line up.
        if self.target_column not in self.discrete_features:
            return pd.DataFrame(samples, columns=self.ori_col_order)

        # Target was passed as labels: the last generated column is taken as
        # the target and the remaining columns as the features, then the
        # frame is reordered to the original layout.
        feature_names = [
            col for col in self.ori_col_order if col != self.target_column
        ]
        feature_frame = pd.DataFrame(samples[:, :-1], columns=feature_names)
        target_frame = pd.DataFrame(samples[:, -1:], columns=[self.target_column])
        combined = pd.concat([feature_frame, target_frame], axis=1)
        return combined[self.ori_col_order]

    def _cleanup_additional_state_for_save(self) -> None:
        """Clear model attributes that are not needed for generation."""
        if hasattr(self, "model"):
            # These are only needed for imputation workflows, not generation.
            for attr_name in ("X1", "X_covs"):
                if hasattr(self.model, attr_name):
                    setattr(self.model, attr_name, None)