Source code for synthyverse.generators.cdtd_generator.cdtd

import pandas as pd
from sklearn.preprocessing import QuantileTransformer, StandardScaler
import torch

from ..base import TabularBaseGenerator
from .cdtd_dir import CDTD


class CDTDGenerator(TabularBaseGenerator):
    """Continuous Diffusion for mixed-type Tabular Data (CDTD).

    CDTD uses continuous diffusion for mixed-type tabular data. It provides
    several improvements to homogenize data types in the modelling process.
    Uses the simple wrapper implementation from the original paper's authors
    (https://github.com/muellermarkus/cdtd_simple)

    Paper: "Continuous Diffusion for Mixed-Type Tabular Data" by Mueller et al. (2023).

    Args:
        cat_emb_dim (int): Embedding dimension for categorical features. Default: 16.
        mlp_emb_dim (int): Embedding dimension for MLP layers. Default: 256.
        mlp_n_layers (int): Number of MLP layers. Default: 5.
        mlp_n_units (int): Number of units per MLP layer. Default: 1024.
        sigma_data_cat (float): Data sigma for categorical features. Default: 1.0.
        sigma_data_cont (float): Data sigma for continuous features. Default: 1.0.
        sigma_min_cat (float): Minimum sigma for categorical features. Default: 0.0.
        sigma_min_cont (float): Minimum sigma for continuous features. Default: 0.0.
        sigma_max_cat (float): Maximum sigma for categorical features. Default: 100.0.
        sigma_max_cont (float): Maximum sigma for continuous features. Default: 80.0.
        cat_emb_init_sigma (float): Initial sigma for categorical embeddings. Default: 0.001.
        timewarp_type (str): Type of time warping. Options: "single", "bytype", "all".
            Default: "bytype".
        timewarp_weight_low_noise (float): Weight for low noise in time warping.
            Default: 1.0.
        num_steps_train (int): Number of training steps (iterations, not epochs).
            Default: 30000.
        num_steps_warmup (int): Number of warmup steps. Default: 1000.
        batch_size (int): Batch size for training; the same value is reused as
            the batch size when sampling. Default: 4096.
        lr (float): Learning rate. Default: 1e-3.
        ema_decay (float): Exponential moving average decay. Default: 0.999.
        log_steps (int): Steps between logging. Default: 100.
        random_state (int): Random seed for reproducibility. Default: 0.
        **kwargs: Additional arguments passed to TabularBaseGenerator.

    Example:
        >>> import pandas as pd
        >>> from synthyverse.generators import CDTDGenerator
        >>>
        >>> # Load data
        >>> X = pd.read_csv("data.csv")
        >>> discrete_features = ["category_col"]
        >>>
        >>> # Create generator
        >>> generator = CDTDGenerator(
        ...     timewarp_type="bytype",
        ...     num_steps_train=30000,
        ...     random_state=42
        ... )
        >>>
        >>> # Fit and generate
        >>> generator.fit(X, discrete_features)
        >>> X_syn = generator.generate(1000)
    """

    name = "cdtd"

    def __init__(
        self,
        cat_emb_dim: int = 16,
        mlp_emb_dim: int = 256,
        mlp_n_layers: int = 5,
        mlp_n_units: int = 1024,
        sigma_data_cat: float = 1.0,
        sigma_data_cont: float = 1.0,
        sigma_min_cat: float = 0.0,
        sigma_min_cont: float = 0.0,
        sigma_max_cat: float = 100.0,
        sigma_max_cont: float = 80.0,
        cat_emb_init_sigma: float = 0.001,
        timewarp_type: str = "bytype",  # 'single', 'bytype', or 'all'
        timewarp_weight_low_noise: float = 1.0,
        num_steps_train: int = 30_000,
        num_steps_warmup: int = 1000,
        batch_size: int = 4096,
        lr: float = 1e-3,
        ema_decay: float = 0.999,
        log_steps: int = 100,
        random_state: int = 0,
        **kwargs,
    ):
        """Store the hyperparameters, grouped by the CDTD call that consumes them."""
        super().__init__(random_state=random_state, **kwargs)

        # Forwarded to the CDTD constructor in _fit_model.
        self.cdtd_params = {
            "cat_emb_dim": cat_emb_dim,
            "mlp_emb_dim": mlp_emb_dim,
            "mlp_n_layers": mlp_n_layers,
            "mlp_n_units": mlp_n_units,
            "sigma_data_cat": sigma_data_cat,
            "sigma_data_cont": sigma_data_cont,
            "sigma_min_cat": sigma_min_cat,
            "sigma_min_cont": sigma_min_cont,
            "sigma_max_cat": sigma_max_cat,
            "sigma_max_cont": sigma_max_cont,
            "cat_emb_init_sigma": cat_emb_init_sigma,
            "timewarp_type": timewarp_type,  # 'single', 'bytype', or 'all'
            "timewarp_weight_low_noise": timewarp_weight_low_noise,
        }

        # Forwarded to CDTD.fit in _fit_model.
        self.training_params = {
            "num_steps_train": num_steps_train,
            "num_steps_warmup": num_steps_warmup,
            "batch_size": batch_size,
            "lr": lr,
            "ema_decay": ema_decay,
            "log_steps": log_steps,
            "seed": self.random_state,
        }

        # Forwarded to CDTD.sample in _generate_data; the number of sampling
        # steps is fixed at 200 and the training batch size is reused.
        self.sample_params = {
            "num_steps": 200,
            "batch_size": batch_size,
            "seed": self.random_state,
        }

        # GPUs can use fast float32 operations
        # NOTE: this is a process-wide torch setting, not scoped to this instance.
        if torch.cuda.is_available():
            torch.set_float32_matmul_precision("high")

    def _fit_model(
        self, X: pd.DataFrame, discrete_features: list, X_val: pd.DataFrame = None
    ):
        """Fit the numerical preprocessors and train the CDTD model on ``X``.

        Args:
            X: Training data. Columns listed in ``discrete_features`` are
                treated as categorical; all remaining columns as numerical.
            discrete_features: Names of the discrete columns of ``X``.
            X_val: Accepted for interface compatibility; not used here.
        """
        # Copy into a plain list: _generate_data concatenates this with
        # self.numerical_features using ``+``, which is only list
        # concatenation for real lists (a tuple would raise, a pandas Index
        # or numpy array would be combined element-wise).
        self.discrete_features = list(discrete_features)
        discrete_lookup = set(self.discrete_features)
        self.numerical_features = [
            col for col in X.columns if col not in discrete_lookup
        ]

        # retain original column order to output correct dataframe format after generation
        self.col_order = X.columns

        # discrete columns already integer-encoded by BaseGenerator
        X_discrete = X[self.discrete_features].to_numpy()

        # quantile transform and standard scale numericals
        # (tries to put 30 samples per bin, but caps range inside [10,1000])
        X_numerical = X[self.numerical_features].to_numpy().astype(float)
        self.quant_encoder = QuantileTransformer(
            output_distribution="normal",
            n_quantiles=max(min(X_numerical.shape[0] // 30, 1000), 10),
            # very large cap so the quantiles are estimated on all rows
            subsample=int(1e9),
            random_state=self.random_state,
        )
        X_numerical = self.quant_encoder.fit_transform(X_numerical)
        self.scaler = StandardScaler()
        X_numerical = self.scaler.fit_transform(X_numerical)

        X_discrete = torch.tensor(X_discrete).long()
        X_numerical = torch.tensor(X_numerical).float()

        self.cdtd = CDTD(
            X_cat_train=X_discrete, X_cont_train=X_numerical, **self.cdtd_params
        )
        self.cdtd.fit(
            X_cat_train=X_discrete, X_cont_train=X_numerical, **self.training_params
        )

    def _generate_data(self, n: int):
        """Sample ``n`` synthetic rows and map them back to the input format.

        Args:
            n: Number of rows to sample.

        Returns:
            pd.DataFrame: Synthetic data with the same columns, in the same
            order, as the frame passed to ``_fit_model``.
        """
        # synthesize
        syn_X_discrete, syn_X_numerical = self.cdtd.sample(
            num_samples=n, **self.sample_params
        )

        # postprocess to format expected by basegenerator:
        # undo the preprocessing in reverse order (scaler first, then quantiles)
        syn_X_numerical = self.scaler.inverse_transform(syn_X_numerical)
        syn_X_numerical = self.quant_encoder.inverse_transform(syn_X_numerical)

        # combine to synthetic dataset
        syn_X = pd.concat(
            (pd.DataFrame(syn_X_discrete), pd.DataFrame(syn_X_numerical)), axis=1
        )
        # label in concatenation order: discrete block first, then numerical
        syn_X.columns = self.discrete_features + self.numerical_features

        # rearrange columns to correct order
        syn_X = syn_X[self.col_order]

        return syn_X