import pandas as pd
from sklearn.preprocessing import QuantileTransformer, StandardScaler
import torch
from ..base import TabularBaseGenerator
from .cdtd_dir import CDTD
# [docs]  (Sphinx "view source" link marker left over from extraction; not code)
class CDTDGenerator(TabularBaseGenerator):
    """Continuous Diffusion for mixed-type Tabular Data (CDTD).

    CDTD uses continuous diffusion for mixed-type tabular data. It provides
    several improvements to homogenize data types in the modelling process.

    Uses the simple wrapper implementation from the original paper's authors
    (https://github.com/muellermarkus/cdtd_simple).

    Paper: "Continuous Diffusion for Mixed-Type Tabular Data" by Mueller et al. (2023).

    Args:
        cat_emb_dim (int): Embedding dimension for categorical features. Default: 16.
        mlp_emb_dim (int): Embedding dimension for MLP layers. Default: 256.
        mlp_n_layers (int): Number of MLP layers. Default: 5.
        mlp_n_units (int): Number of units per MLP layer. Default: 1024.
        sigma_data_cat (float): Data sigma for categorical features. Default: 1.0.
        sigma_data_cont (float): Data sigma for continuous features. Default: 1.0.
        sigma_min_cat (float): Minimum sigma for categorical features. Default: 0.0.
        sigma_min_cont (float): Minimum sigma for continuous features. Default: 0.0.
        sigma_max_cat (float): Maximum sigma for categorical features. Default: 100.0.
        sigma_max_cont (float): Maximum sigma for continuous features. Default: 80.0.
        cat_emb_init_sigma (float): Initial sigma for categorical embeddings. Default: 0.001.
        timewarp_type (str): Type of time warping. Options: "single", "bytype", "all".
            Default: "bytype".
        timewarp_weight_low_noise (float): Weight for low noise in time warping. Default: 1.0.
        num_steps_train (int): Number of training steps (iterations, not epochs).
            Default: 30000.
        num_steps_warmup (int): Number of warmup steps. Default: 1000.
        batch_size (int): Batch size for training; the same value is also used as the
            batch size when sampling. Default: 4096.
        lr (float): Learning rate. Default: 1e-3.
        ema_decay (float): Exponential moving average decay. Default: 0.999.
        log_steps (int): Steps between logging. Default: 100.
        random_state (int): Random seed for reproducibility. Default: 0.
        num_steps_sample (int): Number of steps passed to the CDTD sampler when
            generating data. Default: 200.
        **kwargs: Additional arguments passed to TabularBaseGenerator.

    Example:
        >>> import pandas as pd
        >>> from synthyverse.generators import CDTDGenerator
        >>>
        >>> # Load data
        >>> X = pd.read_csv("data.csv")
        >>> discrete_features = ["category_col"]
        >>>
        >>> # Create generator
        >>> generator = CDTDGenerator(
        ...     timewarp_type="bytype",
        ...     num_steps_train=30000,
        ...     random_state=42
        ... )
        >>>
        >>> # Fit and generate
        >>> generator.fit(X, discrete_features)
        >>> X_syn = generator.generate(1000)
    """

    name = "cdtd"

    def __init__(
        self,
        cat_emb_dim: int = 16,
        mlp_emb_dim: int = 256,
        mlp_n_layers: int = 5,
        mlp_n_units: int = 1024,
        sigma_data_cat: float = 1.0,
        sigma_data_cont: float = 1.0,
        sigma_min_cat: float = 0.0,
        sigma_min_cont: float = 0.0,
        sigma_max_cat: float = 100.0,
        sigma_max_cont: float = 80.0,
        cat_emb_init_sigma: float = 0.001,
        timewarp_type: str = "bytype",  # 'single', 'bytype', or 'all'
        timewarp_weight_low_noise: float = 1.0,
        num_steps_train: int = 30_000,
        num_steps_warmup: int = 1000,
        batch_size: int = 4096,
        lr: float = 1e-3,
        ema_decay: float = 0.999,
        log_steps: int = 100,
        random_state: int = 0,
        num_steps_sample: int = 200,
        **kwargs,
    ):
        """Store the model, training and sampling hyperparameters.

        No model is built here; the CDTD instance is created in ``_fit_model``.
        See the class docstring for the meaning of each argument.
        """
        super().__init__(random_state=random_state, **kwargs)

        # Keyword arguments forwarded to the CDTD constructor.
        self.cdtd_params = {
            "cat_emb_dim": cat_emb_dim,
            "mlp_emb_dim": mlp_emb_dim,
            "mlp_n_layers": mlp_n_layers,
            "mlp_n_units": mlp_n_units,
            "sigma_data_cat": sigma_data_cat,
            "sigma_data_cont": sigma_data_cont,
            "sigma_min_cat": sigma_min_cat,
            "sigma_min_cont": sigma_min_cont,
            "sigma_max_cat": sigma_max_cat,
            "sigma_max_cont": sigma_max_cont,
            "cat_emb_init_sigma": cat_emb_init_sigma,
            "timewarp_type": timewarp_type,  # 'single', 'bytype', or 'all'
            "timewarp_weight_low_noise": timewarp_weight_low_noise,
        }

        # Keyword arguments forwarded to CDTD.fit.
        # NOTE: self.random_state is read back from the instance (expected to be
        # set by the base-class constructor) rather than taken from the argument.
        self.training_params = {
            "num_steps_train": num_steps_train,
            "num_steps_warmup": num_steps_warmup,
            "batch_size": batch_size,
            "lr": lr,
            "ema_decay": ema_decay,
            "log_steps": log_steps,
            "seed": self.random_state,
        }

        # Keyword arguments forwarded to CDTD.sample.
        self.sample_params = {
            "num_steps": num_steps_sample,
            "batch_size": batch_size,
            "seed": self.random_state,
        }

        # GPUs can use fast float32 operations.
        # NOTE: this changes a torch setting for the whole process, not just
        # for this generator instance.
        if torch.cuda.is_available():
            torch.set_float32_matmul_precision("high")

    def _fit_model(
        self, X: pd.DataFrame, discrete_features: list, X_val: pd.DataFrame = None
    ):
        """Preprocess ``X`` and train a CDTD model on it.

        Args:
            X: Training data. Discrete columns are expected to be
                integer-encoded already (done by the base generator).
            discrete_features: Names of the discrete columns of ``X``. Every
                other column is treated as numerical.
            X_val: Accepted for interface compatibility; not used here.
        """
        # Copy into a list so that any iterable of column names (tuple,
        # pandas Index, ...) works for column selection and for the list
        # concatenation performed in _generate_data.
        self.discrete_features = list(discrete_features)
        discrete_lookup = set(self.discrete_features)
        self.numerical_features = [
            col for col in X.columns if col not in discrete_lookup
        ]

        # retain original column order to output correct dataframe format after generation
        self.col_order = X.columns

        # discrete columns already integer-encoded by BaseGenerator
        X_discrete = X[self.discrete_features].to_numpy()
        X_numerical = X[self.numerical_features].to_numpy().astype(float)

        if self.numerical_features:
            # quantile transform and standard scale numericals (tries to put
            # 30 samples per bin, but caps range inside [10,1000])
            self.quant_encoder = QuantileTransformer(
                output_distribution="normal",
                n_quantiles=max(min(X_numerical.shape[0] // 30, 1000), 10),
                # large enough that row subsampling is effectively disabled
                subsample=int(1e9),
                random_state=self.random_state,
            )
            X_numerical = self.quant_encoder.fit_transform(X_numerical)
            self.scaler = StandardScaler()
            X_numerical = self.scaler.fit_transform(X_numerical)
        else:
            # scikit-learn transformers reject arrays with zero columns, so
            # skip the numerical preprocessing for purely discrete data.
            # NOTE(review): whether CDTD itself accepts zero continuous
            # columns is not visible from this file - confirm.
            self.quant_encoder = None
            self.scaler = None

        X_discrete = torch.tensor(X_discrete).long()
        X_numerical = torch.tensor(X_numerical).float()

        self.cdtd = CDTD(
            X_cat_train=X_discrete, X_cont_train=X_numerical, **self.cdtd_params
        )
        self.cdtd.fit(
            X_cat_train=X_discrete, X_cont_train=X_numerical, **self.training_params
        )

    def _generate_data(self, n: int):
        """Sample ``n`` synthetic rows and map them back to the input format.

        Args:
            n: Number of rows to generate.

        Returns:
            pd.DataFrame: Synthetic data with the same columns, in the same
            order, as the frame passed to ``_fit_model``.
        """
        # synthesize
        syn_X_discrete, syn_X_numerical = self.cdtd.sample(
            num_samples=n, **self.sample_params
        )

        # postprocess to format expected by basegenerator: undo the scaling
        # and the quantile transform in reverse order of application
        if self.numerical_features:
            syn_X_numerical = self.scaler.inverse_transform(syn_X_numerical)
            syn_X_numerical = self.quant_encoder.inverse_transform(syn_X_numerical)

        # combine to synthetic dataset (discrete block first, then numerical)
        syn_X = pd.concat(
            (pd.DataFrame(syn_X_discrete), pd.DataFrame(syn_X_numerical)), axis=1
        )
        syn_X.columns = list(self.discrete_features) + list(self.numerical_features)

        # rearrange columns to correct order
        return syn_X[self.col_order]