Source code for synthyverse.generators.tvae_generator.tvae

from ctgan import TVAE
from ..base import TabularBaseGenerator
import pandas as pd


class TVAEGenerator(TabularBaseGenerator):
    """Tabular Variational Autoencoder (TVAE).

    Similar to CTGAN. Conditions on discrete columns, and uses mode-specific
    normalization for numerical columns. Uses the implementation from the
    ctgan package, which is also used in the Synthetic Data Vault.

    Paper: "Modeling tabular data using conditional gan" by Xu et al. (2019).

    Args:
        embedding_dim (int): Dimension of the embedding layer. Default: 128.
        compress_dims (tuple): Tuple of dimensions for encoder layers.
            Default: (128, 128).
        decompress_dims (tuple): Tuple of dimensions for decoder layers.
            Default: (128, 128).
        l2scale (float): L2 regularization scale. Default: 1e-5.
        batch_size (int): Batch size for training. Default: 500.
        epochs (int): Number of training epochs. Default: 300.
        loss_factor (int): Loss factor for Beta-VAE, forwarded to
            ``ctgan.TVAE``. Default: 2.
        cuda (bool): Whether to use CUDA if available. Default: True.
        verbose (bool): Whether to print training progress. Default: True.
        random_state (int): Random seed for reproducibility. Default: 0.
        **kwargs: Additional arguments passed to TabularBaseGenerator.

    Example:
        >>> import pandas as pd
        >>> from synthyverse.generators import TVAEGenerator
        >>>
        >>> # Load data
        >>> X = pd.read_csv("data.csv")
        >>> discrete_features = ["category_col"]
        >>>
        >>> # Create generator
        >>> generator = TVAEGenerator(
        ...     embedding_dim=128,
        ...     epochs=300,
        ...     cuda=True,
        ...     random_state=42
        ... )
        >>>
        >>> # Fit and generate
        >>> generator.fit(X, discrete_features)
        >>> X_syn = generator.generate(1000)
    """

    name = "tvae"

    def __init__(
        self,
        embedding_dim=128,
        compress_dims=(128, 128),
        decompress_dims=(128, 128),
        l2scale=1e-5,
        batch_size=500,
        epochs=300,
        loss_factor=2,
        cuda=True,
        verbose=True,
        random_state: int = 0,
        **kwargs,
    ):
        """Store the TVAE hyperparameters; the model itself is built at fit time.

        ``random_state`` and any extra ``**kwargs`` are handed to the
        ``TabularBaseGenerator`` constructor.
        """
        super().__init__(random_state=random_state, **kwargs)
        self.embedding_dim = embedding_dim
        self.compress_dims = compress_dims
        self.decompress_dims = decompress_dims
        self.l2scale = l2scale
        self.batch_size = batch_size
        self.epochs = epochs
        self.loss_factor = loss_factor
        self.cuda = cuda
        self.verbose = verbose

    def _fit_model(
        self, X: pd.DataFrame, discrete_features: list, X_val: pd.DataFrame = None
    ):
        """Build a fresh ``ctgan.TVAE`` from the stored settings and fit it on ``X``.

        Args:
            X (pd.DataFrame): Training data.
            discrete_features (list): Discrete columns, passed through to
                ``TVAE.fit``.
            X_val (pd.DataFrame): Accepted but currently unused by this
                generator.
        """
        self.model = TVAE(
            embedding_dim=self.embedding_dim,
            compress_dims=self.compress_dims,
            decompress_dims=self.decompress_dims,
            l2scale=self.l2scale,
            batch_size=self.batch_size,
            epochs=self.epochs,
            # Previously stored but never forwarded, so a user-supplied value
            # was silently ignored in favour of the library default.
            loss_factor=self.loss_factor,
            cuda=self.cuda,
            verbose=self.verbose,
        )
        self.model.fit(X, discrete_features)

    def _generate_data(self, n: int):
        """Return ``n`` samples drawn from the fitted model.

        Relies on ``self.model``, which is only created by ``_fit_model``.
        """
        return self.model.sample(n)