Source code for synthyverse.generators.bn_generator.bn

import pandas as pd
from synthcity.plugins.core.dataloader import GenericDataLoader
from synthcity.plugins import Plugins

from ..base import TabularBaseGenerator


[docs] class BNGenerator(TabularBaseGenerator): """Bayesian Network (BN). Uses Bayesian networks to model dependencies between variables and generate synthetic data by sampling from the learned joint distribution. Uses the implementation from Synthcity (https://github.com/vanderschaarlab/synthcity/). Args: struct_learning_n_iter (int): Number of iterations for DAG learning. Default: 1000. struct_learning_search_method (str): Search method for DAG learning. Options: "hillclimb", "pc", "tree_search", "mmhc", "exhaustive". Default: "tree_search". struct_learning_score (str): Scoring function for DAG learning. Options: "k2", "bdeu", "bic", "bds". Default: "k2". struct_max_indegree (int): Maximum number of parents for each node. Decrease to reduce computational overhead. Default: 4. encoder_max_clusters (int): Maximum clusters for encoding continuous variables. Default: 10. encoder_noise_scale (float): Noise scale for encoding. Default: 0.1. random_state (int): Random seed for reproducibility. Default: 0. **kwargs: Additional arguments passed to TabularBaseGenerator. Example: >>> import pandas as pd >>> from synthyverse.generators import BNGenerator >>> >>> # Load data >>> X = pd.read_csv("data.csv") >>> discrete_features = ["category_col"] >>> >>> # Create generator >>> generator = BNGenerator( ... struct_learning_search_method="tree_search", ... struct_learning_score="k2", ... random_state=42 ... ) >>> >>> # Fit and generate >>> generator.fit(X, discrete_features) >>> X_syn = generator.generate(1000) """ name = "bn" def __init__( self, struct_learning_n_iter: int = 1000, struct_learning_search_method: str = "tree_search", # hillclimb, pc, tree_search, mmhc, exhaustive struct_learning_score: str = "k2", # k2, bdeu, bic, bds struct_max_indegree: int = 4, encoder_max_clusters: int = 10, encoder_noise_scale: float = 0.1, random_state: int = 0, **kwargs, ): super().__init__(random_state=random_state, **kwargs) self.model = Plugins().get( "bayesian_network", struct_learning_n_iter=struct_learning_n_iter, struct_learning_search_method=struct_learning_search_method, struct_learning_score=struct_learning_score, struct_max_indegree=struct_max_indegree, encoder_max_clusters=encoder_max_clusters, encoder_noise_scale=encoder_noise_scale, random_state=random_state, ) def _fit_model( self, X: pd.DataFrame, discrete_features: list, X_val: pd.DataFrame = None ): loader = GenericDataLoader( X, target_column=self.target_column, train_size=1, random_state=self.random_state, ) self.model.fit(loader) def _generate_data(self, n: int): syn = self.model.generate(n) return syn.dataframe()