Source code for synthyverse.generators.smote_generator.smote

import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from imblearn.over_sampling import SMOTENC, SMOTE, SMOTEN


from ..base import TabularBaseGenerator


class SMOTEGenerator(TabularBaseGenerator):
    """Synthetic Minority Over-sampling Technique (SMOTE) for tabular data.

    Creates synthetic samples via interpolation in feature space using SMOTE.
    For classification tasks, the provided target column is used directly for
    class-conditional oversampling. For regression tasks, a pseudo-binary
    target is derived by splitting the target at its median, following a
    strategy similar to the TabDDPM paper.

    Args:
        target_column (str): Name of the target column used to drive
            oversampling.
        k_neighbors (int): Number of nearest neighbors used during
            interpolation. Default: 5.
        n_jobs (int): Number of parallel jobs for nearest-neighbor search.
            Default: -1.
        random_state (int): Random seed for reproducibility. Default: 0.
        **kwargs: Additional arguments passed to `TabularBaseGenerator`.

    Example:
        >>> import pandas as pd
        >>> from synthyverse.generators import SMOTEGenerator
        >>>
        >>> # Load data and define discrete features
        >>> X = pd.read_csv("data.csv")
        >>> discrete_features = ["target", "category_col"]
        >>>
        >>> # Create generator
        >>> generator = SMOTEGenerator(
        ...     target_column="target",
        ...     k_neighbors=5,
        ...     random_state=42
        ... )
        >>>
        >>> # Fit and generate synthetic rows
        >>> generator.fit(X, discrete_features)
        >>> X_syn = generator.generate(1000)
    """

    name = "smote"
    needs_target_column = True

    def __init__(
        self,
        target_column: str,
        k_neighbors: int = 5,
        n_jobs: int = -1,
        random_state: int = 0,
        **kwargs,
    ):
        # Own attributes are set before delegating to the base initializer,
        # which receives the seed and any remaining keyword arguments.
        self.target_column = target_column
        self.k_neighbors = k_neighbors
        self.n_jobs = n_jobs
        super().__init__(random_state=random_state, **kwargs)

    def _fit_model(
        self, X: pd.DataFrame, discrete_features: list, X_val: pd.DataFrame = None
    ):
        """Store the training data and derive the labels that drive SMOTE.

        SMOTE has no trainable parameters, so "fitting" only records the
        feature frame, the (pseudo) class labels and the split of the columns
        into discrete and numerical features.

        Args:
            X: Training data, including the target column.
            discrete_features: Names of the discrete columns in ``X``. The
                task is treated as classification iff the target column is
                listed here.
            X_val: Unused; accepted for interface compatibility.
        """
        self.is_classification = self.target_column in discrete_features
        self.X = X.copy()

        if not self.is_classification:
            # Pseudo outcome similar to the TabDDPM paper: split the
            # continuous target at its median. The real target stays in
            # ``self.X`` so that it is interpolated like any other numerical
            # feature and is present in the generated rows.
            target = self.X[self.target_column]
            self.y_train = pd.Series(np.where(target > np.median(target), 1, 0))
        else:
            # The discrete target is the class label itself; remove it from
            # the feature frame and re-attach it after resampling.
            self.y_train = self.X[self.target_column]
            self.X = self.X.drop(columns=[self.target_column])

        self.discrete_features = [x for x in self.X.columns if x in discrete_features]
        self.numerical_features = [
            x for x in self.X.columns if x not in discrete_features
        ]

    def _generate_data(self, n: int):
        """Generate ``n`` synthetic rows by SMOTE oversampling.

        Every class is oversampled proportionally to its frequency so that
        exactly ``n`` new rows are produced; the original rows returned by
        the resampler are discarded and the synthetic ones are shuffled.

        Args:
            n: Number of synthetic rows to generate.

        Returns:
            pd.DataFrame: The synthetic rows, with a fresh ``0..n-1`` index.
            For classification the target column is appended as the last
            column.

        Raises:
            ValueError: If a class that has to be oversampled contains fewer
                than 2 samples.
        """
        # Pick the SMOTE variant matching the available feature types.
        if self.numerical_features:
            smote_cls = SMOTENC if self.discrete_features else SMOTE
        else:
            smote_cls = SMOTEN

        # Target size per class: the current size scaled by (1 + n / N).
        frac_samples = n / self.X.shape[0]
        class_counts = {
            k: int(np.sum(self.y_train == k)) for k in np.unique(self.y_train)
        }
        sampling_strategy = {
            k: int((1 + frac_samples) * count) for k, count in class_counts.items()
        }

        # Truncation to int can leave the total short of n; give the remainder
        # to one randomly chosen class.
        # NOTE(review): this and the shuffle below use NumPy's global RNG, not
        # self.random_state - confirm whether the base class seeds it.
        diff = sum(sampling_strategy.values()) - self.y_train.shape[0]
        if diff != n:
            c = np.random.choice(list(sampling_strategy.keys()), 1).item()
            sampling_strategy[c] += n - diff
        assert sum(sampling_strategy.values()) - self.y_train.shape[0] == n

        # SMOTE needs at least 2 samples in any class it will oversample.
        # Classes that receive no new samples are skipped by the resampler,
        # so they neither trigger the error nor restrict the neighbor count.
        oversampled_counts = [
            class_counts[k]
            for k, target_size in sampling_strategy.items()
            if target_size > class_counts[k]
        ]
        if oversampled_counts:
            min_count = min(oversampled_counts)
            if min_count < 2:
                raise ValueError(
                    f"SMOTE cannot run: smallest class has {min_count} sample(s). "
                    "Need at least 2."
                )
            # cap k_neighbors so that k_neighbors <= min_count - 1
            k_eff = min(self.k_neighbors, min_count - 1)
        else:
            k_eff = self.k_neighbors

        # When an estimator (rather than an int) is passed as ``k_neighbors``,
        # the resampler uses it as-is and the neighbor query on the class's
        # own samples returns each sample as its own first neighbor. One extra
        # neighbor is therefore requested so that ``k_eff`` genuine neighbors
        # remain (and at least one when the class has only 2 samples).
        nearest_neighbors = NearestNeighbors(
            n_neighbors=k_eff + 1, n_jobs=self.n_jobs
        )

        params = {
            "sampling_strategy": sampling_strategy,
            "random_state": self.random_state,
            "k_neighbors": nearest_neighbors,
        }
        if self.discrete_features and self.numerical_features:
            # Only the mixed-type variant (SMOTENC) takes this argument.
            params["categorical_features"] = [
                x for x in self.discrete_features if x != self.target_column
            ]

        self.smote = smote_cls(**params)
        syn_X, syn_y = self.smote.fit_resample(self.X, self.y_train)

        # The resampler returns the real rows first; keep only the synthetic
        # tail. Positional slicing avoids any dependence on index labels.
        syn_X = syn_X.iloc[self.X.shape[0]:]
        syn_y = syn_y.iloc[self.y_train.shape[0]:]

        # Shuffle so the rows are not grouped by class.
        idx = np.random.permutation(len(syn_X))
        syn_X = syn_X.iloc[idx].reset_index(drop=True)
        syn_y = syn_y.iloc[idx].reset_index(drop=True)

        if self.is_classification:
            # Append y only if the target is discrete; for regression the
            # labels were a pseudo target and the real one is already in syn_X.
            syn_y = pd.Series(syn_y, name=self.target_column)
            syn_X = pd.concat([syn_X, syn_y], axis=1)

        if self.discrete_features:
            syn_X[self.discrete_features] = syn_X[self.discrete_features].astype(int)

        return syn_X