-
Notifications
You must be signed in to change notification settings - Fork 89
Open
Description
Here is my code:
import pandas as pd
from DataSynthesizer.DataDescriber import DataDescriber
from DataSynthesizer.DataGenerator import DataGenerator
from DataSynthesizer.lib.utils import display_bayesian_network
# Path of the training CSV to synthesize from.
input_data_file = 'C:\\Users\\DAIZO\\Documents\\Python\\thyroid-cancer-dataset-2\\dataset\\data_train.csv'
# Read the CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer=input_data_file)
# Keep an independent (deep) copy so the loaded frame can be restored later.
data_backup = data.copy(deep=True)
# Column names to flag as categorical, grouped by the prefix they carry.
# Insertion order matches the original literal: binary, ordinal, nominal, "N".
_BINARY_COLUMNS = (
    "binary__SEXE",
    "binary__NIVEAU_INSTRUC",
    "binary__SECURITE_SOCIALE",
    "binary__ACTIVITE_VIGOUREUSE",
    "binary__ATCD_PER_KC",
    "binary__MCV_FAM",
    "binary__M",
    "binary__INVASION_VASCULAIRE",
    "binary__MULTIFOCALITE",
    "binary__META",
    "binary__MALADIE_CV",
    "binary__Tabagisme",
    "binary__Alcoolisme",
    "binary__Papillaire",
    "binary__Medullaire",
    "binary__Vésiculaire",
    "binary__b_ETE",
)
_ORDINAL_COLUMNS = (
    "ordinal__REVENU_ANNUEL",
    "ordinal__RISQUE_RECIDIVE_ATA",
    "ordinal__YEAR_CHIRURGIE",
    "ordinal__T",
    "ordinal__DOSE_CUMULEE_IODE",
    "ordinal__ETE",
    "ordinal__RISK_AJCC8",
)
_NOMINAL_COLUMNS = (
    "nominal__STATUT_MATRIMONIAL Marié",
    "nominal__STATUT_MATRIMONIAL Célibataire",
    "nominal__STATUT_MATRIMONIAL Divorcé",
    "nominal__STATUT_MATRIMONIAL Veuf",
    "nominal__ACTIVITE_POFESSIONNELLE Employé",
    "nominal__ACTIVITE_POFESSIONNELLE Indépendant",
    "nominal__ACTIVITE_POFESSIONNELLE Maître (sse) de maison",
    "nominal__ACTIVITE_POFESSIONNELLE Retraité(e)",
    "nominal__ACTIVITE_POFESSIONNELLE Étudiant",
    "nominal__ACTIVITE_POFESSIONNELLE Chômeur (se)",
    "nominal__TYPE_HISTOLOGIQUE NIFT",
    "nominal__TYPE_HISTOLOGIQUE Tumeur vesiculaire à potentiel de malignité incertain",
    "nominal__TYPE_HISTOLOGIQUE Papillaire",
    "nominal__TYPE_HISTOLOGIQUE Vésiculaire",
    "nominal__TYPE_HISTOLOGIQUE Peu différencié",
    "nominal__TYPE_HISTOLOGIQUE Anaplasique",
    "nominal__TYPE_HISTOLOGIQUE Medullaire",
    "nominal__MALADIE_CV_CONNUE Non",
    "nominal__MALADIE_CV_CONNUE cardiopathie ischémique",
    "nominal__MALADIE_CV_CONNUE Insuffisance cardiaque",
    "nominal__MALADIE_CV_CONNUE Maladie rythmique",
    "nominal__MALADIE_CV_CONNUE AOMI",
    "nominal__MALADIE_CV_CONNUE Maladie rythmique+ IC",
    "nominal__MALADIE_CV_CONNUE TVP",
    "nominal__MALADIE_CV_CONNUE AVC",
    "nominal__TABAC_STAT Jamais",
    "nominal__TABAC_STAT Actif",
    "nominal__TABAC_STAT Ancien",
    "nominal__ALCOOL_STATUS Jamais",
    "nominal__ALCOOL_STATUS Actif",
    "nominal__ALCOOL_STATUS Ancien",
    "nominal__RISK_DYNAMIQ Excellente réponse",
    "nominal__RISK_DYNAMIQ Excellente Réponse indeterminée",
    "nominal__RISK_DYNAMIQ Excellente Réponse biologique incomplète",
    "nominal__RISK_DYNAMIQ Excellente Réponse radiologique incomplète",
)

# Every listed column maps to True; "N" is the only entry without a prefix.
categorical_attributes = dict.fromkeys(
    (*_BINARY_COLUMNS, *_ORDINAL_COLUMNS, *_NOMINAL_COLUMNS, "N"),
    True,
)
# Settings forwarded to the describer and generator calls below.
epsilon = 0.1                    # passed as `epsilon=` to the describer
degree_of_bayesian_network = 2   # passed as `k=` to the describer
num_tuples_to_generate = 1000    # row count passed to the generator

# Initialize DataDescriber with category threshold
describer = DataDescriber(category_threshold=5)

# Describe the dataset in correlated-attribute mode, flagging the columns
# listed in `categorical_attributes` as categorical.
describer.describe_dataset_in_correlated_attribute_mode(
    dataset_file=input_data_file,
    epsilon=epsilon,
    k=degree_of_bayesian_network,
    attribute_to_is_categorical=categorical_attributes,
)

# Write the description to disk; the same path is handed to the generator.
description_file = 'C:\\Users\\DAIZO\\Documents\\Python\\thyroid-cancer-dataset-2\\Notebooks\\models\\out.json'
describer.save_dataset_description_to_file(description_file)

# This statement must NOT end in a backslash: a trailing `\` joins it with
# the next line and the script no longer parses.
display_bayesian_network(describer.bayesian_network)

# NOTE(review): the reported traceback fails inside the generator at
# `eval(parents_instance)` with "NameError: name 'np' is not defined".
# Presumably the saved description contains keys rendered with NumPy >= 2
# scalar reprs such as `np.int64(1)` -- confirm by inspecting out.json. If so,
# pinning numpy < 2 (or using a DataSynthesizer release that handles this)
# and regenerating the description is the likely remedy.
generator = DataGenerator()
generator.generate_dataset_in_correlated_attribute_mode(num_tuples_to_generate, description_file)

# Save synthetic data to a CSV file
synthetic_data_file = 'synthetic__data.csv'
generator.save_synthetic_data(synthetic_data_file)
Here is the error I got:
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[7], [line 2](vscode-notebook-cell:?execution_count=7&line=2)
1 generator = DataGenerator()
----> [2](vscode-notebook-cell:?execution_count=7&line=2) generator.generate_dataset_in_correlated_attribute_mode(num_tuples_to_generate, description_file)
3 # Save synthetic data to a CSV file
4 synthetic_data_file = 'synthetic_retail_data.csv'
File c:\Users\DAIZO\miniconda3\envs\BRAF_lab\Lib\site-packages\DataSynthesizer\DataGenerator.py:65, in DataGenerator.generate_dataset_in_correlated_attribute_mode(self, n, description_file, seed)
63 all_attributes = self.description['meta']['all_attributes']
64 candidate_keys = set(self.description['meta']['candidate_keys'])
---> [65](file:///C:/Users/DAIZO/miniconda3/envs/BRAF_lab/Lib/site-packages/DataSynthesizer/DataGenerator.py:65) self.encoded_dataset = DataGenerator.generate_encoded_dataset(self.n, self.description)
66 self.synthetic_dataset = DataFrame(columns=all_attributes)
67 for attr in all_attributes:
File c:\Users\DAIZO\miniconda3\envs\BRAF_lab\Lib\site-packages\DataSynthesizer\DataGenerator.py:99, in DataGenerator.generate_encoded_dataset(n, description)
97 for parents_instance in child_conditional_distributions.keys():
98 dist = child_conditional_distributions[parents_instance]
---> [99](file:///C:/Users/DAIZO/miniconda3/envs/BRAF_lab/Lib/site-packages/DataSynthesizer/DataGenerator.py:99) parents_instance = list(eval(parents_instance))
101 filter_condition = ''
102 for parent, value in zip(parents, parents_instance):
File <string>:1
NameError: name 'np' is not defined
Metadata
Metadata
Assignees
Labels
No labels