From a1b5886e9fce4cfeebfccaaa3dc63342b8e927ea Mon Sep 17 00:00:00 2001 From: Surbhi Rathore Date: Fri, 11 Jul 2025 20:40:29 +0000 Subject: [PATCH 01/22] changed the name of dollarstreet research prompt in json file to dollarstreet_finetunebalance --- fairnessBench/benchmarks/tasks.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fairnessBench/benchmarks/tasks.json b/fairnessBench/benchmarks/tasks.json index 45ef804..d45470a 100644 --- a/fairnessBench/benchmarks/tasks.json +++ b/fairnessBench/benchmarks/tasks.json @@ -412,7 +412,7 @@ "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a best performing model for the synthetic adult data. Your goal is to Iterate over different models and maximize predictive performance. Use execute script action to verify if the goal is achieved. Do not inspect any CSV files directly.", "benchmark_folder_name": "sampadult-eod-sex" }, - "dollarstreet_income": { + "dollarstreet_finetunebalance": { "research_problem": "Go through the data_description.txt file to understand the dataset and its features, then summarize key points in your research logs to track your actions. Update the provided train.py script to fine-tune a pretrained image classifier with a focus on improving fairness across income levels. You may experiment with freezing/unfreezing strategies, learning rates, adjusting model architecture and fairness-oriented preprocessing or optimizing loss functions to achieve this goal. Do not inspect any data (csv or image) files directly. 
Use the execute script action to evaluate whether the modifications improve fairness outcomes.", "benchmark_folder_name": "dollarstreet-income" } From 25df3f7e0b591aa15f9b782dfd67d9307ecb0556 Mon Sep 17 00:00:00 2001 From: Surbhi Rathore Date: Sat, 1 Nov 2025 03:03:18 +0000 Subject: [PATCH 02/22] Fixed env_readonly_files function in environment.py and modified dollarstreet env_read_only_files accordingly --- .../scripts/env_read_only_files.txt | 6 ++++++ .../dollarstreet-income/scripts/read_only_files.txt | 4 ++-- fairnessBench/environment.py | 10 +++++----- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/fairnessBench/benchmarks/dollarstreet-income/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/dollarstreet-income/scripts/env_read_only_files.txt index e69de29..e1cd988 100644 --- a/fairnessBench/benchmarks/dollarstreet-income/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/dollarstreet-income/scripts/env_read_only_files.txt @@ -0,0 +1,6 @@ +assets/* +./images_v2_imagenet_test.csv +./topics_to_imagenet_classes_map.json +./README.md +./images_v2.csv +./images_v2_imagenet_train.csv \ No newline at end of file diff --git a/fairnessBench/benchmarks/dollarstreet-income/scripts/read_only_files.txt b/fairnessBench/benchmarks/dollarstreet-income/scripts/read_only_files.txt index 4f3d041..fba2668 100644 --- a/fairnessBench/benchmarks/dollarstreet-income/scripts/read_only_files.txt +++ b/fairnessBench/benchmarks/dollarstreet-income/scripts/read_only_files.txt @@ -1,6 +1,6 @@ -./assets/* +assets/* ./images_v2_imagenet_test.csv ./topics_to_imagenet_classes_map.json ./README.md ./images_v2.csv -./images_v2_imagenet_train.csv \ No newline at end of file +./images_v2_imagenet_train.csv diff --git a/fairnessBench/environment.py b/fairnessBench/environment.py index b111272..2789ae8 100644 --- a/fairnessBench/environment.py +++ b/fairnessBench/environment.py @@ -200,13 +200,13 @@ def _initialize_task_env(self): llm_ignore_files = 
open(os.path.join(benchmark_dir, "scripts", "env_read_only_files.txt"), "r").read().split("\n") for path, subdirs, files in os.walk(os.path.join(work_dir)): - # relpath = os.path.relpath(path, work_dir) + relpath = os.path.relpath(path, work_dir) # filter out the files that are read only - # filenames = [os.path.join(relpath, filename) for filename in files] + filenames = [os.path.join(relpath, filename) for filename in files] for llm_ignore_file in llm_ignore_files: - # llm_ignore_filenames = [n for n in filenames if fnmatch.fnmatch(n, llm_ignore)] - if llm_ignore_file in files: - self.env_read_only_files.append(llm_ignore_file) + llm_ignore_filenames = [n for n in filenames if fnmatch.fnmatch(n, llm_ignore_file)] + #if llm_ignore_file in files: + self.env_read_only_files.extend(llm_ignore_filenames) # init backup folder and remove all content if it exists From 43c34b70b234e35487d109831658f9c24f5f457c Mon Sep 17 00:00:00 2001 From: Surbhi Rathore Date: Sat, 1 Nov 2025 03:07:38 +0000 Subject: [PATCH 03/22] Added new task: health_di_race --- .../health-di-race/env/data_description.txt | 126 ++++++++++++++++++ .../benchmarks/health-di-race/env/train.py | 44 ++++++ .../scripts/env_read_only_files.txt | 4 + .../benchmarks/health-di-race/scripts/eval.py | 45 +++++++ .../health-di-race/scripts/prepare.py | 51 +++++++ .../scripts/read_only_files.txt | 4 + fairnessBench/benchmarks/tasks.json | 8 +- 7 files changed, 280 insertions(+), 2 deletions(-) create mode 100644 fairnessBench/benchmarks/health-di-race/env/data_description.txt create mode 100644 fairnessBench/benchmarks/health-di-race/env/train.py create mode 100644 fairnessBench/benchmarks/health-di-race/scripts/env_read_only_files.txt create mode 100644 fairnessBench/benchmarks/health-di-race/scripts/eval.py create mode 100644 fairnessBench/benchmarks/health-di-race/scripts/prepare.py create mode 100644 fairnessBench/benchmarks/health-di-race/scripts/read_only_files.txt diff --git 
a/fairnessBench/benchmarks/health-di-race/env/data_description.txt b/fairnessBench/benchmarks/health-di-race/env/data_description.txt new file mode 100644 index 0000000..8f24250 --- /dev/null +++ b/fairnessBench/benchmarks/health-di-race/env/data_description.txt @@ -0,0 +1,126 @@ +patient-age: Integer + +lifestyle : Categorical feature + Private + Self-emp-not-inc + Self-emp-inc + Federal-gov + Local-gov + State-gov + Without-pay + Never-worked + +insurance_provider : Categorical + Education Level + Bachelors + Some-college + 11th + HS-grad + Prof-school + Assoc-acdm + Assoc-voc + 9th + 7th-8th + 12th + Masters + 1st-4th + 10th + Doctorate + 5th-6th + Preschool + +marital-status : Categorical + Married-civ-spouse, + Divorced, + Never-married, + Separated, + Widowed, + Married-spouse-absent, + Married-AF-spouse + +procedure_on_the_visit : Categorical + Tech-support, + Craft-repair, + Other-service, + Sales, + Exec-managerial, + Prof-specialty, + Handlers-cleaners, + Machine-op-inspct, + Adm-clerical, + Farming-fishing, + Transport-moving, + Priv-house-serv, + Protective-serv, + Armed-Forces + +relationship : Categorical + Wife, + Own-child, + Husband, + Not-in-family, + Other-relative, + Unmarried + +race : Categorical + White, + Asian-Pac-Islander, + Amer-Indian-Eskimo, + Other, + Black + +gender : Binary + Female + Male + +cholesterol : Integer + +blood-pressure : Integer + +daily-activity-hours : Integer + +reason-for-visit : Categorical + United-States, + Cambodia, + England, + Puerto-Rico, + Canada, + Germany, + Outlying-US(Guam-USVI-etc), + India, + Japan, + Greece, + South, + China, + Cuba, + Iran, + Honduras, + Philippines, + Italy, + Poland, + Jamaica, + Vietnam, + Mexico, + Portugal, + Ireland, + France, + Dominican-Republic, + Laos, Ecuador, + Taiwan, + Haiti, + Columbia, + Hungary, + Guatemala, + Nicaragua, + Scotland, + Thailand, + Yugoslavia, + El-Salvador, + Trinadad&Tobago, + Peru, + Hong, + Holand-Netherlands + +health-risk : binary target + 1 + 0 \ 
No newline at end of file diff --git a/fairnessBench/benchmarks/health-di-race/env/train.py b/fairnessBench/benchmarks/health-di-race/env/train.py new file mode 100644 index 0000000..578e4c3 --- /dev/null +++ b/fairnessBench/benchmarks/health-di-race/env/train.py @@ -0,0 +1,44 @@ +import numpy as np +import pandas as pd +from aif360.datasets import BinaryLabelDataset +from aif360.metrics import ClassificationMetric +from sklearn.linear_model import LogisticRegression + + +train_X = pd.read_csv("train_X.csv", index_col=0) +train_y = pd.read_csv("train_y.csv", index_col=0) +test_X = pd.read_csv("test_X.csv", index_col=0) +test_y = pd.read_csv("test_y.csv", index_col=0) + +lr = LogisticRegression(max_iter=1000) +lr.fit(train_X, train_y) + +pred_y = lr.predict(test_X) + +prot_attr = test_X['race'] + +output = pd.DataFrame({"Actual_y": test_y.values.ravel(), "Predicted_y": pred_y, "race" : prot_attr}) + +privileged_groups = [{'race': 1}] +unprivileged_groups = [{'race': 0}] + +binaryLabelDataset = BinaryLabelDataset(favorable_label=1, unfavorable_label=0, + df=output, label_names=['Actual_y'], + protected_attribute_names=['race']) + +binaryLabelDataset_pred = binaryLabelDataset.copy() +binaryLabelDataset_pred.labels = output['Predicted_y'].values.reshape(-1, 1) + +metric = ClassificationMetric(binaryLabelDataset, binaryLabelDataset_pred, + unprivileged_groups=unprivileged_groups, + privileged_groups=privileged_groups) + +accuracy = metric.accuracy() + +disparate_impact = metric.disparate_impact() + +print('accuracy score:', accuracy) +print('disparate impact ratio', disparate_impact) + +output.to_csv("submission.csv", index=False) + diff --git a/fairnessBench/benchmarks/health-di-race/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/health-di-race/scripts/env_read_only_files.txt new file mode 100644 index 0000000..ecc89d9 --- /dev/null +++ b/fairnessBench/benchmarks/health-di-race/scripts/env_read_only_files.txt @@ -0,0 +1,4 @@ +train_X.csv +train_y.csv 
+test_X.csv +test_y.csv diff --git a/fairnessBench/benchmarks/health-di-race/scripts/eval.py b/fairnessBench/benchmarks/health-di-race/scripts/eval.py new file mode 100644 index 0000000..88dd899 --- /dev/null +++ b/fairnessBench/benchmarks/health-di-race/scripts/eval.py @@ -0,0 +1,45 @@ +import os +import numpy as np +import pandas as pd +from aif360.datasets import BinaryLabelDataset +from aif360.metrics import ClassificationMetric + +def get_score(agent_files): + submission_path = os.path.join(agent_files, "submission.csv") + submission = pd.read_csv(submission_path) + + privileged_groups = [{'race': 1}] + unprivileged_groups = [{'race': 0}] + + binaryLabelDataset = BinaryLabelDataset(favorable_label=1, unfavorable_label=0, + df=submission, label_names=['Actual_y'], + protected_attribute_names=['race']) + + binaryLabelDataset_pred = binaryLabelDataset.copy() + binaryLabelDataset_pred.labels = submission['Predicted_y'].values.reshape(-1, 1) + + metric = ClassificationMetric(binaryLabelDataset, binaryLabelDataset_pred, + unprivileged_groups=unprivileged_groups, + privileged_groups=privileged_groups) + + accuracy = metric.accuracy() + precision = metric.precision() + recall = metric.true_positive_rate() + + disparate_impact = metric.disparate_impact() + statistical_parity_diff = metric.statistical_parity_difference() + equal_opp_diff = metric.equal_opportunity_difference() + error_rate_diff = metric.error_rate_difference() + error_rate_ratio = metric.error_rate_ratio() + false_omission_rate_diff = metric.false_omission_rate_difference() + + performance = {"acc": accuracy, "precision": precision, "recall": recall, "di": disparate_impact, + "statistical_parity_diff": statistical_parity_diff, "equal_opp_diff": equal_opp_diff, + "error_rate_diff": error_rate_diff, "error_rate_ratio" : error_rate_ratio, + "false_omission_rate_diff": false_omission_rate_diff} + return performance + + +if __name__ == "__main__": + print(get_score()) + \ No newline at end of file diff --git 
a/fairnessBench/benchmarks/health-di-race/scripts/prepare.py b/fairnessBench/benchmarks/health-di-race/scripts/prepare.py new file mode 100644 index 0000000..fa681b3 --- /dev/null +++ b/fairnessBench/benchmarks/health-di-race/scripts/prepare.py @@ -0,0 +1,51 @@ +# prepared by user when setting up + +import os +import numpy as np +import pandas as pd +from ucimlrepo import fetch_ucirepo +from sklearn.preprocessing import MinMaxScaler +from sklearn.model_selection import train_test_split + +adult = fetch_ucirepo(id=2) + +X = adult.data.features.copy() +y = adult.data.targets.copy() + +y["income"] = y["income"].str.rstrip('.') +y["income"] = y["income"].apply(lambda i: 1 if i == '>50K' else 0) +X["race"] = X["race"].apply(lambda i: 1 if i == 'White' else 0) +X = X.replace('?', np.nan) +#print((X.head())) +cholesterol = MinMaxScaler(feature_range=(100, 300)) +X['cholesterol'] = cholesterol.fit_transform(X[['capital-gain']]) + +bp = MinMaxScaler(feature_range=(80, 180)) +X['blood-pressure'] = bp.fit_transform(X[['capital-loss']]) + +activity = MinMaxScaler(feature_range=(0, 24)) +X['daily-activity-hours'] = activity.fit_transform(X[['hours-per-week']]) + +X.drop(['capital-gain', 'capital-loss', 'hours-per-week', 'fnlwgt'], axis=1, inplace=True) +#print((X.head())) +#print((X.columns)) + +X.rename(columns={'age': 'patient-age', 'workclass': 'lifestyle', 'education': 'insurance_provider', 'occupation': 'pocedure_on_the_visite', 'sex':'gender', 'native-country': 'reason_for_visit'}, inplace=True) +y.rename(columns={'income': 'health-risk'}, inplace = True) + +# print((X.columns)) +# print((y.columns)) + +categorical_columns = ["lifestyle", "insurance_provider", "marital-status", "pocedure_on_the_visite", "relationship", + "gender", "reason_for_visit"] + +X_enc = pd.get_dummies(X, columns=categorical_columns) +X_enc = X_enc.replace({True: 1, False: 0}) + +train_X, test_X, train_y, test_y = train_test_split(X_enc, y, test_size=0.2, random_state=1) + +os.chdir("../env/") 
+train_X.to_csv("train_X.csv") +train_y.to_csv("train_y.csv") +test_X.to_csv("test_X.csv") +test_y.to_csv("test_y.csv") \ No newline at end of file diff --git a/fairnessBench/benchmarks/health-di-race/scripts/read_only_files.txt b/fairnessBench/benchmarks/health-di-race/scripts/read_only_files.txt new file mode 100644 index 0000000..5ee277f --- /dev/null +++ b/fairnessBench/benchmarks/health-di-race/scripts/read_only_files.txt @@ -0,0 +1,4 @@ +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/tasks.json b/fairnessBench/benchmarks/tasks.json index d45470a..12af2cd 100644 --- a/fairnessBench/benchmarks/tasks.json +++ b/fairnessBench/benchmarks/tasks.json @@ -415,5 +415,9 @@ "dollarstreet_finetunebalance": { "research_problem": "Go through the data_description.txt file to understand the dataset and its features, then summarize key points in your research logs to track your actions. Update the provided train.py script to fine-tune a pretrained image classifier with a focus on improving fairness across income levels. You may experiment with freezing/unfreezing strategies, learning rates, adjusting model architecture and fairness-oriented preprocessing or optimizing loss functions to achieve this goal. Do not inspect any data (csv or image) files directly. Use the execute script action to evaluate whether the modifications improve fairness outcomes.", "benchmark_folder_name": "dollarstreet-income" - } -} \ No newline at end of file + }, + "health_di_balance-race": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model that best balances accuracy and fairness with respect to disparate impact across race. 
Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and disparate impact. Do not inspect any CSV files directly. Use the execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "health-di-race" + } +} From 4d258ec2eb3d66a61b3f96aa126ee9fbfc790b11 Mon Sep 17 00:00:00 2001 From: Surbhi Rathore Date: Sat, 1 Nov 2025 03:08:45 +0000 Subject: [PATCH 04/22] dollarstreet: modified train.py to correct relative path --- fairnessBench/benchmarks/dollarstreet-income/env/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fairnessBench/benchmarks/dollarstreet-income/env/train.py b/fairnessBench/benchmarks/dollarstreet-income/env/train.py index f060560..f56a1cf 100644 --- a/fairnessBench/benchmarks/dollarstreet-income/env/train.py +++ b/fairnessBench/benchmarks/dollarstreet-income/env/train.py @@ -45,8 +45,8 @@ def __getitem__(self, idx): train_csv = "./images_v2_imagenet_train.csv" test_csv = "./images_v2_imagenet_test.csv" -train_dataset = DollarStreetDataset(train_csv, ".", transform) -test_dataset = DollarStreetDataset(test_csv, ".", transform) +train_dataset = DollarStreetDataset(csv_path=train_csv, root_dir=".", transform=transform) +test_dataset = DollarStreetDataset(csv_path = test_csv, root_dir=".", transform=transform) num_classes = len(train_dataset.synset2idx) train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4) test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=4) From c045b80c92cdbb075d782f0ceb03946e3b5ceff5 Mon Sep 17 00:00:00 2001 From: Surbhi Rathore Date: Sat, 1 Nov 2025 03:21:15 +0000 Subject: [PATCH 05/22] runner.py: printing only the first 10 items of read_only lists as they might be large --- fairnessBench/runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fairnessBench/runner.py b/fairnessBench/runner.py index 
bd7b4b7..7437428 100644 --- a/fairnessBench/runner.py +++ b/fairnessBench/runner.py @@ -29,8 +29,8 @@ def run(agent_cls, args): print("Research problem: ", research_problem) print("Lower level actions enabled: ", [action.name for action in env.low_level_actions]) print("High level actions enabled: ", [action.name for action in env.high_level_actions]) - print("Read only files: ", env.read_only_files, file=sys.stderr) - print("Env read only files: ", env.env_read_only_files, file=sys.stderr) + print("Read only files: ", env.read_only_files if len(env.read_only_files) < 10 else env.read_only_files[:10], file=sys.stderr) + print("Env read only files: ", env.env_read_only_files if len(env.env_read_only_files) < 10 else env.env_read_only_files[:10], file=sys.stderr) print("=====================================") # AS: Create agent object from whichever agent was requested in agrs From 6333d38f99a263a28fe80485dc982968d9b9ba27 Mon Sep 17 00:00:00 2001 From: Surbhi Rathore Date: Sat, 1 Nov 2025 03:33:56 +0000 Subject: [PATCH 06/22] multi_run_experiment: Now have a base path to which logs would be saved to --- multi_run_experiment.sh | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/multi_run_experiment.sh b/multi_run_experiment.sh index 322e994..da6e8b1 100644 --- a/multi_run_experiment.sh +++ b/multi_run_experiment.sh @@ -5,6 +5,9 @@ # For every task this script will be run at least 3 times; model, retrival, agent/s # This scrip calls on the runner.py +# Base path depends on where we want to place out logs (base log folder) (work/scratch/project/...) 
+base="/scratch3/workspace/ayman_sandouk_uri_edu-fairness/fairnessBench/" + # grab preliminary info exp_path=$1 task=$2 @@ -21,12 +24,11 @@ do shift done - - extra_args="${@}" folder=$exp_path python=$(which python) + echo "exp_path: $exp_path" echo "task: $task" echo "n_devices: $n_device" @@ -40,21 +42,21 @@ for i in "${devices[@]}" do # time in current Unix timestamp ts=$(date +%s) - + echo "Run: #$ts" # Check for log folder with a time-named folder in it or create one - if [ -d "/project/pi_brownsarahm_uri_edu/ayman_uri/fairnessBench/new_$folder/$ts" ]; then - echo "Folder /project/pi_brownsarahm_uri_edu/ayman_uri/fairnessBench/new_$folder/$ts already exists. removing it" - rm -rf /project/pi_brownsarahm_uri_edu/ayman_uri/fairnessBench/new_$folder/$ts + if [ -d "$base/$folder/$ts" ]; then + echo "Folder $base/$folder/$ts already exists. removing it" + rm -rf $base/$folder/$ts fi - mkdir -p "/project/pi_brownsarahm_uri_edu/ayman_uri/fairnessBench/new_$folder/$ts" + mkdir -p "$base/$folder/$ts" # Call the prepare task script python -u -m fairnessBench.prepare_task $task $python # Printing command for debugging purposes and executing task with runner.py - echo "python -u -m fairnessBench.runner --python $python --task $task --device $i --log-dir /project/pi_brownsarahm_uri_edu/ayman_uri/fairnessBench/new_$folder/$ts --work-dir /scratch3/workspace/ayman_sandouk_uri_edu-fairness/fairnessBench/workspaces/$folder/$ts ${extra_args}" > /project/pi_brownsarahm_uri_edu/ayman_uri/fairnessBench/new_$folder/$ts/log 2>&1 & + echo "python -u -m fairnessBench.runner --python $python --task $task --device $i --log-dir $base/$folder/$ts --work-dir $base/workspaces/$folder/$ts ${extra_args}" > $base/$folder/$ts/log 2>&1 & - eval "python -u -m fairnessBench.runner --python $python --task $task --device $i --log-dir /project/pi_brownsarahm_uri_edu/ayman_uri/fairnessBench/new_$folder/$ts --work-dir 
/scratch3/workspace/ayman_sandouk_uri_edu-fairness/fairnessBench/workspaces/$folder/$ts ${extra_args}" > /project/pi_brownsarahm_uri_edu/ayman_uri/fairnessBench/new_$folder/$ts/log 2>&1 & + eval "python -u -m fairnessBench.runner --python $python --task $task --device $i --log-dir $base/$folder/$ts --work-dir $base/workspaces/$folder/$ts ${extra_args}" > $base/$folder/$ts/log 2>&1 & # 2 seconds between runs sleep 2 From d8aa7b89104c09dc657f769c1c7a594ca7708a94 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Sun, 2 Nov 2025 03:10:43 +0000 Subject: [PATCH 07/22] LLM.py fixed stop sequence to not include observation without colons --- fairnessBench/LLM.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fairnessBench/LLM.py b/fairnessBench/LLM.py index df3b020..a6e2f19 100644 --- a/fairnessBench/LLM.py +++ b/fairnessBench/LLM.py @@ -606,13 +606,13 @@ def complete_text(prompt, log_file, model, device=0, **kwargs): if model.startswith("claude"): # use anthropic API - completion = complete_text_claude(prompt, stop_sequences=[anthropic.HUMAN_PROMPT,"Observation:", "Observation"], log_file=log_file, model=model, **kwargs) + completion = complete_text_claude(prompt, stop_sequences=[anthropic.HUMAN_PROMPT,"Observation:"], log_file=log_file, model=model, **kwargs) elif model.startswith("gemini"): - completion = complete_text_gemini(prompt, stop_sequences=["Observation:", "Observation"], log_file=log_file, model=model, **kwargs) + completion = complete_text_gemini(prompt, stop_sequences=["Observation:"], log_file=log_file, model=model, **kwargs) elif model.startswith("llama"): - completion = complete_text_hf(prompt, stop_sequences=["Observation:", "Observation"], log_file=log_file, model=model, device=device, **kwargs) + completion = complete_text_hf(prompt, stop_sequences=["Observation:"], log_file=log_file, model=model, device=device, **kwargs) elif model.startswith("qwen"): - completion = complete_text_qwen(prompt, stop_sequences=["Observation:", 
"Observation"], log_file=log_file, model=model, device=device, **kwargs) + completion = complete_text_qwen(prompt, stop_sequences=["Observation:"], log_file=log_file, model=model, device=device, **kwargs) elif model.startswith("granite"): completion = complete_text_granite(prompt, stop_sequences=["}"], log_file=log_file, model=model, device=device, **kwargs) elif model.startswith("deepseek"): @@ -621,10 +621,10 @@ def complete_text(prompt, log_file, model, device=0, **kwargs): completion = complete_text_gemma(prompt, stop_sequences=["}"], log_file=log_file, model=model, device=device, **kwargs) elif "/" in model: # use CRFM API since this specifies organization like "openai/..." - completion = complete_text_crfm(prompt, stop_sequences=["Observation:", "Observation"], log_file=log_file, model=model, **kwargs) + completion = complete_text_crfm(prompt, stop_sequences=["Observation:"], log_file=log_file, model=model, **kwargs) else: # use OpenAI API - completion = complete_text_openai(prompt, stop_sequences=["Observation:", "Observation"], log_file=log_file, model=model, **kwargs) + completion = complete_text_openai(prompt, stop_sequences=["Observation:"], log_file=log_file, model=model, **kwargs) return completion From 68b4085219c442644a27cd2c5f84be67ab745e5a Mon Sep 17 00:00:00 2001 From: Surbhi Rathore Date: Tue, 4 Nov 2025 06:14:45 +0000 Subject: [PATCH 08/22] prepare.py: scaling and renaming columns, domain change- adult to health care, and data_description: changed the values and col names wrt health care data --- .../health-di-race/env/data_description.txt | 165 +++++++++--------- .../health-di-race/scripts/prepare.py | 81 +++++++-- 2 files changed, 144 insertions(+), 102 deletions(-) diff --git a/fairnessBench/benchmarks/health-di-race/env/data_description.txt b/fairnessBench/benchmarks/health-di-race/env/data_description.txt index 8f24250..1ef9d2f 100644 --- a/fairnessBench/benchmarks/health-di-race/env/data_description.txt +++ 
b/fairnessBench/benchmarks/health-di-race/env/data_description.txt @@ -1,35 +1,33 @@ -patient-age: Integer +patient_age: Integer -lifestyle : Categorical feature - Private - Self-emp-not-inc - Self-emp-inc - Federal-gov - Local-gov - State-gov - Without-pay - Never-worked +lifestyle: Categorical feature + Sedentary + Moderately-active + Active + Highly-active + Unhealthy + Healthy + At-risk insurance_provider : Categorical - Education Level - Bachelors - Some-college - 11th - HS-grad - Prof-school - Assoc-acdm - Assoc-voc - 9th - 7th-8th - 12th - Masters - 1st-4th - 10th - Doctorate - 5th-6th - Preschool + UnitedHealthcare + Blue Cross Blue Shield + Aetna + Cigna + Humana + Kaiser Permanente + Anthem + Molina Healthcare + Centene Corporation + Highmark Health + Health Net + WellCare Health Plans + CareFirst + Independence Blue Cross + Fidelis Care + Oscar Health -marital-status : Categorical +marital-status: Categorical Married-civ-spouse, Divorced, Never-married, @@ -39,20 +37,20 @@ marital-status : Categorical Married-AF-spouse procedure_on_the_visit : Categorical - Tech-support, - Craft-repair, - Other-service, - Sales, - Exec-managerial, - Prof-specialty, - Handlers-cleaners, - Machine-op-inspct, - Adm-clerical, - Farming-fishing, - Transport-moving, - Priv-house-serv, - Protective-serv, - Armed-Forces + Administrative-check + Specialist-consultation + Minor-procedure + Major-surgery + General-checkup + Diagnostic-test + Rehabilitation + Emergency + Preventive-screening + Therapy-session + Follow-up + Mental-health-consultation + Chronic-disease-management + Vaccination relationship : Categorical Wife, @@ -80,47 +78,48 @@ blood-pressure : Integer daily-activity-hours : Integer reason-for-visit : Categorical - United-States, - Cambodia, - England, - Puerto-Rico, - Canada, - Germany, - Outlying-US(Guam-USVI-etc), - India, - Japan, - Greece, - South, - China, - Cuba, - Iran, - Honduras, - Philippines, - Italy, - Poland, - Jamaica, - Vietnam, - Mexico, - Portugal, 
- Ireland, - France, - Dominican-Republic, - Laos, Ecuador, - Taiwan, - Haiti, - Columbia, - Hungary, - Guatemala, - Nicaragua, - Scotland, - Thailand, - Yugoslavia, - El-Salvador, - Trinadad&Tobago, - Peru, - Hong, - Holand-Netherlands + General-checkup + Routine-follow-up + New-symptom + Fever + Cold-Flu-Cough + Headache-migraine + Heart-concerns + Respiratory-issue + Digestive-issue + Pain-related + Skin-concern + Eye + ENT + Mental-health-concerns + Stress + Fatigue-weakness + Weight-concerns + Diabetes/Bloodsugar + BP-concern + Cholesterol-lipid-concern + Allergy-symptoms + Prenatal + Gynecological-concern + Pediatrics + Age-related + Chronic-disease + Medication-side-effect + Injury + Trauma + Preventive-counseling + Genetic-risks + Vaccination-inquiry + Health-counseling + Substance-concern + Surgical-consultation + Specialist-referral + Discharge + Lab-result-discussion' + Second-opinion + Administrative-inquiry + Other -health-risk : binary target +health_risk : binary target 1 0 \ No newline at end of file diff --git a/fairnessBench/benchmarks/health-di-race/scripts/prepare.py b/fairnessBench/benchmarks/health-di-race/scripts/prepare.py index fa681b3..4c52895 100644 --- a/fairnessBench/benchmarks/health-di-race/scripts/prepare.py +++ b/fairnessBench/benchmarks/health-di-race/scripts/prepare.py @@ -5,6 +5,7 @@ import pandas as pd from ucimlrepo import fetch_ucirepo from sklearn.preprocessing import MinMaxScaler +from sklearn.preprocessing import LabelEncoder from sklearn.model_selection import train_test_split adult = fetch_ucirepo(id=2) @@ -16,33 +17,75 @@ y["income"] = y["income"].apply(lambda i: 1 if i == '>50K' else 0) X["race"] = X["race"].apply(lambda i: 1 if i == 'White' else 0) X = X.replace('?', np.nan) -#print((X.head())) -cholesterol = MinMaxScaler(feature_range=(100, 300)) -X['cholesterol'] = cholesterol.fit_transform(X[['capital-gain']]) -bp = MinMaxScaler(feature_range=(80, 180)) -X['blood-pressure'] = bp.fit_transform(X[['capital-loss']]) 
+ad = pd.concat([X, y], axis=1) +ad = ad.dropna() -activity = MinMaxScaler(feature_range=(0, 24)) -X['daily-activity-hours'] = activity.fit_transform(X[['hours-per-week']]) +cholesterol_scale = MinMaxScaler(feature_range=(100, 300)) +ad['cholesterol'] = cholesterol_scale.fit_transform(ad[['capital-gain']]) -X.drop(['capital-gain', 'capital-loss', 'hours-per-week', 'fnlwgt'], axis=1, inplace=True) -#print((X.head())) -#print((X.columns)) +bp_scale = MinMaxScaler(feature_range=(80, 180)) +ad['blood-pressure'] = bp_scale.fit_transform(ad[['capital-loss']]) -X.rename(columns={'age': 'patient-age', 'workclass': 'lifestyle', 'education': 'insurance_provider', 'occupation': 'pocedure_on_the_visite', 'sex':'gender', 'native-country': 'reason_for_visit'}, inplace=True) -y.rename(columns={'income': 'health-risk'}, inplace = True) +activity_scale = MinMaxScaler(feature_range=(0, 24)) +ad['daily-activity-hours'] = activity_scale.fit_transform(ad[['hours-per-week']]) -# print((X.columns)) -# print((y.columns)) +ad.drop(['capital-gain', 'capital-loss', 'hours-per-week', 'fnlwgt', 'education-num'], axis=1, inplace=True) -categorical_columns = ["lifestyle", "insurance_provider", "marital-status", "pocedure_on_the_visite", "relationship", - "gender", "reason_for_visit"] +occupation_to_procedure = {'Adm-clerical': 'Administrative-check', 'Exec-managerial': 'Specialist-consultation', + 'Handlers-cleaners': 'Minor-procedure', 'Prof-specialty': 'Major-surgery', + 'Other-service': 'General-checkup', 'Sales': 'Diagnostic-test', + 'Craft-repair': 'Rehabilitation', 'Transport-moving': 'Emergency', + 'Farming-fishing': 'Preventive-screening', 'Machine-op-inspct': 'Therapy-session', + 'Tech-support': 'Follow-up', 'Protective-serv': 'Mental-health-consultation', + 'Armed-Forces': 'Chronic-disease-management', 'Priv-house-serv': 'Vaccination'} -X_enc = pd.get_dummies(X, columns=categorical_columns) -X_enc = X_enc.replace({True: 1, False: 0}) -train_X, test_X, train_y, test_y = 
train_test_split(X_enc, y, test_size=0.2, random_state=1) +country_to_reason = {'United-States': 'General-checkup', 'Cuba': 'Routine-follow-up', 'Jamaica': 'New-symptom', + 'India': 'Fever', 'Mexico': 'Cold-Flu-Cough', 'South': 'Headache-migraine', + 'Puerto-Rico': 'Heart-concerns', 'Honduras': 'Respiratory-issue', 'England': 'Digestive-issue', + 'Canada': 'Pain-related', 'Germany': 'Skin-concern', 'Iran': 'Eye', 'Philippines': 'ENT', + 'Italy': 'Mental-health-concerns', 'Poland': 'Stress', 'Columbia': 'Fatigue-weakness', + 'Cambodia': 'Weight-concerns', 'Thailand': 'Diabetes/Bloodsugar', 'Ecuador': 'BP-concern', + 'Laos': 'Cholesterol-lipid-concern', 'Taiwan': 'Allergy-symptoms', 'Haiti': 'Prenatal', + 'Portugal': 'Gynecological-concern', 'Dominican-Republic': 'Pediatrics', + 'El-Salvador': 'Age-related', 'France': 'Chronic-disease', 'Guatemala': 'Medication-side-effect', + 'China': 'Injury', 'Japan': 'Trauma', 'Yugoslavia': 'Preventive-counseling', 'Peru': 'Genetic-risks', + 'Outlying-US(Guam-USVI-etc)': 'Vaccination-inquiry', 'Scotland': 'Health-counseling', + 'Trinadad&Tobago': 'Substance-concern', 'Greece': 'Surgical-consultation', + 'Nicaragua': 'Specialist-referral', 'Vietnam': 'Discharge', 'Hong': 'Lab-result-discussion', + 'Ireland': 'Second-opinion', 'Hungary': 'Administrative-inquiry', 'Holand-Netherlands': 'Other'} + +workclass_to_lifestyle = { 'Private':'Sedentary', 'Self-emp-not-inc':'Moderately-active', 'Local-gov':'Active', + 'State-gov': 'Highly-active', 'Self-emp-inc': 'Unhealthy', 'Federal-gov': 'Healthy', + 'Without-pay': 'At-risk'} + +education_to_insurance_provider = {'HS-grad':'UnitedHealthcare', 'Some-college': 'Blue Cross Blue Shield', 'Bachelors': 'Aetna', + 'Masters': 'Cigna', 'Assoc-voc': 'Humana', '11th': 'Kaiser Permanente', 'Assoc-acdm': 'Anthem', + '10th': 'Molina Healthcare', '7th-8th': 'Centene Corporation', 'Prof-school': 'Highmark Health', + '9th': 'Health Net', '12th':'WellCare Health Plans', 'Doctorate':'CareFirst', + 
'5th-6th': 'Independence Blue Cross', '1st-4th': 'Fidelis Care', 'Preschool': 'Oscar Health'} + +ad['procedure_on_the_day'] = ad['occupation'].map(occupation_to_procedure) +ad['reason_for_visit'] = ad['native-country'].map(country_to_reason) +ad['lifestyle'] = ad['workclass'].map(workclass_to_lifestyle) +ad['insurance_provider'] = ad['education'].map(education_to_insurance_provider) + +ad = ad.rename(columns={'age': 'patient_age', 'sex': 'gender', 'income': 'health_risk'}) + +ad.drop(['occupation', 'native-country', 'workclass', 'education'], axis=1, inplace=True) + +cat_cols = ['marital-status', 'race', 'relationship', 'gender', 'lifestyle', + 'insurance_provider', 'procedure_on_the_day', 'reason_for_visit'] + + +ad_enc = pd.get_dummies(ad, columns=cat_cols) +ad_enc = ad_enc.astype(int) + +adX = ad_enc.drop(columns=['health_risk']) +ady = ad_enc['health_risk'] + +train_X, test_X, train_y, test_y = train_test_split(adX, ady, test_size=0.2, random_state=1) os.chdir("../env/") train_X.to_csv("train_X.csv") From 2b2d07923da7eb91de25acf7728583e4e711548a Mon Sep 17 00:00:00 2001 From: AymanBx Date: Thu, 6 Nov 2025 04:29:35 +0000 Subject: [PATCH 09/22] Fixed env_read_only_files for all tasks --- .../adult-di-race/scripts/env_read_only_files.txt | 8 ++++---- .../adult-di-sex/scripts/env_read_only_files.txt | 8 ++++---- .../adult-eod-race/scripts/env_read_only_files.txt | 8 ++++---- .../adult-eod-sex/scripts/env_read_only_files.txt | 8 ++++---- .../adult-erd-race/scripts/env_read_only_files.txt | 8 ++++---- .../adult-erd-sex/scripts/env_read_only_files.txt | 8 ++++---- .../adult-err-race/scripts/env_read_only_files.txt | 8 ++++---- .../adult-err-sex/scripts/env_read_only_files.txt | 8 ++++---- .../adult-ford-race/scripts/env_read_only_files.txt | 8 ++++---- .../adult-ford-sex/scripts/env_read_only_files.txt | 8 ++++---- .../adult-spd-race/scripts/env_read_only_files.txt | 8 ++++---- .../adult-spd-sex/scripts/env_read_only_files.txt | 8 ++++---- 
.../scripts/env_read_only_files.txt | 8 ++++---- .../scripts/env_read_only_files.txt | 8 ++++---- .../scripts/env_read_only_files.txt | 8 ++++---- .../scripts/env_read_only_files.txt | 8 ++++---- .../dollarstreet-income/scripts/env_read_only_files.txt | 4 ++-- .../germancredit-di-sex/scripts/env_read_only_files.txt | 8 ++++---- .../germancredit-eod-sex/scripts/env_read_only_files.txt | 8 ++++---- .../health-di-race/scripts/env_read_only_files.txt | 8 ++++---- .../randoadult-di-race/scripts/env_read_only_files.txt | 8 ++++---- .../randoadult-di-sex/scripts/env_read_only_files.txt | 8 ++++---- .../randoadult-eod-race/scripts/env_read_only_files.txt | 8 ++++---- .../randoadult-eod-sex/scripts/env_read_only_files.txt | 8 ++++---- .../sampadult-di-race/scripts/env_read_only_files.txt | 8 ++++---- .../sampadult-di-sex/scripts/env_read_only_files.txt | 8 ++++---- .../sampadult-eod-race/scripts/env_read_only_files.txt | 8 ++++---- .../sampadult-eod-sex/scripts/env_read_only_files.txt | 8 ++++---- 28 files changed, 110 insertions(+), 110 deletions(-) diff --git a/fairnessBench/benchmarks/adult-di-race/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/adult-di-race/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/adult-di-race/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/adult-di-race/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/adult-di-sex/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/adult-di-sex/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/adult-di-sex/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/adult-di-sex/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git 
a/fairnessBench/benchmarks/adult-eod-race/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/adult-eod-race/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/adult-eod-race/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/adult-eod-race/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/adult-eod-sex/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/adult-eod-sex/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/adult-eod-sex/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/adult-eod-sex/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/adult-erd-race/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/adult-erd-race/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/adult-erd-race/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/adult-erd-race/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/adult-erd-sex/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/adult-erd-sex/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/adult-erd-sex/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/adult-erd-sex/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/adult-err-race/scripts/env_read_only_files.txt 
b/fairnessBench/benchmarks/adult-err-race/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/adult-err-race/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/adult-err-race/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/adult-err-sex/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/adult-err-sex/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/adult-err-sex/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/adult-err-sex/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/adult-ford-race/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/adult-ford-race/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/adult-ford-race/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/adult-ford-race/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/adult-ford-sex/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/adult-ford-sex/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/adult-ford-sex/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/adult-ford-sex/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/adult-spd-race/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/adult-spd-race/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- 
a/fairnessBench/benchmarks/adult-spd-race/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/adult-spd-race/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/adult-spd-sex/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/adult-spd-sex/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/adult-spd-sex/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/adult-spd-sex/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/adultrecon-allmetric-gender/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/adultrecon-allmetric-gender/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/adultrecon-allmetric-gender/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/adultrecon-allmetric-gender/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/adultrecon-allmetric-race/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/adultrecon-allmetric-race/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/adultrecon-allmetric-race/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/adultrecon-allmetric-race/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/creditdefault-di-gender/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/creditdefault-di-gender/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- 
a/fairnessBench/benchmarks/creditdefault-di-gender/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/creditdefault-di-gender/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/creditdefault-eod-gender/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/creditdefault-eod-gender/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/creditdefault-eod-gender/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/creditdefault-eod-gender/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/dollarstreet-income/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/dollarstreet-income/scripts/env_read_only_files.txt index e1cd988..fba2668 100644 --- a/fairnessBench/benchmarks/dollarstreet-income/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/dollarstreet-income/scripts/env_read_only_files.txt @@ -1,6 +1,6 @@ assets/* -./images_v2_imagenet_test.csv +./images_v2_imagenet_test.csv ./topics_to_imagenet_classes_map.json ./README.md ./images_v2.csv -./images_v2_imagenet_train.csv \ No newline at end of file +./images_v2_imagenet_train.csv diff --git a/fairnessBench/benchmarks/germancredit-di-sex/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/germancredit-di-sex/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/germancredit-di-sex/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/germancredit-di-sex/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/germancredit-eod-sex/scripts/env_read_only_files.txt 
b/fairnessBench/benchmarks/germancredit-eod-sex/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/germancredit-eod-sex/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/germancredit-eod-sex/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/health-di-race/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/health-di-race/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/health-di-race/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/health-di-race/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/randoadult-di-race/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/randoadult-di-race/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/randoadult-di-race/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/randoadult-di-race/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/randoadult-di-sex/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/randoadult-di-sex/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/randoadult-di-sex/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/randoadult-di-sex/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/randoadult-eod-race/scripts/env_read_only_files.txt 
b/fairnessBench/benchmarks/randoadult-eod-race/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/randoadult-eod-race/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/randoadult-eod-race/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/randoadult-eod-sex/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/randoadult-eod-sex/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/randoadult-eod-sex/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/randoadult-eod-sex/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/sampadult-di-race/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/sampadult-di-race/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/sampadult-di-race/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/sampadult-di-race/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/sampadult-di-sex/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/sampadult-di-sex/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/sampadult-di-sex/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/sampadult-di-sex/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/sampadult-eod-race/scripts/env_read_only_files.txt 
b/fairnessBench/benchmarks/sampadult-eod-race/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/sampadult-eod-race/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/sampadult-eod-race/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/sampadult-eod-sex/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/sampadult-eod-sex/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/sampadult-eod-sex/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/sampadult-eod-sex/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv From b4fb6ec295ecc0d7d5e36d784fa2d013ea481e97 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Tue, 11 Nov 2025 05:09:39 +0000 Subject: [PATCH 10/22] dollarstreet: read_only_files had a tab at the end of one line causing fnmatch to miss that file --- .../dollarstreet-income/scripts/env_read_only_files.txt | 2 +- .../benchmarks/dollarstreet-income/scripts/read_only_files.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fairnessBench/benchmarks/dollarstreet-income/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/dollarstreet-income/scripts/env_read_only_files.txt index fba2668..9a933bd 100644 --- a/fairnessBench/benchmarks/dollarstreet-income/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/dollarstreet-income/scripts/env_read_only_files.txt @@ -1,5 +1,5 @@ assets/* -./images_v2_imagenet_test.csv +./images_v2_imagenet_test.csv ./topics_to_imagenet_classes_map.json ./README.md ./images_v2.csv diff --git a/fairnessBench/benchmarks/dollarstreet-income/scripts/read_only_files.txt b/fairnessBench/benchmarks/dollarstreet-income/scripts/read_only_files.txt index 
fba2668..9a933bd 100644 --- a/fairnessBench/benchmarks/dollarstreet-income/scripts/read_only_files.txt +++ b/fairnessBench/benchmarks/dollarstreet-income/scripts/read_only_files.txt @@ -1,5 +1,5 @@ assets/* -./images_v2_imagenet_test.csv +./images_v2_imagenet_test.csv ./topics_to_imagenet_classes_map.json ./README.md ./images_v2.csv From 50efc223f1e11f662f22685a1249a0b9cb89519b Mon Sep 17 00:00:00 2001 From: AymanBx Date: Wed, 19 Nov 2025 03:39:53 +0000 Subject: [PATCH 11/22] Activate llm eval x5 times --- fairnessBench/eval.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fairnessBench/eval.py b/fairnessBench/eval.py index 18632cc..089de7d 100644 --- a/fairnessBench/eval.py +++ b/fairnessBench/eval.py @@ -200,13 +200,13 @@ def run_eval(log_folder, benchmark_folder_name, eval_model = None, eval_intermed result.score.append(eval_step_score) print(e) # Getting llm_eval here using llm_eval_repeat function - # try: - # llm_score = repeat_llm_eval(5, train_script, eval_model) - # result.llm_score.append(llm_score) - # except Exception as e: - # print("\nllm_eval didn't work\n") - # print(e) - # pass + try: + llm_score = repeat_llm_eval(5, train_script, eval_model) + result.llm_score.append(llm_score) + except Exception as e: + print("\nllm_eval didn't work\n") + print(e) + pass # Getting Flake8 score here try: flake8_score = get_flake8(train_script) @@ -236,7 +236,7 @@ def run_eval(log_folder, benchmark_folder_name, eval_model = None, eval_intermed # Getting llm_eval here using llm_eval_repeat function if eval_model: try: - llm_score = repeat_llm_eval(1, train_script, eval_model) + llm_score = repeat_llm_eval(5, train_script, eval_model) result.final_llm_score = llm_score except Exception as e: print("\nllm_eval didn't work\n") @@ -266,7 +266,7 @@ def run_eval(log_folder, benchmark_folder_name, eval_model = None, eval_intermed lines = [line.strip() for line in history_step.splitlines() ] f.write('\n'.join(lines)) print(f"Using 
history_steps from {temp_txt_path} for evaluation") - log_score = repeat_llm_eval_log(1, temp_txt_path, eval_model) + log_score = repeat_llm_eval_log(5, temp_txt_path, eval_model) else: # if there's no agent step which i doubt fallback to the main_log(will be tooo long and make model fail. lol) print("No history step files found.") From 36f962476010b38a136ae18a5b8e0ff9f8796593 Mon Sep 17 00:00:00 2001 From: Surbhi Rathore Date: Wed, 19 Nov 2025 05:40:49 +0000 Subject: [PATCH 12/22] new task: created new non descriptive task using adult data structure, all the column names are non descriptive but the values remain same as in adult --- .../env/data_description.txt | 125 ++++++++++++++++++ .../non_descriptive_ad-di-race/env/train.py | 44 ++++++ .../scripts/env_read_only_files.txt | 4 + .../scripts/eval.py | 45 +++++++ .../scripts/prepare.py | 49 +++++++ .../scripts/read_only_files.txt | 4 + 6 files changed, 271 insertions(+) create mode 100644 fairnessBench/benchmarks/non_descriptive_ad-di-race/env/data_description.txt create mode 100644 fairnessBench/benchmarks/non_descriptive_ad-di-race/env/train.py create mode 100644 fairnessBench/benchmarks/non_descriptive_ad-di-race/scripts/env_read_only_files.txt create mode 100644 fairnessBench/benchmarks/non_descriptive_ad-di-race/scripts/eval.py create mode 100644 fairnessBench/benchmarks/non_descriptive_ad-di-race/scripts/prepare.py create mode 100644 fairnessBench/benchmarks/non_descriptive_ad-di-race/scripts/read_only_files.txt diff --git a/fairnessBench/benchmarks/non_descriptive_ad-di-race/env/data_description.txt b/fairnessBench/benchmarks/non_descriptive_ad-di-race/env/data_description.txt new file mode 100644 index 0000000..068cb06 --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-di-race/env/data_description.txt @@ -0,0 +1,125 @@ +patt2: Integer + +feat1 : Categorical feature + Private + Self-emp-not-inc + Self-emp-inc + Federal-gov + Local-gov + State-gov + Without-pay + Never-worked. 
+ +feat2 : Categorical + Education Level + Bachelors + Some-college + 11th, HS-grad + Prof-school + Assoc-acdm + Assoc-voc + 9th + 7th-8th + 12th + Masters + 1st-4th + 10th + Doctorate + 5th-6th + Preschool + +patt3 : Categorical + Married-civ-spouse, + Divorced, + Never-married, + Separated, + Widowed, + Married-spouse-absent, + Married-AF-spouse + +feat3 : Categorical + Tech-support, + Craft-repair, + Other-service, + Sales, + Exec-managerial, + Prof-specialty, + Handlers-cleaners, + Machine-op-inspct, + Adm-clerical, + Farming-fishing, + Transport-moving, + Priv-house-serv, + Protective-serv, + Armed-Forces + +patt4 : Categorical + Wife, + Own-child, + Husband, + Not-in-family, + Other-relative, + Unmarried + +patt1 : Categorical + White, + Asian-Pac-Islander, + Amer-Indian-Eskimo, + Other, + Black + +patt5 : Binary + Female + Male + +feat4 : Integer + +feat5 : Integer + +feat6 : Integer + +feat7 : Categorical + United-States, + Cambodia, + England, + Puerto-Rico, + Canada, + Germany, + Outlying-US(Guam-USVI-etc), + India, + Japan, + Greece, + South, + China, + Cuba, + Iran, + Honduras, + Philippines, + Italy, + Poland, + Jamaica, + Vietnam, + Mexico, + Portugal, + Ireland, + France, + Dominican-Republic, + Laos, Ecuador, + Taiwan, + Haiti, + Columbia, + Hungary, + Guatemala, + Nicaragua, + Scotland, + Thailand, + Yugoslavia, + El-Salvador, + Trinadad&Tobago, + Peru, + Hong, + Holand-Netherlands + +target : binary target + <=50K + >50K. 
\ No newline at end of file diff --git a/fairnessBench/benchmarks/non_descriptive_ad-di-race/env/train.py b/fairnessBench/benchmarks/non_descriptive_ad-di-race/env/train.py new file mode 100644 index 0000000..93a14ea --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-di-race/env/train.py @@ -0,0 +1,44 @@ +import numpy as np +import pandas as pd +from aif360.datasets import BinaryLabelDataset +from aif360.metrics import ClassificationMetric +from sklearn.linear_model import LogisticRegression + + +train_X = pd.read_csv("train_X.csv", index_col=0) +train_y = pd.read_csv("train_y.csv", index_col=0) +test_X = pd.read_csv("test_X.csv", index_col=0) +test_y = pd.read_csv("test_y.csv", index_col=0) + +lr = LogisticRegression(max_iter=1000) +lr.fit(train_X, train_y) + +pred_y = lr.predict(test_X) + +prot_attr = test_X['patt1'] + +output = pd.DataFrame({"Actual_y": test_y.values.ravel(), "Predicted_y": pred_y, "patt1" : prot_attr}) + +privileged_groups = [{'patt1': 1}] +unprivileged_groups = [{'patt1': 0}] + +binaryLabelDataset = BinaryLabelDataset(favorable_label=1, unfavorable_label=0, + df=output, label_names=['Actual_y'], + protected_attribute_names=['patt1']) + +binaryLabelDataset_pred = binaryLabelDataset.copy() +binaryLabelDataset_pred.labels = output['Predicted_y'].values.reshape(-1, 1) + +metric = ClassificationMetric(binaryLabelDataset, binaryLabelDataset_pred, + unprivileged_groups=unprivileged_groups, + privileged_groups=privileged_groups) + +accuracy = metric.accuracy() + +disparate_impact = metric.disparate_impact() + +print('accuracy score:', accuracy) +print('disparate impact ratio', disparate_impact) + +output.to_csv("submission.csv", index=False) + diff --git a/fairnessBench/benchmarks/non_descriptive_ad-di-race/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/non_descriptive_ad-di-race/scripts/env_read_only_files.txt new file mode 100644 index 0000000..5ee277f --- /dev/null +++ 
b/fairnessBench/benchmarks/non_descriptive_ad-di-race/scripts/env_read_only_files.txt @@ -0,0 +1,4 @@ +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/non_descriptive_ad-di-race/scripts/eval.py b/fairnessBench/benchmarks/non_descriptive_ad-di-race/scripts/eval.py new file mode 100644 index 0000000..a79a1c4 --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-di-race/scripts/eval.py @@ -0,0 +1,45 @@ +import os +import numpy as np +import pandas as pd +from aif360.datasets import BinaryLabelDataset +from aif360.metrics import ClassificationMetric + +def get_score(agent_files): + submission_path = os.path.join(agent_files, "submission.csv") + submission = pd.read_csv(submission_path) + + privileged_groups = [{'patt1': 1}] + unprivileged_groups = [{'patt1': 0}] + + binaryLabelDataset = BinaryLabelDataset(favorable_label=1, unfavorable_label=0, + df=submission, label_names=['Actual_y'], + protected_attribute_names=['patt1']) + + binaryLabelDataset_pred = binaryLabelDataset.copy() + binaryLabelDataset_pred.labels = submission['Predicted_y'].values.reshape(-1, 1) + + metric = ClassificationMetric(binaryLabelDataset, binaryLabelDataset_pred, + unprivileged_groups=unprivileged_groups, + privileged_groups=privileged_groups) + + accuracy = metric.accuracy() + precision = metric.precision() + recall = metric.true_positive_rate() + + disparate_impact = metric.disparate_impact() + statistical_parity_diff = metric.statistical_parity_difference() + equal_opp_diff = metric.equal_opportunity_difference() + error_rate_diff = metric.error_rate_difference() + error_rate_ratio = metric.error_rate_ratio() + false_omission_rate_diff = metric.false_omission_rate_difference() + + performance = {"acc": accuracy, "precision": precision, "recall": recall, "di": disparate_impact, + "statistical_parity_diff": statistical_parity_diff, "equal_opp_diff": equal_opp_diff, + "error_rate_diff": error_rate_diff, "error_rate_ratio" : 
error_rate_ratio, + "false_omission_rate_diff": false_omission_rate_diff} + return performance + + +if __name__ == "__main__": + print(get_score()) + \ No newline at end of file diff --git a/fairnessBench/benchmarks/non_descriptive_ad-di-race/scripts/prepare.py b/fairnessBench/benchmarks/non_descriptive_ad-di-race/scripts/prepare.py new file mode 100644 index 0000000..f798c75 --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-di-race/scripts/prepare.py @@ -0,0 +1,49 @@ +# prepared by user when setting up + +import os +import numpy as np +import pandas as pd +from ucimlrepo import fetch_ucirepo +from sklearn.model_selection import train_test_split + +adult = fetch_ucirepo(id=2) + +X = adult.data.features.copy() +y = adult.data.targets.copy() + +y["income"] = y["income"].str.rstrip('.') +y["income"] = y["income"].apply(lambda i: 1 if i == '>50K' else 0) +X["race"] = X["race"].apply(lambda i: 1 if i == 'White' else 0) +X = X.replace('?', np.nan) + +X = X.rename(columns={'age': 'patt2', 'workclass': 'feat1', 'education': 'feat2', 'marital-status': 'patt3', 'occupation': 'feat3','relationship': 'patt4', 'race': 'patt1', 'sex': 'patt5', 'capital-gain': 'feat4', 'capital-loss':'feat5', 'hours-per-week': 'feat6', 'native-country': 'feat7'}) + +y=y.rename(columns={'income':'target'}) + +categorical_columns = ["feat1", "feat2", "patt3", "feat3", "patt4", "patt5", "feat7"] + +X_enc = pd.get_dummies(X, columns=categorical_columns) +X_enc = X_enc.replace({True: 1, False: 0}) + + +train_X, test_X, train_y, test_y = train_test_split(X_enc, y, test_size=0.2, random_state=1) + +os.chdir("../env/") +train_X.to_csv("train_X.csv") +train_y.to_csv("train_y.csv") +test_X.to_csv("test_X.csv") +test_y.to_csv("test_y.csv") + +# feat1 workclass cat +# feat2 education cat +# patt1 (race) cat +# patt2 (age) num +# patt3 (ms) cat +# patt4 (relationship) cat +# patt5 (sex) cat +# feat3 occupation cat +# feat4 cap gain num +# feat5 cap loss num +# feat6 hours-per-week num +# feat7 
native country cat +# target income bin cat \ No newline at end of file diff --git a/fairnessBench/benchmarks/non_descriptive_ad-di-race/scripts/read_only_files.txt b/fairnessBench/benchmarks/non_descriptive_ad-di-race/scripts/read_only_files.txt new file mode 100644 index 0000000..5ee277f --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-di-race/scripts/read_only_files.txt @@ -0,0 +1,4 @@ +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv From 8c3707b9c5a66e52d7a9696ffc968846db02f3de Mon Sep 17 00:00:00 2001 From: AymanBx Date: Sat, 6 Dec 2025 22:36:16 +0000 Subject: [PATCH 13/22] Removed unnecessary loading method of llama --- fairnessBench/LLM.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/fairnessBench/LLM.py b/fairnessBench/LLM.py index a6e2f19..83a1f14 100644 --- a/fairnessBench/LLM.py +++ b/fairnessBench/LLM.py @@ -18,22 +18,6 @@ # AS: Setup llama loaded_hf_models = {} -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -try: - # Need export HF_HOME=/datasets/ai/llama3 - # llama_= "meta-llama/Llama-3.3-70B-Instruct" # Gave us decent results. 
- # llama_= "meta-llama/Llama-3.1-405B-Instruct" # Terrible hallusinations - # llama_= "meta-llama/Llama-3.1-8B-Instruct" # Trying smaller models for test runs - - tokenizer = AutoTokenizer.from_pretrained(llama_) - quant_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16) - model = AutoModelForCausalLM.from_pretrained(llama_, quantization_config = quant_config, device_map="auto",torch_dtype=torch.float16) - loaded_hf_models = {"llama": (model, tokenizer)} - print(f"Loaded local {llama_} successfuly using device: {model.device}.") -except Exception as e: - print(f"Failed to load local llama - Current device:{device}\nIssue: {e}") - - def complete_text_hf(prompt, stop_sequences=[], model="llama", max_tokens_to_sample = 2500, temperature=0.5, log_file=None, device=0, **kwargs): if model in loaded_hf_models: hf_model, tokenizer = loaded_hf_models[model] From ca6b97be342d11e91bc488e94c59a2f5d48a1e89 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Sat, 6 Dec 2025 23:25:07 +0000 Subject: [PATCH 14/22] Eval: Now using flags to enable\/disable types of eval. 
Also if result file exits for a specific run concatinate instead of overwirte --- fairnessBench/eval.py | 113 +++++++++++++++++++++++------------------- 1 file changed, 62 insertions(+), 51 deletions(-) diff --git a/fairnessBench/eval.py b/fairnessBench/eval.py index 089de7d..6594cf6 100644 --- a/fairnessBench/eval.py +++ b/fairnessBench/eval.py @@ -131,7 +131,7 @@ class EvaluationResult: extra: Dict[str, bool] -def run_eval(log_folder, benchmark_folder_name, eval_model = None, eval_intermediate=False): +def run_eval(log_folder, benchmark_folder_name, eval_model=None, eval_intermediate=False, llm_eval=False, log_eval=False, flake8=False): results = {} # Log folder is the specific log folder for one model and one task @@ -139,7 +139,8 @@ def run_eval(log_folder, benchmark_folder_name, eval_model = None, eval_intermed for subdir, dirs, files in os.walk(log_folder): for file in files: - if file == 'trace.json': + # trace.json is an indicator that at least one step occurred + if file == 'trace.json': # TODO: Add new keys result = EvaluationResult( path=os.path.join(subdir, file), summary="", @@ -200,27 +201,29 @@ def run_eval(log_folder, benchmark_folder_name, eval_model = None, eval_intermed result.score.append(eval_step_score) print(e) # Getting llm_eval here using llm_eval_repeat function - try: - llm_score = repeat_llm_eval(5, train_script, eval_model) - result.llm_score.append(llm_score) - except Exception as e: - print("\nllm_eval didn't work\n") - print(e) - pass + if llm_eval and eval_model: + try: + llm_score = repeat_llm_eval(5, train_script, eval_model) + result.llm_score.append(llm_score) + except Exception as e: + print("\nllm_eval didn't work\n") + print(e) + pass # Getting Flake8 score here - try: - flake8_score = get_flake8(train_script) - result.flake8_score.append(flake8_score) - print(flake8_score) - except Exception as e: - print("\nFlake8_eval didn't work\n") - print(e) - pass + if flake8: + try: + flake8_score = get_flake8(train_script) + 
result.flake8_score.append(flake8_score) + # print(f"Flake8 final score: {flake8_score0}") + except Exception as e: + print("\nFlake8_eval didn't work\n") + print(e) + pass - # Add the ids of the steps that were evaluated to the JSON file result.score_steps = list(subsampled_list) - + + # Evaluate the final step folder_path = os.path.join(subdir, 'traces/step_final_files') train_script = os.path.join(folder_path, ".train.py") if (os.path.exists(os.path.join(folder_path, ".train.py"))) else os.path.join(folder_path, "train.py") @@ -229,12 +232,12 @@ def run_eval(log_folder, benchmark_folder_name, eval_model = None, eval_intermed eval_final_score = module.get_score(folder_path) result.score.append(eval_final_score) result.final_score = eval_final_score - print(eval_final_score) + print(f"Final score: {eval_final_score}") except Exception as e: print(e) pass # Getting llm_eval here using llm_eval_repeat function - if eval_model: + if llm_eval and eval_model: try: llm_score = repeat_llm_eval(5, train_script, eval_model) result.final_llm_score = llm_score @@ -243,15 +246,16 @@ def run_eval(log_folder, benchmark_folder_name, eval_model = None, eval_intermed print(e) pass # Getting Flake8 score - try: - flake8_score = get_flake8(train_script) - result.final_flake8_score = flake8_score - print(flake8_score) - except Exception as e: - print("\nFlake8_eval didn't work\n") - print(e) - pass - if eval_model: + if flake8: + try: + flake8_score = get_flake8(train_script) + result.final_flake8_score = flake8_score + print(f"Flake8 final score: {flake8_score}") + except Exception as e: + print("\nFlake8_eval didn't work\n") + print(e) + pass + if log_eval and eval_model: # Getting LLM log eval here using llm_eval_log_repeat function log_file=os.path.join(subdir.rsplit('/',1)[0], "agent_log/main_log") try: @@ -260,7 +264,6 @@ def run_eval(log_folder, benchmark_folder_name, eval_model = None, eval_intermed # use the latest step in the eval if history_step: # create a temporary file 
with the history_step content - #history_text = "\n\n".join(history_step) temp_txt_path = os.path.join(subdir.rsplit('/',1)[0], "agent_log/temp_history_step.txt") with open(temp_txt_path, 'w') as f: lines = [line.strip() for line in history_step.splitlines() ] @@ -296,39 +299,47 @@ def run_eval(log_folder, benchmark_folder_name, eval_model = None, eval_intermed results[os.path.join(subdir, file)] = result - return results + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--log-folder", type=str, default="logs") - parser.add_argument("--task", type=str, default="adult") - parser.add_argument("--output-file", type=str, default="results.json") - parser.add_argument("--eval_model", type=str, default=None) - parser.add_argument("--eval-intermediate", action="store_true") + parser.add_argument("--log-folder", type=str, default="final_exp_logs") # Logs of model/task with multiple runs + parser.add_argument("--task", type=str, default="adult_di_best-sex") # All runs in a single model/task will be evaluated + parser.add_argument("--eval-intermediate", action="store_true") # Set true to evaluate all steps not just the final step + parser.add_argument("--eval_model", type=str, default=None) # LLM evaluator + parser.add_argument("--llm_eval", type=bool, action="store_true") # Set true to evaluate train.py with llm + parser.add_argument("--log_eval", type=bool, action="store_true") # Set true to evaluate logs (thought-action) with llm + parser.add_argument("--flake8", type=bool, action="store_true") # Set true to evaluate train.py with flake8 + parser.add_argument("--output-file", type=str, default="results.json") # JSON result file for model/task args = parser.parse_args() + # Report if the task has no logs found if not os.path.exists(args.log_folder): print(f"WARNING\nWARNING\nWARNING: The log folder {args.log_folder} doesn't exist. 
\nWARNING\nWARNING") exit() - - if os.path.exists(args.output_file): - with open(args.output_file) as f: - content = json.load(f) - if content: - print(f"WARNING\nWARNING\nWARNING: Results for {args.output_file} already exists\nWARNING\nWARNING") - exit() - - - benchmark_folder_name = get_task_info(args.task)[0] - results = run_eval(args.log_folder, benchmark_folder_name, eval_model = args.eval_model, eval_intermediate = args.eval_intermediate) + # To find task/scripts/eval.py + benchmark_folder_name = get_task_info(args.task)[0] # Return is (folder, research problem) + results = run_eval(args.log_folder, benchmark_folder_name, eval_intermediate = args.eval_intermediate, eval_model = args.eval_model, llm_eval = args.llm_eval, log_eval = args.log_eval, flake8 = args.flake8) + + # Report a failure in the run_eval function - Prevent empty file from being generated if not results: - print(f"WARNING\nWARNING\nWARNING: Results for {args.log_folder.rsplit('/')} is empty\nWARNING\nWARNING") + print(f"WARNING\nWARNING\nWARNING: Eval failed. Results for {args.log_folder.rsplit('/')} were empty\nWARNING\nWARNING") else: + # Report if json result file already exists so that we don't overwrite + if os.path.exists(args.output_file): + with open(args.output_file) as f: + content = json.load(f) + if content: # If file is empty it's ok to overwrite + print(f"WARNING\nWARNING\nWARNING: Results for {args.output_file} already exists. 
Concatinating...\nWARNING\nWARNING") + # exit() + for key, eval in content.items(): + if key not in results.keys(): + results.update({key:eval}) + json.dump(results, open(args.output_file, "w"), indent=4, cls=EnhancedJSONEncoder) - - + \ No newline at end of file From e823867105429ca6578eeb32da2db619316f945b Mon Sep 17 00:00:00 2001 From: AymanBx Date: Sat, 6 Dec 2025 23:30:43 +0000 Subject: [PATCH 15/22] Eval: option formating to match the rest --- fairnessBench/eval.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fairnessBench/eval.py b/fairnessBench/eval.py index 6594cf6..c512267 100644 --- a/fairnessBench/eval.py +++ b/fairnessBench/eval.py @@ -308,9 +308,9 @@ def run_eval(log_folder, benchmark_folder_name, eval_model=None, eval_intermedia parser.add_argument("--log-folder", type=str, default="final_exp_logs") # Logs of model/task with multiple runs parser.add_argument("--task", type=str, default="adult_di_best-sex") # All runs in a single model/task will be evaluated parser.add_argument("--eval-intermediate", action="store_true") # Set true to evaluate all steps not just the final step - parser.add_argument("--eval_model", type=str, default=None) # LLM evaluator - parser.add_argument("--llm_eval", type=bool, action="store_true") # Set true to evaluate train.py with llm - parser.add_argument("--log_eval", type=bool, action="store_true") # Set true to evaluate logs (thought-action) with llm + parser.add_argument("--eval-model", type=str, default=None) # LLM evaluator + parser.add_argument("--llm-eval", type=bool, action="store_true") # Set true to evaluate train.py with llm + parser.add_argument("--log-eval", type=bool, action="store_true") # Set true to evaluate logs (thought-action) with llm parser.add_argument("--flake8", type=bool, action="store_true") # Set true to evaluate train.py with flake8 parser.add_argument("--output-file", type=str, default="results.json") # JSON result file for model/task args = parser.parse_args() From 
f008c094149e59f28bba2b79909418ea67370e33 Mon Sep 17 00:00:00 2001 From: Surbhi Rathore Date: Fri, 9 Jan 2026 07:48:35 +0000 Subject: [PATCH 16/22] fixed dollarstreet eval.py script --- .../benchmarks/dollarstreet-income/scripts/eval.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fairnessBench/benchmarks/dollarstreet-income/scripts/eval.py b/fairnessBench/benchmarks/dollarstreet-income/scripts/eval.py index 4d5c13c..95732fc 100644 --- a/fairnessBench/benchmarks/dollarstreet-income/scripts/eval.py +++ b/fairnessBench/benchmarks/dollarstreet-income/scripts/eval.py @@ -5,18 +5,19 @@ def get_score(agent_files): submission_path = os.path.join(agent_files, "submission.csv") - submission_df= pd.read_csv(submission_path, index_col=0) + submission_df= pd.read_csv(submission_path) test_df = pd.read_csv(os.path.join("../env", "images_v2_imagenet_test.csv")) test_df["imagenet_sysnet_id"] = test_df["imagenet_sysnet_id"].apply(ast.literal_eval) test_df["synset"] = test_df["imagenet_sysnet_id"].apply(lambda x: x[0]) + all_synsets = sorted(set(test_df["synset"])) synset2idx = {syn: idx for idx, syn in enumerate(all_synsets)} test_df["true_label_idx"] = test_df["synset"].map(synset2idx) - submission_df["predicted_label"] = submission_df.values.argmax(axis=1) - test_df["predicted_label"] = submission_df["predicted_label"] + test_df["predicted_label"] = submission_df.values.argmax(axis=1) + test_df["correct"] = test_df["predicted_label"] == test_df["true_label_idx"] test_df["income_q"] = pd.qcut(test_df["income"], q=4, labels=["Q1", "Q2", "Q3", "Q4"]) test_df["income_group"] = test_df["income_q"].apply(lambda q: "Disadvantaged" if q == "Q1" else "Advantaged") From d147e1b30786a50801852a68458191fc2b3d8737 Mon Sep 17 00:00:00 2001 From: Surbhi Rathore Date: Sat, 10 Jan 2026 06:41:30 +0000 Subject: [PATCH 17/22] refactored prepare.py script for health-di-race task to include comments explaining each group of changes and creation of new columns as per profs 
suggestion --- .../health-di-race/scripts/prepare.py | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/fairnessBench/benchmarks/health-di-race/scripts/prepare.py b/fairnessBench/benchmarks/health-di-race/scripts/prepare.py index 4c52895..ab1071e 100644 --- a/fairnessBench/benchmarks/health-di-race/scripts/prepare.py +++ b/fairnessBench/benchmarks/health-di-race/scripts/prepare.py @@ -21,17 +21,21 @@ ad = pd.concat([X, y], axis=1) ad = ad.dropna() +# SR: scaling the values in cols capital-gain, capital-loss and hours-per-week into a realistic range. It preserves relative ordering while only changing the scale for respective cols. + cholesterol_scale = MinMaxScaler(feature_range=(100, 300)) ad['cholesterol'] = cholesterol_scale.fit_transform(ad[['capital-gain']]) - + bp_scale = MinMaxScaler(feature_range=(80, 180)) ad['blood-pressure'] = bp_scale.fit_transform(ad[['capital-loss']]) activity_scale = MinMaxScaler(feature_range=(0, 24)) ad['daily-activity-hours'] = activity_scale.fit_transform(ad[['hours-per-week']]) +# dropping unwanted cols, we have never used these two cols ('fnlwgt', 'education-num') in any analysis so far with adult data. 
ad.drop(['capital-gain', 'capital-loss', 'hours-per-week', 'fnlwgt', 'education-num'], axis=1, inplace=True) +# column values mapping eg: occupation = Adm-clerical is mapped to procedure = Administrative-check occupation_to_procedure = {'Adm-clerical': 'Administrative-check', 'Exec-managerial': 'Specialist-consultation', 'Handlers-cleaners': 'Minor-procedure', 'Prof-specialty': 'Major-surgery', 'Other-service': 'General-checkup', 'Sales': 'Diagnostic-test', @@ -41,6 +45,7 @@ 'Armed-Forces': 'Chronic-disease-management', 'Priv-house-serv': 'Vaccination'} +# column values mapping eg: country = United-States is mapped to reason = General-checkup country_to_reason = {'United-States': 'General-checkup', 'Cuba': 'Routine-follow-up', 'Jamaica': 'New-symptom', 'India': 'Fever', 'Mexico': 'Cold-Flu-Cough', 'South': 'Headache-migraine', 'Puerto-Rico': 'Heart-concerns', 'Honduras': 'Respiratory-issue', 'England': 'Digestive-issue', @@ -56,24 +61,29 @@ 'Nicaragua': 'Specialist-referral', 'Vietnam': 'Discharge', 'Hong': 'Lab-result-discussion', 'Ireland': 'Second-opinion', 'Hungary': 'Administrative-inquiry', 'Holand-Netherlands': 'Other'} +# column values mapping eg: workclass = Private is mapped to lifestyle = Sedentary workclass_to_lifestyle = { 'Private':'Sedentary', 'Self-emp-not-inc':'Moderately-active', 'Local-gov':'Active', 'State-gov': 'Highly-active', 'Self-emp-inc': 'Unhealthy', 'Federal-gov': 'Healthy', 'Without-pay': 'At-risk'} +# column values mapping eg: education = HS-grad is mapped to insurance_provider = UnitedHealthcare education_to_insurance_provider = {'HS-grad':'UnitedHealthcare', 'Some-college': 'Blue Cross Blue Shield', 'Bachelors': 'Aetna', 'Masters': 'Cigna', 'Assoc-voc': 'Humana', '11th': 'Kaiser Permanente', 'Assoc-acdm': 'Anthem', '10th': 'Molina Healthcare', '7th-8th': 'Centene Corporation', 'Prof-school': 'Highmark Health', '9th': 'Health Net', '12th':'WellCare Health Plans', 'Doctorate':'CareFirst', '5th-6th': 'Independence Blue Cross', 
'1st-4th': 'Fidelis Care', 'Preschool': 'Oscar Health'} -ad['procedure_on_the_day'] = ad['occupation'].map(occupation_to_procedure) -ad['reason_for_visit'] = ad['native-country'].map(country_to_reason) -ad['lifestyle'] = ad['workclass'].map(workclass_to_lifestyle) -ad['insurance_provider'] = ad['education'].map(education_to_insurance_provider) +new_col_list = [] -ad = ad.rename(columns={'age': 'patient_age', 'sex': 'gender', 'income': 'health_risk'}) +new_col_list.append(pd.Series(ad['occupation'].map(occupation_to_procedure),name='procedure_on_the_day')) +new_col_list.append(pd.Series(ad['native-country'].map(country_to_reason),name='reason_for_visit')) +new_col_list.append(pd.Series(ad['workclass'].map(workclass_to_lifestyle), name='lifestyle')) +new_col_list.append(pd.Series(ad['education'].map(education_to_insurance_provider),name='insurance_provider')) -ad.drop(['occupation', 'native-country', 'workclass', 'education'], axis=1, inplace=True) +ad = pd.concat([ad] + new_col_list, axis=1) + +ad = ad.drop(['occupation', 'native-country', 'workclass', 'education'], axis=1) +ad = ad.rename(columns={'age': 'patient_age', 'sex': 'gender', 'income': 'health_risk'}) cat_cols = ['marital-status', 'race', 'relationship', 'gender', 'lifestyle', 'insurance_provider', 'procedure_on_the_day', 'reason_for_visit'] From 2a33702ccc41fe58bf395400a53b0fbd9bc4f0ca Mon Sep 17 00:00:00 2001 From: Surbhi Rathore Date: Fri, 16 Jan 2026 05:18:27 +0000 Subject: [PATCH 18/22] updated research problem for nondescriptive task --- fairnessBench/benchmarks/tasks.json | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fairnessBench/benchmarks/tasks.json b/fairnessBench/benchmarks/tasks.json index 12af2cd..cf84b05 100644 --- a/fairnessBench/benchmarks/tasks.json +++ b/fairnessBench/benchmarks/tasks.json @@ -419,5 +419,9 @@ "health_di_balance-race": { "research_problem": "Go through the data_description.txt file to understand the data and its features. 
Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model that best balances accuracy and fairness with respect to disparate impact across race. Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and disparate impact. Do not inspect any CSV files directly. Use the execute script action to verify if the goal is achieved.", "benchmark_folder_name": "health-di-race" + }, + "nondescriptive_di_balance-race": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model that best balances accuracy and fairness with respect to disparate impact across race. Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and disparate impact. Do not inspect any CSV files directly. 
Use the execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "non_descriptive_ad-di-race" } } From 580ae19beeee495b64be634470b31aab4129eacf Mon Sep 17 00:00:00 2001 From: Surbhi Rathore Date: Sat, 17 Jan 2026 06:07:23 +0000 Subject: [PATCH 19/22] Added new task variations for both health and non_descriptive data and all the codes are tested --- .../health-di-race/scripts/prepare.py | 2 +- .../health-di-sex/env/data_description.txt | 125 ++++++++++++++++++ .../benchmarks/health-di-sex/env/train.py | 44 ++++++ .../scripts/env_read_only_files.txt | 0 .../benchmarks/health-di-sex/scripts/eval.py | 45 +++++++ .../health-di-sex/scripts/prepare.py | 104 +++++++++++++++ .../scripts/read_only_files.txt | 0 .../health-eod-race/env/data_description.txt | 125 ++++++++++++++++++ .../benchmarks/health-eod-race/env/train.py | 44 ++++++ .../scripts/env_read_only_files.txt | 4 + .../health-eod-race/scripts/eval.py | 45 +++++++ .../health-eod-race/scripts/prepare.py | 104 +++++++++++++++ .../scripts/read_only_files.txt | 4 + .../health-eod-sex/env/data_description.txt | 125 ++++++++++++++++++ .../benchmarks/health-eod-sex/env/train.py | 44 ++++++ .../scripts/env_read_only_files.txt | 4 + .../benchmarks/health-eod-sex/scripts/eval.py | 45 +++++++ .../health-eod-sex/scripts/prepare.py | 104 +++++++++++++++ .../scripts/read_only_files.txt | 4 + .../env/data_description.txt | 0 .../env/train.py | 0 .../scripts/env_read_only_files.txt | 4 + .../scripts/eval.py | 0 .../scripts/prepare.py | 0 .../scripts/read_only_files.txt | 4 + .../env/data_description.txt | 125 ++++++++++++++++++ .../non_descriptive_ad-di-patt5/env/train.py | 44 ++++++ .../scripts/env_read_only_files.txt | 4 + .../scripts/eval.py | 45 +++++++ .../scripts/prepare.py | 49 +++++++ .../scripts/read_only_files.txt | 4 + .../env/data_description.txt | 125 ++++++++++++++++++ .../non_descriptive_ad-eod-patt1/env/train.py | 44 ++++++ .../scripts/env_read_only_files.txt | 4 + .../scripts/eval.py | 
45 +++++++ .../scripts/prepare.py | 49 +++++++ .../scripts/read_only_files.txt | 4 + .../env/data_description.txt | 125 ++++++++++++++++++ .../non_descriptive_ad-eod-patt5/env/train.py | 44 ++++++ .../scripts/env_read_only_files.txt | 4 + .../scripts/eval.py | 45 +++++++ .../scripts/prepare.py | 49 +++++++ .../scripts/read_only_files.txt | 4 + 43 files changed, 1792 insertions(+), 1 deletion(-) create mode 100644 fairnessBench/benchmarks/health-di-sex/env/data_description.txt create mode 100644 fairnessBench/benchmarks/health-di-sex/env/train.py rename fairnessBench/benchmarks/{non_descriptive_ad-di-race => health-di-sex}/scripts/env_read_only_files.txt (100%) create mode 100644 fairnessBench/benchmarks/health-di-sex/scripts/eval.py create mode 100644 fairnessBench/benchmarks/health-di-sex/scripts/prepare.py rename fairnessBench/benchmarks/{non_descriptive_ad-di-race => health-di-sex}/scripts/read_only_files.txt (100%) create mode 100644 fairnessBench/benchmarks/health-eod-race/env/data_description.txt create mode 100644 fairnessBench/benchmarks/health-eod-race/env/train.py create mode 100644 fairnessBench/benchmarks/health-eod-race/scripts/env_read_only_files.txt create mode 100644 fairnessBench/benchmarks/health-eod-race/scripts/eval.py create mode 100644 fairnessBench/benchmarks/health-eod-race/scripts/prepare.py create mode 100644 fairnessBench/benchmarks/health-eod-race/scripts/read_only_files.txt create mode 100644 fairnessBench/benchmarks/health-eod-sex/env/data_description.txt create mode 100644 fairnessBench/benchmarks/health-eod-sex/env/train.py create mode 100644 fairnessBench/benchmarks/health-eod-sex/scripts/env_read_only_files.txt create mode 100644 fairnessBench/benchmarks/health-eod-sex/scripts/eval.py create mode 100644 fairnessBench/benchmarks/health-eod-sex/scripts/prepare.py create mode 100644 fairnessBench/benchmarks/health-eod-sex/scripts/read_only_files.txt rename fairnessBench/benchmarks/{non_descriptive_ad-di-race => 
non_descriptive_ad-di-patt1}/env/data_description.txt (100%) rename fairnessBench/benchmarks/{non_descriptive_ad-di-race => non_descriptive_ad-di-patt1}/env/train.py (100%) create mode 100644 fairnessBench/benchmarks/non_descriptive_ad-di-patt1/scripts/env_read_only_files.txt rename fairnessBench/benchmarks/{non_descriptive_ad-di-race => non_descriptive_ad-di-patt1}/scripts/eval.py (100%) rename fairnessBench/benchmarks/{non_descriptive_ad-di-race => non_descriptive_ad-di-patt1}/scripts/prepare.py (100%) create mode 100644 fairnessBench/benchmarks/non_descriptive_ad-di-patt1/scripts/read_only_files.txt create mode 100644 fairnessBench/benchmarks/non_descriptive_ad-di-patt5/env/data_description.txt create mode 100644 fairnessBench/benchmarks/non_descriptive_ad-di-patt5/env/train.py create mode 100644 fairnessBench/benchmarks/non_descriptive_ad-di-patt5/scripts/env_read_only_files.txt create mode 100644 fairnessBench/benchmarks/non_descriptive_ad-di-patt5/scripts/eval.py create mode 100644 fairnessBench/benchmarks/non_descriptive_ad-di-patt5/scripts/prepare.py create mode 100644 fairnessBench/benchmarks/non_descriptive_ad-di-patt5/scripts/read_only_files.txt create mode 100644 fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/env/data_description.txt create mode 100644 fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/env/train.py create mode 100644 fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/scripts/env_read_only_files.txt create mode 100644 fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/scripts/eval.py create mode 100644 fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/scripts/prepare.py create mode 100644 fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/scripts/read_only_files.txt create mode 100644 fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/env/data_description.txt create mode 100644 fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/env/train.py create mode 100644 
fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/scripts/env_read_only_files.txt create mode 100644 fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/scripts/eval.py create mode 100644 fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/scripts/prepare.py create mode 100644 fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/scripts/read_only_files.txt diff --git a/fairnessBench/benchmarks/health-di-race/scripts/prepare.py b/fairnessBench/benchmarks/health-di-race/scripts/prepare.py index ab1071e..d79ea06 100644 --- a/fairnessBench/benchmarks/health-di-race/scripts/prepare.py +++ b/fairnessBench/benchmarks/health-di-race/scripts/prepare.py @@ -85,7 +85,7 @@ ad = ad.drop(['occupation', 'native-country', 'workclass', 'education'], axis=1) ad = ad.rename(columns={'age': 'patient_age', 'sex': 'gender', 'income': 'health_risk'}) -cat_cols = ['marital-status', 'race', 'relationship', 'gender', 'lifestyle', +cat_cols = ['marital-status', 'relationship', 'gender', 'lifestyle', 'insurance_provider', 'procedure_on_the_day', 'reason_for_visit'] diff --git a/fairnessBench/benchmarks/health-di-sex/env/data_description.txt b/fairnessBench/benchmarks/health-di-sex/env/data_description.txt new file mode 100644 index 0000000..1ef9d2f --- /dev/null +++ b/fairnessBench/benchmarks/health-di-sex/env/data_description.txt @@ -0,0 +1,125 @@ +patient_age: Integer + +lifestyle: Categorical feature + Sedentary + Moderately-active + Active + Highly-active + Unhealthy + Healthy + At-risk + +insurance_provider : Categorical + UnitedHealthcare + Blue Cross Blue Shield + Aetna + Cigna + Humana + Kaiser Permanente + Anthem + Molina Healthcare + Centene Corporation + Highmark Health + Health Net + WellCare Health Plans + CareFirst + Independence Blue Cross + Fidelis Care + Oscar Health + +marital-status: Categorical + Married-civ-spouse, + Divorced, + Never-married, + Separated, + Widowed, + Married-spouse-absent, + Married-AF-spouse + +procedure_on_the_visit : Categorical + 
Administrative-check + Specialist-consultation + Minor-procedure + Major-surgery + General-checkup + Diagnostic-test + Rehabilitation + Emergency + Preventive-screening + Therapy-session + Follow-up + Mental-health-consultation + Chronic-disease-management + Vaccination + +relationship : Categorical + Wife, + Own-child, + Husband, + Not-in-family, + Other-relative, + Unmarried + +race : Categorical + White, + Asian-Pac-Islander, + Amer-Indian-Eskimo, + Other, + Black + +gender : Binary + Female + Male + +cholesterol : Integer + +blood-pressure : Integer + +daily-activity-hours : Integer + +reason-for-visit : Categorical + General-checkup + Routine-follow-up + New-symptom + Fever + Cold-Flu-Cough + Headache-migraine + Heart-concerns + Respiratory-issue + Digestive-issue + Pain-related + Skin-concern + Eye + ENT + Mental-health-concerns + Stress + Fatigue-weakness + Weight-concerns + Diabetes/Bloodsugar + BP-concern + Cholesterol-lipid-concern + Allergy-symptoms + Prenatal + Gynecological-concern + Pediatrics + Age-related + Chronic-disease + Medication-side-effect + Injury + Trauma + Preventive-counseling + Genetic-risks + Vaccination-inquiry + Health-counseling + Substance-concern + Surgical-consultation + Specialist-referral + Discharge + Lab-result-discussion' + Second-opinion + Administrative-inquiry + Other + +health_risk : binary target + 1 + 0 \ No newline at end of file diff --git a/fairnessBench/benchmarks/health-di-sex/env/train.py b/fairnessBench/benchmarks/health-di-sex/env/train.py new file mode 100644 index 0000000..bc908e4 --- /dev/null +++ b/fairnessBench/benchmarks/health-di-sex/env/train.py @@ -0,0 +1,44 @@ +import numpy as np +import pandas as pd +from aif360.datasets import BinaryLabelDataset +from aif360.metrics import ClassificationMetric +from sklearn.linear_model import LogisticRegression + + +train_X = pd.read_csv("train_X.csv", index_col=0) +train_y = pd.read_csv("train_y.csv", index_col=0) +test_X = pd.read_csv("test_X.csv", index_col=0) 
+test_y = pd.read_csv("test_y.csv", index_col=0) + +lr = LogisticRegression(max_iter=1000) +lr.fit(train_X, train_y) + +pred_y = lr.predict(test_X) + +prot_attr = test_X['gender'] + +output = pd.DataFrame({"Actual_y": test_y.values.ravel(), "Predicted_y": pred_y, "gender" : prot_attr}) + +privileged_groups = [{'gender': 1}] +unprivileged_groups = [{'gender': 0}] + +binaryLabelDataset = BinaryLabelDataset(favorable_label=1, unfavorable_label=0, + df=output, label_names=['Actual_y'], + protected_attribute_names=['gender']) + +binaryLabelDataset_pred = binaryLabelDataset.copy() +binaryLabelDataset_pred.labels = output['Predicted_y'].values.reshape(-1, 1) + +metric = ClassificationMetric(binaryLabelDataset, binaryLabelDataset_pred, + unprivileged_groups=unprivileged_groups, + privileged_groups=privileged_groups) + +accuracy = metric.accuracy() + +disparate_impact = metric.disparate_impact() + +print('accuracy score:', accuracy) +print('disparate impact ratio', disparate_impact) + +output.to_csv("submission.csv", index=False) + diff --git a/fairnessBench/benchmarks/non_descriptive_ad-di-race/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/health-di-sex/scripts/env_read_only_files.txt similarity index 100% rename from fairnessBench/benchmarks/non_descriptive_ad-di-race/scripts/env_read_only_files.txt rename to fairnessBench/benchmarks/health-di-sex/scripts/env_read_only_files.txt diff --git a/fairnessBench/benchmarks/health-di-sex/scripts/eval.py b/fairnessBench/benchmarks/health-di-sex/scripts/eval.py new file mode 100644 index 0000000..ede1df7 --- /dev/null +++ b/fairnessBench/benchmarks/health-di-sex/scripts/eval.py @@ -0,0 +1,45 @@ +import os +import numpy as np +import pandas as pd +from aif360.datasets import BinaryLabelDataset +from aif360.metrics import ClassificationMetric + +def get_score(agent_files): + submission_path = os.path.join(agent_files, "submission.csv") + submission = pd.read_csv(submission_path) + + privileged_groups = [{'gender': 1}] + 
unprivileged_groups = [{'gender': 0}] + + binaryLabelDataset = BinaryLabelDataset(favorable_label=1, unfavorable_label=0, + df=submission, label_names=['Actual_y'], + protected_attribute_names=['gender']) + + binaryLabelDataset_pred = binaryLabelDataset.copy() + binaryLabelDataset_pred.labels = submission['Predicted_y'].values.reshape(-1, 1) + + metric = ClassificationMetric(binaryLabelDataset, binaryLabelDataset_pred, + unprivileged_groups=unprivileged_groups, + privileged_groups=privileged_groups) + + accuracy = metric.accuracy() + precision = metric.precision() + recall = metric.true_positive_rate() + + disparate_impact = metric.disparate_impact() + statistical_parity_diff = metric.statistical_parity_difference() + equal_opp_diff = metric.equal_opportunity_difference() + error_rate_diff = metric.error_rate_difference() + error_rate_ratio = metric.error_rate_ratio() + false_omission_rate_diff = metric.false_omission_rate_difference() + + performance = {"acc": accuracy, "precision": precision, "recall": recall, "di": disparate_impact, + "statistical_parity_diff": statistical_parity_diff, "equal_opp_diff": equal_opp_diff, + "error_rate_diff": error_rate_diff, "error_rate_ratio" : error_rate_ratio, + "false_omission_rate_diff": false_omission_rate_diff} + return performance + + +if __name__ == "__main__": + print(get_score()) + \ No newline at end of file diff --git a/fairnessBench/benchmarks/health-di-sex/scripts/prepare.py b/fairnessBench/benchmarks/health-di-sex/scripts/prepare.py new file mode 100644 index 0000000..b86b911 --- /dev/null +++ b/fairnessBench/benchmarks/health-di-sex/scripts/prepare.py @@ -0,0 +1,104 @@ +# prepared by user when setting up + +import os +import numpy as np +import pandas as pd +from ucimlrepo import fetch_ucirepo +from sklearn.preprocessing import MinMaxScaler +from sklearn.preprocessing import LabelEncoder +from sklearn.model_selection import train_test_split + +adult = fetch_ucirepo(id=2) + +X = adult.data.features.copy() +y = 
adult.data.targets.copy() + +y["income"] = y["income"].str.rstrip('.') +y["income"] = y["income"].apply(lambda i: 1 if i == '>50K' else 0) +X["sex"] = X["sex"].apply(lambda i: 1 if i == 'Male' else 0) +X = X.replace('?', np.nan) + +ad = pd.concat([X, y], axis=1) +ad = ad.dropna() + +# SR: scaling the values in cols capital-gain, capital-loss and hours-per-week into a realistic range. It preserves relative ordering while only changing the scale for respective cols. + +cholesterol_scale = MinMaxScaler(feature_range=(100, 300)) +ad['cholesterol'] = cholesterol_scale.fit_transform(ad[['capital-gain']]) + +bp_scale = MinMaxScaler(feature_range=(80, 180)) +ad['blood-pressure'] = bp_scale.fit_transform(ad[['capital-loss']]) + +activity_scale = MinMaxScaler(feature_range=(0, 24)) +ad['daily-activity-hours'] = activity_scale.fit_transform(ad[['hours-per-week']]) + +# dropping unwanted cols, we have never used these two cols ('fnlwgt', 'education-num') in any analysis so far with adult data. +ad.drop(['capital-gain', 'capital-loss', 'hours-per-week', 'fnlwgt', 'education-num'], axis=1, inplace=True) + +# column values mapping eg: occupation = Adm-clerical is mapped to procedure = Administrative-check +occupation_to_procedure = {'Adm-clerical': 'Administrative-check', 'Exec-managerial': 'Specialist-consultation', + 'Handlers-cleaners': 'Minor-procedure', 'Prof-specialty': 'Major-surgery', + 'Other-service': 'General-checkup', 'Sales': 'Diagnostic-test', + 'Craft-repair': 'Rehabilitation', 'Transport-moving': 'Emergency', + 'Farming-fishing': 'Preventive-screening', 'Machine-op-inspct': 'Therapy-session', + 'Tech-support': 'Follow-up', 'Protective-serv': 'Mental-health-consultation', + 'Armed-Forces': 'Chronic-disease-management', 'Priv-house-serv': 'Vaccination'} + + +# column values mapping eg: country = United-States is mapped to reason = General-checkup +country_to_reason = {'United-States': 'General-checkup', 'Cuba': 'Routine-follow-up', 'Jamaica': 'New-symptom', + 
'India': 'Fever', 'Mexico': 'Cold-Flu-Cough', 'South': 'Headache-migraine', + 'Puerto-Rico': 'Heart-concerns', 'Honduras': 'Respiratory-issue', 'England': 'Digestive-issue', + 'Canada': 'Pain-related', 'Germany': 'Skin-concern', 'Iran': 'Eye', 'Philippines': 'ENT', + 'Italy': 'Mental-health-concerns', 'Poland': 'Stress', 'Columbia': 'Fatigue-weakness', + 'Cambodia': 'Weight-concerns', 'Thailand': 'Diabetes/Bloodsugar', 'Ecuador': 'BP-concern', + 'Laos': 'Cholesterol-lipid-concern', 'Taiwan': 'Allergy-symptoms', 'Haiti': 'Prenatal', + 'Portugal': 'Gynecological-concern', 'Dominican-Republic': 'Pediatrics', + 'El-Salvador': 'Age-related', 'France': 'Chronic-disease', 'Guatemala': 'Medication-side-effect', + 'China': 'Injury', 'Japan': 'Trauma', 'Yugoslavia': 'Preventive-counseling', 'Peru': 'Genetic-risks', + 'Outlying-US(Guam-USVI-etc)': 'Vaccination-inquiry', 'Scotland': 'Health-counseling', + 'Trinadad&Tobago': 'Substance-concern', 'Greece': 'Surgical-consultation', + 'Nicaragua': 'Specialist-referral', 'Vietnam': 'Discharge', 'Hong': 'Lab-result-discussion', + 'Ireland': 'Second-opinion', 'Hungary': 'Administrative-inquiry', 'Holand-Netherlands': 'Other'} + +# column values mapping eg: workclass = Private is mapped to lifestyle = Sedentary +workclass_to_lifestyle = { 'Private':'Sedentary', 'Self-emp-not-inc':'Moderately-active', 'Local-gov':'Active', + 'State-gov': 'Highly-active', 'Self-emp-inc': 'Unhealthy', 'Federal-gov': 'Healthy', + 'Without-pay': 'At-risk'} + +# column values mapping eg: education = HS-grad is mapped to insurance_provider = UnitedHealthcare +education_to_insurance_provider = {'HS-grad':'UnitedHealthcare', 'Some-college': 'Blue Cross Blue Shield', 'Bachelors': 'Aetna', + 'Masters': 'Cigna', 'Assoc-voc': 'Humana', '11th': 'Kaiser Permanente', 'Assoc-acdm': 'Anthem', + '10th': 'Molina Healthcare', '7th-8th': 'Centene Corporation', 'Prof-school': 'Highmark Health', + '9th': 'Health Net', '12th':'WellCare Health Plans', 'Doctorate':'CareFirst', 
+ '5th-6th': 'Independence Blue Cross', '1st-4th': 'Fidelis Care', 'Preschool': 'Oscar Health'} + +new_col_list = [] + +new_col_list.append(pd.Series(ad['occupation'].map(occupation_to_procedure),name='procedure_on_the_day')) +new_col_list.append(pd.Series(ad['native-country'].map(country_to_reason),name='reason_for_visit')) +new_col_list.append(pd.Series(ad['workclass'].map(workclass_to_lifestyle), name='lifestyle')) +new_col_list.append(pd.Series(ad['education'].map(education_to_insurance_provider),name='insurance_provider')) + +ad = pd.concat([ad] + new_col_list, axis=1) + +ad = ad.drop(['occupation', 'native-country', 'workclass', 'education'], axis=1) +ad = ad.rename(columns={'age': 'patient_age', 'sex': 'gender', 'income': 'health_risk'}) + +cat_cols = ['marital-status', 'relationship', 'race', 'lifestyle', + 'insurance_provider', 'procedure_on_the_day', 'reason_for_visit'] + + +ad_enc = pd.get_dummies(ad, columns=cat_cols) +ad_enc = ad_enc.astype(int) + +adX = ad_enc.drop(columns=['health_risk']) +ady = ad_enc['health_risk'] + +train_X, test_X, train_y, test_y = train_test_split(adX, ady, test_size=0.2, random_state=1) + +os.chdir("../env/") +train_X.to_csv("train_X.csv") +train_y.to_csv("train_y.csv") +test_X.to_csv("test_X.csv") +test_y.to_csv("test_y.csv") \ No newline at end of file diff --git a/fairnessBench/benchmarks/non_descriptive_ad-di-race/scripts/read_only_files.txt b/fairnessBench/benchmarks/health-di-sex/scripts/read_only_files.txt similarity index 100% rename from fairnessBench/benchmarks/non_descriptive_ad-di-race/scripts/read_only_files.txt rename to fairnessBench/benchmarks/health-di-sex/scripts/read_only_files.txt diff --git a/fairnessBench/benchmarks/health-eod-race/env/data_description.txt b/fairnessBench/benchmarks/health-eod-race/env/data_description.txt new file mode 100644 index 0000000..1ef9d2f --- /dev/null +++ b/fairnessBench/benchmarks/health-eod-race/env/data_description.txt @@ -0,0 +1,125 @@ +patient_age: Integer + +lifestyle: 
Categorical feature + Sedentary + Moderately-active + Active + Highly-active + Unhealthy + Healthy + At-risk + +insurance_provider : Categorical + UnitedHealthcare + Blue Cross Blue Shield + Aetna + Cigna + Humana + Kaiser Permanente + Anthem + Molina Healthcare + Centene Corporation + Highmark Health + Health Net + WellCare Health Plans + CareFirst + Independence Blue Cross + Fidelis Care + Oscar Health + +marital-status: Categorical + Married-civ-spouse, + Divorced, + Never-married, + Separated, + Widowed, + Married-spouse-absent, + Married-AF-spouse + +procedure_on_the_day : Categorical + Administrative-check + Specialist-consultation + Minor-procedure + Major-surgery + General-checkup + Diagnostic-test + Rehabilitation + Emergency + Preventive-screening + Therapy-session + Follow-up + Mental-health-consultation + Chronic-disease-management + Vaccination + +relationship : Categorical + Wife, + Own-child, + Husband, + Not-in-family, + Other-relative, + Unmarried + +race : Categorical + White, + Asian-Pac-Islander, + Amer-Indian-Eskimo, + Other, + Black + +gender : Binary + Female + Male + +cholesterol : Integer + +blood-pressure : Integer + +daily-activity-hours : Integer + +reason_for_visit : Categorical + General-checkup + Routine-follow-up + New-symptom + Fever + Cold-Flu-Cough + Headache-migraine + Heart-concerns + Respiratory-issue + Digestive-issue + Pain-related + Skin-concern + Eye + ENT + Mental-health-concerns + Stress + Fatigue-weakness + Weight-concerns + Diabetes/Bloodsugar + BP-concern + Cholesterol-lipid-concern + Allergy-symptoms + Prenatal + Gynecological-concern + Pediatrics + Age-related + Chronic-disease + Medication-side-effect + Injury + Trauma + Preventive-counseling + Genetic-risks + Vaccination-inquiry + Health-counseling + Substance-concern + Surgical-consultation + Specialist-referral + Discharge + Lab-result-discussion + Second-opinion + Administrative-inquiry + Other + +health_risk : binary target + 1 + 0 \ No newline at end of 
file diff --git a/fairnessBench/benchmarks/health-eod-race/env/train.py b/fairnessBench/benchmarks/health-eod-race/env/train.py new file mode 100644 index 0000000..8ba7f61 --- /dev/null +++ b/fairnessBench/benchmarks/health-eod-race/env/train.py @@ -0,0 +1,44 @@ +import numpy as np +import pandas as pd +from aif360.datasets import BinaryLabelDataset +from aif360.metrics import ClassificationMetric +from sklearn.linear_model import LogisticRegression + + +train_X = pd.read_csv("train_X.csv", index_col=0) +train_y = pd.read_csv("train_y.csv", index_col=0) +test_X = pd.read_csv("test_X.csv", index_col=0) +test_y = pd.read_csv("test_y.csv", index_col=0) + +lr = LogisticRegression(max_iter=1000) +lr.fit(train_X, train_y) + +pred_y = lr.predict(test_X) + +prot_attr = test_X['race'] + +output = pd.DataFrame({"Actual_y": test_y.values.ravel(), "Predicted_y": pred_y, "race" : prot_attr}) + +privileged_groups = [{'race': 1}] +unprivileged_groups = [{'race': 0}] + +binaryLabelDataset = BinaryLabelDataset(favorable_label=1, unfavorable_label=0, + df=output, label_names=['Actual_y'], + protected_attribute_names=['race']) + +binaryLabelDataset_pred = binaryLabelDataset.copy() +binaryLabelDataset_pred.labels = output['Predicted_y'].values.reshape(-1, 1) + +metric = ClassificationMetric(binaryLabelDataset, binaryLabelDataset_pred, + unprivileged_groups=unprivileged_groups, + privileged_groups=privileged_groups) + +accuracy = metric.accuracy() + +equal_opp_diff = metric.equal_opportunity_difference() + +print('accuracy score:', accuracy) +print('equal opportunity difference', equal_opp_diff) + +output.to_csv("submission.csv", index=False) + diff --git a/fairnessBench/benchmarks/health-eod-race/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/health-eod-race/scripts/env_read_only_files.txt new file mode 100644 index 0000000..5ee277f --- /dev/null +++ b/fairnessBench/benchmarks/health-eod-race/scripts/env_read_only_files.txt @@ -0,0 +1,4 @@ +./train_X.csv +./train_y.csv 
+./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/health-eod-race/scripts/eval.py b/fairnessBench/benchmarks/health-eod-race/scripts/eval.py new file mode 100644 index 0000000..88dd899 --- /dev/null +++ b/fairnessBench/benchmarks/health-eod-race/scripts/eval.py @@ -0,0 +1,45 @@ +import os +import numpy as np +import pandas as pd +from aif360.datasets import BinaryLabelDataset +from aif360.metrics import ClassificationMetric + +def get_score(agent_files): + submission_path = os.path.join(agent_files, "submission.csv") + submission = pd.read_csv(submission_path) + + privileged_groups = [{'race': 1}] + unprivileged_groups = [{'race': 0}] + + binaryLabelDataset = BinaryLabelDataset(favorable_label=1, unfavorable_label=0, + df=submission, label_names=['Actual_y'], + protected_attribute_names=['race']) + + binaryLabelDataset_pred = binaryLabelDataset.copy() + binaryLabelDataset_pred.labels = submission['Predicted_y'].values.reshape(-1, 1) + + metric = ClassificationMetric(binaryLabelDataset, binaryLabelDataset_pred, + unprivileged_groups=unprivileged_groups, + privileged_groups=privileged_groups) + + accuracy = metric.accuracy() + precision = metric.precision() + recall = metric.true_positive_rate() + + disparate_impact = metric.disparate_impact() + statistical_parity_diff = metric.statistical_parity_difference() + equal_opp_diff = metric.equal_opportunity_difference() + error_rate_diff = metric.error_rate_difference() + error_rate_ratio = metric.error_rate_ratio() + false_omission_rate_diff = metric.false_omission_rate_difference() + + performance = {"acc": accuracy, "precision": precision, "recall": recall, "di": disparate_impact, + "statistical_parity_diff": statistical_parity_diff, "equal_opp_diff": equal_opp_diff, + "error_rate_diff": error_rate_diff, "error_rate_ratio" : error_rate_ratio, + "false_omission_rate_diff": false_omission_rate_diff} + return performance + + +if __name__ == "__main__": + print(get_score(".")) + \ No newline at end of file 
diff --git a/fairnessBench/benchmarks/health-eod-race/scripts/prepare.py b/fairnessBench/benchmarks/health-eod-race/scripts/prepare.py new file mode 100644 index 0000000..d79ea06 --- /dev/null +++ b/fairnessBench/benchmarks/health-eod-race/scripts/prepare.py @@ -0,0 +1,104 @@ +# prepared by user when setting up + +import os +import numpy as np +import pandas as pd +from ucimlrepo import fetch_ucirepo +from sklearn.preprocessing import MinMaxScaler +from sklearn.preprocessing import LabelEncoder +from sklearn.model_selection import train_test_split + +adult = fetch_ucirepo(id=2) + +X = adult.data.features.copy() +y = adult.data.targets.copy() + +y["income"] = y["income"].str.rstrip('.') +y["income"] = y["income"].apply(lambda i: 1 if i == '>50K' else 0) +X["race"] = X["race"].apply(lambda i: 1 if i == 'White' else 0) +X = X.replace('?', np.nan) + +ad = pd.concat([X, y], axis=1) +ad = ad.dropna() + +# SR: scaling the values in cols capital-gain, capital-loss and hours-per-week into a realistic range. It preserves relative ordering while only changing the scale for respective cols. + +cholesterol_scale = MinMaxScaler(feature_range=(100, 300)) +ad['cholesterol'] = cholesterol_scale.fit_transform(ad[['capital-gain']]) + +bp_scale = MinMaxScaler(feature_range=(80, 180)) +ad['blood-pressure'] = bp_scale.fit_transform(ad[['capital-loss']]) + +activity_scale = MinMaxScaler(feature_range=(0, 24)) +ad['daily-activity-hours'] = activity_scale.fit_transform(ad[['hours-per-week']]) + +# dropping unwanted cols, we have never used these two cols ('fnlwgt', 'education-num') in any analysis so far with adult data. 
+ad.drop(['capital-gain', 'capital-loss', 'hours-per-week', 'fnlwgt', 'education-num'], axis=1, inplace=True) + +# column values mapping eg: occupation = Adm-clerical is mapped to procedure = Administrative-check +occupation_to_procedure = {'Adm-clerical': 'Administrative-check', 'Exec-managerial': 'Specialist-consultation', + 'Handlers-cleaners': 'Minor-procedure', 'Prof-specialty': 'Major-surgery', + 'Other-service': 'General-checkup', 'Sales': 'Diagnostic-test', + 'Craft-repair': 'Rehabilitation', 'Transport-moving': 'Emergency', + 'Farming-fishing': 'Preventive-screening', 'Machine-op-inspct': 'Therapy-session', + 'Tech-support': 'Follow-up', 'Protective-serv': 'Mental-health-consultation', + 'Armed-Forces': 'Chronic-disease-management', 'Priv-house-serv': 'Vaccination'} + + +# column values mapping eg: country = United-States is mapped to reason = General-checkup +country_to_reason = {'United-States': 'General-checkup', 'Cuba': 'Routine-follow-up', 'Jamaica': 'New-symptom', + 'India': 'Fever', 'Mexico': 'Cold-Flu-Cough', 'South': 'Headache-migraine', + 'Puerto-Rico': 'Heart-concerns', 'Honduras': 'Respiratory-issue', 'England': 'Digestive-issue', + 'Canada': 'Pain-related', 'Germany': 'Skin-concern', 'Iran': 'Eye', 'Philippines': 'ENT', + 'Italy': 'Mental-health-concerns', 'Poland': 'Stress', 'Columbia': 'Fatigue-weakness', + 'Cambodia': 'Weight-concerns', 'Thailand': 'Diabetes/Bloodsugar', 'Ecuador': 'BP-concern', + 'Laos': 'Cholesterol-lipid-concern', 'Taiwan': 'Allergy-symptoms', 'Haiti': 'Prenatal', + 'Portugal': 'Gynecological-concern', 'Dominican-Republic': 'Pediatrics', + 'El-Salvador': 'Age-related', 'France': 'Chronic-disease', 'Guatemala': 'Medication-side-effect', + 'China': 'Injury', 'Japan': 'Trauma', 'Yugoslavia': 'Preventive-counseling', 'Peru': 'Genetic-risks', + 'Outlying-US(Guam-USVI-etc)': 'Vaccination-inquiry', 'Scotland': 'Health-counseling', + 'Trinadad&Tobago': 'Substance-concern', 'Greece': 'Surgical-consultation', + 'Nicaragua': 
'Specialist-referral', 'Vietnam': 'Discharge', 'Hong': 'Lab-result-discussion', + 'Ireland': 'Second-opinion', 'Hungary': 'Administrative-inquiry', 'Holand-Netherlands': 'Other'} + +# column values mapping eg: workclass = Private is mapped to lifestyle = Sedentary +workclass_to_lifestyle = { 'Private':'Sedentary', 'Self-emp-not-inc':'Moderately-active', 'Local-gov':'Active', + 'State-gov': 'Highly-active', 'Self-emp-inc': 'Unhealthy', 'Federal-gov': 'Healthy', + 'Without-pay': 'At-risk'} + +# column values mapping eg: education = HS-grad is mapped to insurance_provider = UnitedHealthcare +education_to_insurance_provider = {'HS-grad':'UnitedHealthcare', 'Some-college': 'Blue Cross Blue Shield', 'Bachelors': 'Aetna', + 'Masters': 'Cigna', 'Assoc-voc': 'Humana', '11th': 'Kaiser Permanente', 'Assoc-acdm': 'Anthem', + '10th': 'Molina Healthcare', '7th-8th': 'Centene Corporation', 'Prof-school': 'Highmark Health', + '9th': 'Health Net', '12th':'WellCare Health Plans', 'Doctorate':'CareFirst', + '5th-6th': 'Independence Blue Cross', '1st-4th': 'Fidelis Care', 'Preschool': 'Oscar Health'} + +new_col_list = [] + +new_col_list.append(pd.Series(ad['occupation'].map(occupation_to_procedure),name='procedure_on_the_day')) +new_col_list.append(pd.Series(ad['native-country'].map(country_to_reason),name='reason_for_visit')) +new_col_list.append(pd.Series(ad['workclass'].map(workclass_to_lifestyle), name='lifestyle')) +new_col_list.append(pd.Series(ad['education'].map(education_to_insurance_provider),name='insurance_provider')) + +ad = pd.concat([ad] + new_col_list, axis=1) + +ad = ad.drop(['occupation', 'native-country', 'workclass', 'education'], axis=1) +ad = ad.rename(columns={'age': 'patient_age', 'sex': 'gender', 'income': 'health_risk'}) + +cat_cols = ['marital-status', 'relationship', 'gender', 'lifestyle', + 'insurance_provider', 'procedure_on_the_day', 'reason_for_visit'] + + +ad_enc = pd.get_dummies(ad, columns=cat_cols) +ad_enc = ad_enc.astype(int) + +adX = 
ad_enc.drop(columns=['health_risk']) +ady = ad_enc['health_risk'] + +train_X, test_X, train_y, test_y = train_test_split(adX, ady, test_size=0.2, random_state=1) + +os.chdir("../env/") +train_X.to_csv("train_X.csv") +train_y.to_csv("train_y.csv") +test_X.to_csv("test_X.csv") +test_y.to_csv("test_y.csv") \ No newline at end of file diff --git a/fairnessBench/benchmarks/health-eod-race/scripts/read_only_files.txt b/fairnessBench/benchmarks/health-eod-race/scripts/read_only_files.txt new file mode 100644 index 0000000..5ee277f --- /dev/null +++ b/fairnessBench/benchmarks/health-eod-race/scripts/read_only_files.txt @@ -0,0 +1,4 @@ +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/health-eod-sex/env/data_description.txt b/fairnessBench/benchmarks/health-eod-sex/env/data_description.txt new file mode 100644 index 0000000..1ef9d2f --- /dev/null +++ b/fairnessBench/benchmarks/health-eod-sex/env/data_description.txt @@ -0,0 +1,125 @@ +patient_age: Integer + +lifestyle: Categorical feature + Sedentary + Moderately-active + Active + Highly-active + Unhealthy + Healthy + At-risk + +insurance_provider : Categorical + UnitedHealthcare + Blue Cross Blue Shield + Aetna + Cigna + Humana + Kaiser Permanente + Anthem + Molina Healthcare + Centene Corporation + Highmark Health + Health Net + WellCare Health Plans + CareFirst + Independence Blue Cross + Fidelis Care + Oscar Health + +marital-status: Categorical + Married-civ-spouse, + Divorced, + Never-married, + Separated, + Widowed, + Married-spouse-absent, + Married-AF-spouse + +procedure_on_the_visit : Categorical + Administrative-check + Specialist-consultation + Minor-procedure + Major-surgery + General-checkup + Diagnostic-test + Rehabilitation + Emergency + Preventive-screening + Therapy-session + Follow-up + Mental-health-consultation + Chronic-disease-management + Vaccination + +relationship : Categorical + Wife, + Own-child, + Husband, + Not-in-family, + Other-relative, + 
Unmarried + +race : Categorical + White, + Asian-Pac-Islander, + Amer-Indian-Eskimo, + Other, + Black + +gender : Binary + Female + Male + +cholesterol : Integer + +blood-pressure : Integer + +daily-activity-hours : Integer + +reason_for_visit : Categorical + General-checkup + Routine-follow-up + New-symptom + Fever + Cold-Flu-Cough + Headache-migraine + Heart-concerns + Respiratory-issue + Digestive-issue + Pain-related + Skin-concern + Eye + ENT + Mental-health-concerns + Stress + Fatigue-weakness + Weight-concerns + Diabetes/Bloodsugar + BP-concern + Cholesterol-lipid-concern + Allergy-symptoms + Prenatal + Gynecological-concern + Pediatrics + Age-related + Chronic-disease + Medication-side-effect + Injury + Trauma + Preventive-counseling + Genetic-risks + Vaccination-inquiry + Health-counseling + Substance-concern + Surgical-consultation + Specialist-referral + Discharge + Lab-result-discussion + Second-opinion + Administrative-inquiry + Other + +health_risk : binary target + 1 + 0 \ No newline at end of file diff --git a/fairnessBench/benchmarks/health-eod-sex/env/train.py b/fairnessBench/benchmarks/health-eod-sex/env/train.py new file mode 100644 index 0000000..2888465 --- /dev/null +++ b/fairnessBench/benchmarks/health-eod-sex/env/train.py @@ -0,0 +1,44 @@ +import numpy as np +import pandas as pd +from aif360.datasets import BinaryLabelDataset +from aif360.metrics import ClassificationMetric +from sklearn.linear_model import LogisticRegression + + +train_X = pd.read_csv("train_X.csv", index_col=0) +train_y = pd.read_csv("train_y.csv", index_col=0) +test_X = pd.read_csv("test_X.csv", index_col=0) +test_y = pd.read_csv("test_y.csv", index_col=0) + +lr = LogisticRegression(max_iter=1000) +lr.fit(train_X, train_y) + +pred_y = lr.predict(test_X) + +prot_attr = test_X['gender'] + +output = pd.DataFrame({"Actual_y": test_y.values.ravel(), "Predicted_y": pred_y, "gender" : prot_attr}) + +privileged_groups = [{'gender': 1}] +unprivileged_groups = [{'gender': 0}] + 
+binaryLabelDataset = BinaryLabelDataset(favorable_label=1, unfavorable_label=0, + df=output, label_names=['Actual_y'], + protected_attribute_names=['gender']) + +binaryLabelDataset_pred = binaryLabelDataset.copy() +binaryLabelDataset_pred.labels = output['Predicted_y'].values.reshape(-1, 1) + +metric = ClassificationMetric(binaryLabelDataset, binaryLabelDataset_pred, + unprivileged_groups=unprivileged_groups, + privileged_groups=privileged_groups) + +accuracy = metric.accuracy() + +equal_opp_diff = metric.equal_opportunity_difference() + +print('accuracy score:', accuracy) +print('equal opportunity difference', equal_opp_diff) + +output.to_csv("submission.csv", index=False) + diff --git a/fairnessBench/benchmarks/health-eod-sex/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/health-eod-sex/scripts/env_read_only_files.txt new file mode 100644 index 0000000..5ee277f --- /dev/null +++ b/fairnessBench/benchmarks/health-eod-sex/scripts/env_read_only_files.txt @@ -0,0 +1,4 @@ +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/health-eod-sex/scripts/eval.py b/fairnessBench/benchmarks/health-eod-sex/scripts/eval.py new file mode 100644 index 0000000..ede1df7 --- /dev/null +++ b/fairnessBench/benchmarks/health-eod-sex/scripts/eval.py @@ -0,0 +1,45 @@ +import os +import numpy as np +import pandas as pd +from aif360.datasets import BinaryLabelDataset +from aif360.metrics import ClassificationMetric + +def get_score(agent_files): + submission_path = os.path.join(agent_files, "submission.csv") + submission = pd.read_csv(submission_path) + + privileged_groups = [{'gender': 1}] + unprivileged_groups = [{'gender': 0}] + + binaryLabelDataset = BinaryLabelDataset(favorable_label=1, unfavorable_label=0, + df=submission, label_names=['Actual_y'], + protected_attribute_names=['gender']) + + binaryLabelDataset_pred = binaryLabelDataset.copy() + binaryLabelDataset_pred.labels = submission['Predicted_y'].values.reshape(-1, 1) + + 
 metric = ClassificationMetric(binaryLabelDataset, binaryLabelDataset_pred, + unprivileged_groups=unprivileged_groups, + privileged_groups=privileged_groups) + + accuracy = metric.accuracy() + precision = metric.precision() + recall = metric.true_positive_rate() + + disparate_impact = metric.disparate_impact() + statistical_parity_diff = metric.statistical_parity_difference() + equal_opp_diff = metric.equal_opportunity_difference() + error_rate_diff = metric.error_rate_difference() + error_rate_ratio = metric.error_rate_ratio() + false_omission_rate_diff = metric.false_omission_rate_difference() + + performance = {"acc": accuracy, "precision": precision, "recall": recall, "di": disparate_impact, + "statistical_parity_diff": statistical_parity_diff, "equal_opp_diff": equal_opp_diff, + "error_rate_diff": error_rate_diff, "error_rate_ratio" : error_rate_ratio, + "false_omission_rate_diff": false_omission_rate_diff} + return performance + + +if __name__ == "__main__": + print(get_score(".")) + \ No newline at end of file diff --git a/fairnessBench/benchmarks/health-eod-sex/scripts/prepare.py b/fairnessBench/benchmarks/health-eod-sex/scripts/prepare.py new file mode 100644 index 0000000..b86b911 --- /dev/null +++ b/fairnessBench/benchmarks/health-eod-sex/scripts/prepare.py @@ -0,0 +1,104 @@ +# prepared by user when setting up + +import os +import numpy as np +import pandas as pd +from ucimlrepo import fetch_ucirepo +from sklearn.preprocessing import MinMaxScaler +from sklearn.preprocessing import LabelEncoder +from sklearn.model_selection import train_test_split + +adult = fetch_ucirepo(id=2) + +X = adult.data.features.copy() +y = adult.data.targets.copy() + +y["income"] = y["income"].str.rstrip('.') +y["income"] = y["income"].apply(lambda i: 1 if i == '>50K' else 0) +X["sex"] = X["sex"].apply(lambda i: 1 if i == 'Male' else 0) +X = X.replace('?', np.nan) + +ad = pd.concat([X, y], axis=1) +ad = ad.dropna() + +# SR: scaling the values in cols capital-gain, capital-loss and 
hours-per-week into a realistic range. It preserves relative ordering while only changing the scale for respective cols. + +cholesterol_scale = MinMaxScaler(feature_range=(100, 300)) +ad['cholesterol'] = cholesterol_scale.fit_transform(ad[['capital-gain']]) + +bp_scale = MinMaxScaler(feature_range=(80, 180)) +ad['blood-pressure'] = bp_scale.fit_transform(ad[['capital-loss']]) + +activity_scale = MinMaxScaler(feature_range=(0, 24)) +ad['daily-activity-hours'] = activity_scale.fit_transform(ad[['hours-per-week']]) + +# dropping unwanted cols, we have never used these two cols ('fnlwgt', 'education-num') in any analysis so far with adult data. +ad.drop(['capital-gain', 'capital-loss', 'hours-per-week', 'fnlwgt', 'education-num'], axis=1, inplace=True) + +# column values mapping eg: occupation = Adm-clerical is mapped to procedure = Administrative-check +occupation_to_procedure = {'Adm-clerical': 'Administrative-check', 'Exec-managerial': 'Specialist-consultation', + 'Handlers-cleaners': 'Minor-procedure', 'Prof-specialty': 'Major-surgery', + 'Other-service': 'General-checkup', 'Sales': 'Diagnostic-test', + 'Craft-repair': 'Rehabilitation', 'Transport-moving': 'Emergency', + 'Farming-fishing': 'Preventive-screening', 'Machine-op-inspct': 'Therapy-session', + 'Tech-support': 'Follow-up', 'Protective-serv': 'Mental-health-consultation', + 'Armed-Forces': 'Chronic-disease-management', 'Priv-house-serv': 'Vaccination'} + + +# column values mapping eg: country = United-States is mapped to reason = General-checkup +country_to_reason = {'United-States': 'General-checkup', 'Cuba': 'Routine-follow-up', 'Jamaica': 'New-symptom', + 'India': 'Fever', 'Mexico': 'Cold-Flu-Cough', 'South': 'Headache-migraine', + 'Puerto-Rico': 'Heart-concerns', 'Honduras': 'Respiratory-issue', 'England': 'Digestive-issue', + 'Canada': 'Pain-related', 'Germany': 'Skin-concern', 'Iran': 'Eye', 'Philippines': 'ENT', + 'Italy': 'Mental-health-concerns', 'Poland': 'Stress', 'Columbia': 'Fatigue-weakness', 
+ 'Cambodia': 'Weight-concerns', 'Thailand': 'Diabetes/Bloodsugar', 'Ecuador': 'BP-concern', + 'Laos': 'Cholesterol-lipid-concern', 'Taiwan': 'Allergy-symptoms', 'Haiti': 'Prenatal', + 'Portugal': 'Gynecological-concern', 'Dominican-Republic': 'Pediatrics', + 'El-Salvador': 'Age-related', 'France': 'Chronic-disease', 'Guatemala': 'Medication-side-effect', + 'China': 'Injury', 'Japan': 'Trauma', 'Yugoslavia': 'Preventive-counseling', 'Peru': 'Genetic-risks', + 'Outlying-US(Guam-USVI-etc)': 'Vaccination-inquiry', 'Scotland': 'Health-counseling', + 'Trinadad&Tobago': 'Substance-concern', 'Greece': 'Surgical-consultation', + 'Nicaragua': 'Specialist-referral', 'Vietnam': 'Discharge', 'Hong': 'Lab-result-discussion', + 'Ireland': 'Second-opinion', 'Hungary': 'Administrative-inquiry', 'Holand-Netherlands': 'Other'} + +# column values mapping eg: workclass = Private is mapped to lifestyle = Sedentary +workclass_to_lifestyle = { 'Private':'Sedentary', 'Self-emp-not-inc':'Moderately-active', 'Local-gov':'Active', + 'State-gov': 'Highly-active', 'Self-emp-inc': 'Unhealthy', 'Federal-gov': 'Healthy', + 'Without-pay': 'At-risk'} + +# column values mapping eg: education = HS-grad is mapped to insurance_provider = UnitedHealthcare +education_to_insurance_provider = {'HS-grad':'UnitedHealthcare', 'Some-college': 'Blue Cross Blue Shield', 'Bachelors': 'Aetna', + 'Masters': 'Cigna', 'Assoc-voc': 'Humana', '11th': 'Kaiser Permanente', 'Assoc-acdm': 'Anthem', + '10th': 'Molina Healthcare', '7th-8th': 'Centene Corporation', 'Prof-school': 'Highmark Health', + '9th': 'Health Net', '12th':'WellCare Health Plans', 'Doctorate':'CareFirst', + '5th-6th': 'Independence Blue Cross', '1st-4th': 'Fidelis Care', 'Preschool': 'Oscar Health'} + +new_col_list = [] + +new_col_list.append(pd.Series(ad['occupation'].map(occupation_to_procedure),name='procedure_on_the_day')) +new_col_list.append(pd.Series(ad['native-country'].map(country_to_reason),name='reason_for_visit')) 
+new_col_list.append(pd.Series(ad['workclass'].map(workclass_to_lifestyle), name='lifestyle')) +new_col_list.append(pd.Series(ad['education'].map(education_to_insurance_provider),name='insurance_provider')) + +ad = pd.concat([ad] + new_col_list, axis=1) + +ad = ad.drop(['occupation', 'native-country', 'workclass', 'education'], axis=1) +ad = ad.rename(columns={'age': 'patient_age', 'sex': 'gender', 'income': 'health_risk'}) + +cat_cols = ['marital-status', 'relationship', 'race', 'lifestyle', + 'insurance_provider', 'procedure_on_the_day', 'reason_for_visit'] + + +ad_enc = pd.get_dummies(ad, columns=cat_cols) +ad_enc = ad_enc.astype(int) + +adX = ad_enc.drop(columns=['health_risk']) +ady = ad_enc['health_risk'] + +train_X, test_X, train_y, test_y = train_test_split(adX, ady, test_size=0.2, random_state=1) + +os.chdir("../env/") +train_X.to_csv("train_X.csv") +train_y.to_csv("train_y.csv") +test_X.to_csv("test_X.csv") +test_y.to_csv("test_y.csv") \ No newline at end of file diff --git a/fairnessBench/benchmarks/health-eod-sex/scripts/read_only_files.txt b/fairnessBench/benchmarks/health-eod-sex/scripts/read_only_files.txt new file mode 100644 index 0000000..5ee277f --- /dev/null +++ b/fairnessBench/benchmarks/health-eod-sex/scripts/read_only_files.txt @@ -0,0 +1,4 @@ +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/non_descriptive_ad-di-race/env/data_description.txt b/fairnessBench/benchmarks/non_descriptive_ad-di-patt1/env/data_description.txt similarity index 100% rename from fairnessBench/benchmarks/non_descriptive_ad-di-race/env/data_description.txt rename to fairnessBench/benchmarks/non_descriptive_ad-di-patt1/env/data_description.txt diff --git a/fairnessBench/benchmarks/non_descriptive_ad-di-race/env/train.py b/fairnessBench/benchmarks/non_descriptive_ad-di-patt1/env/train.py similarity index 100% rename from fairnessBench/benchmarks/non_descriptive_ad-di-race/env/train.py rename to 
fairnessBench/benchmarks/non_descriptive_ad-di-patt1/env/train.py diff --git a/fairnessBench/benchmarks/non_descriptive_ad-di-patt1/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/non_descriptive_ad-di-patt1/scripts/env_read_only_files.txt new file mode 100644 index 0000000..5ee277f --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-di-patt1/scripts/env_read_only_files.txt @@ -0,0 +1,4 @@ +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/non_descriptive_ad-di-race/scripts/eval.py b/fairnessBench/benchmarks/non_descriptive_ad-di-patt1/scripts/eval.py similarity index 100% rename from fairnessBench/benchmarks/non_descriptive_ad-di-race/scripts/eval.py rename to fairnessBench/benchmarks/non_descriptive_ad-di-patt1/scripts/eval.py diff --git a/fairnessBench/benchmarks/non_descriptive_ad-di-race/scripts/prepare.py b/fairnessBench/benchmarks/non_descriptive_ad-di-patt1/scripts/prepare.py similarity index 100% rename from fairnessBench/benchmarks/non_descriptive_ad-di-race/scripts/prepare.py rename to fairnessBench/benchmarks/non_descriptive_ad-di-patt1/scripts/prepare.py diff --git a/fairnessBench/benchmarks/non_descriptive_ad-di-patt1/scripts/read_only_files.txt b/fairnessBench/benchmarks/non_descriptive_ad-di-patt1/scripts/read_only_files.txt new file mode 100644 index 0000000..5ee277f --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-di-patt1/scripts/read_only_files.txt @@ -0,0 +1,4 @@ +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/env/data_description.txt b/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/env/data_description.txt new file mode 100644 index 0000000..068cb06 --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/env/data_description.txt @@ -0,0 +1,125 @@ +patt2: Integer + +feat1 : Categorical feature + Private + Self-emp-not-inc + Self-emp-inc + Federal-gov + Local-gov + 
State-gov + Without-pay + Never-worked. + +feat2 : Categorical + Education Level + Bachelors + Some-college + 11th, HS-grad + Prof-school + Assoc-acdm + Assoc-voc + 9th + 7th-8th + 12th + Masters + 1st-4th + 10th + Doctorate + 5th-6th + Preschool + +patt3 : Categorical + Married-civ-spouse, + Divorced, + Never-married, + Separated, + Widowed, + Married-spouse-absent, + Married-AF-spouse + +feat3 : Categorical + Tech-support, + Craft-repair, + Other-service, + Sales, + Exec-managerial, + Prof-specialty, + Handlers-cleaners, + Machine-op-inspct, + Adm-clerical, + Farming-fishing, + Transport-moving, + Priv-house-serv, + Protective-serv, + Armed-Forces + +patt4 : Categorical + Wife, + Own-child, + Husband, + Not-in-family, + Other-relative, + Unmarried + +patt1 : Categorical + White, + Asian-Pac-Islander, + Amer-Indian-Eskimo, + Other, + Black + +patt5 : Binary + Female + Male + +feat4 : Integer + +feat5 : Integer + +feat6 : Integer + +feat7 : Categorical + United-States, + Cambodia, + England, + Puerto-Rico, + Canada, + Germany, + Outlying-US(Guam-USVI-etc), + India, + Japan, + Greece, + South, + China, + Cuba, + Iran, + Honduras, + Philippines, + Italy, + Poland, + Jamaica, + Vietnam, + Mexico, + Portugal, + Ireland, + France, + Dominican-Republic, + Laos, Ecuador, + Taiwan, + Haiti, + Columbia, + Hungary, + Guatemala, + Nicaragua, + Scotland, + Thailand, + Yugoslavia, + El-Salvador, + Trinadad&Tobago, + Peru, + Hong, + Holand-Netherlands + +target : binary target + <=50K + >50K. 
\ No newline at end of file diff --git a/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/env/train.py b/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/env/train.py new file mode 100644 index 0000000..8d868a3 --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/env/train.py @@ -0,0 +1,44 @@ +import numpy as np +import pandas as pd +from aif360.datasets import BinaryLabelDataset +from aif360.metrics import ClassificationMetric +from sklearn.linear_model import LogisticRegression + + +train_X = pd.read_csv("train_X.csv", index_col=0) +train_y = pd.read_csv("train_y.csv", index_col=0) +test_X = pd.read_csv("test_X.csv", index_col=0) +test_y = pd.read_csv("test_y.csv", index_col=0) + +lr = LogisticRegression(max_iter=1000) +lr.fit(train_X, train_y) + +pred_y = lr.predict(test_X) + +prot_attr = test_X['patt5'] + +output = pd.DataFrame({"Actual_y": test_y.values.ravel(), "Predicted_y": pred_y, "patt5" : prot_attr}) + +privileged_groups = [{'patt5': 1}] +unprivileged_groups = [{'patt5': 0}] + +binaryLabelDataset = BinaryLabelDataset(favorable_label=1, unfavorable_label=0, + df=output, label_names=['Actual_y'], + protected_attribute_names=['patt5']) + +binaryLabelDataset_pred = binaryLabelDataset.copy() +binaryLabelDataset_pred.labels = output['Predicted_y'].values.reshape(-1, 1) + +metric = ClassificationMetric(binaryLabelDataset, binaryLabelDataset_pred, + unprivileged_groups=unprivileged_groups, + privileged_groups=privileged_groups) + +accuracy = metric.accuracy() + +disparate_impact = metric.disparate_impact() + +print('accuracy score:', accuracy) +print('disparate impact ratio', disparate_impact) + +output.to_csv("submission.csv", index=False) + diff --git a/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/scripts/env_read_only_files.txt new file mode 100644 index 0000000..5ee277f --- /dev/null +++ 
b/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/scripts/env_read_only_files.txt @@ -0,0 +1,4 @@ +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/scripts/eval.py b/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/scripts/eval.py new file mode 100644 index 0000000..347f335 --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/scripts/eval.py @@ -0,0 +1,45 @@ +import os +import numpy as np +import pandas as pd +from aif360.datasets import BinaryLabelDataset +from aif360.metrics import ClassificationMetric + +def get_score(agent_files): + submission_path = os.path.join(agent_files, "submission.csv") + submission = pd.read_csv(submission_path) + + privileged_groups = [{'patt5': 1}] + unprivileged_groups = [{'patt5': 0}] + + binaryLabelDataset = BinaryLabelDataset(favorable_label=1, unfavorable_label=0, + df=submission, label_names=['Actual_y'], + protected_attribute_names=['patt5']) + + binaryLabelDataset_pred = binaryLabelDataset.copy() + binaryLabelDataset_pred.labels = submission['Predicted_y'].values.reshape(-1, 1) + + metric = ClassificationMetric(binaryLabelDataset, binaryLabelDataset_pred, + unprivileged_groups=unprivileged_groups, + privileged_groups=privileged_groups) + + accuracy = metric.accuracy() + precision = metric.precision() + recall = metric.true_positive_rate() + + disparate_impact = metric.disparate_impact() + statistical_parity_diff = metric.statistical_parity_difference() + equal_opp_diff = metric.equal_opportunity_difference() + error_rate_diff = metric.error_rate_difference() + error_rate_ratio = metric.error_rate_ratio() + false_omission_rate_diff = metric.false_omission_rate_difference() + + performance = {"acc": accuracy, "precision": precision, "recall": recall, "di": disparate_impact, + "statistical_parity_diff": statistical_parity_diff, "equal_opp_diff": equal_opp_diff, + "error_rate_diff": error_rate_diff, "error_rate_ratio" : 
error_rate_ratio, + "false_omission_rate_diff": false_omission_rate_diff} + return performance + + +if __name__ == "__main__": + print(get_score()) + \ No newline at end of file diff --git a/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/scripts/prepare.py b/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/scripts/prepare.py new file mode 100644 index 0000000..1a3c316 --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/scripts/prepare.py @@ -0,0 +1,49 @@ +# prepared by user when setting up + +import os +import numpy as np +import pandas as pd +from ucimlrepo import fetch_ucirepo +from sklearn.model_selection import train_test_split + +adult = fetch_ucirepo(id=2) + +X = adult.data.features.copy() +y = adult.data.targets.copy() + +y["income"] = y["income"].str.rstrip('.') +y["income"] = y["income"].apply(lambda i: 1 if i == '>50K' else 0) +X["sex"] = X["sex"].apply(lambda i: 1 if i == 'Male' else 0) +X = X.replace('?', np.nan) + +X = X.rename(columns={'age': 'patt2', 'workclass': 'feat1', 'education': 'feat2', 'marital-status': 'patt3', 'occupation': 'feat3','relationship': 'patt4', 'race': 'patt1', 'sex': 'patt5', 'capital-gain': 'feat4', 'capital-loss':'feat5', 'hours-per-week': 'feat6', 'native-country': 'feat7'}) + +y=y.rename(columns={'income':'target'}) + +categorical_columns = ["feat1", "feat2", "patt3", "feat3", "patt4", "patt1", "feat7"] + +X_enc = pd.get_dummies(X, columns=categorical_columns) +X_enc = X_enc.replace({True: 1, False: 0}) + + +train_X, test_X, train_y, test_y = train_test_split(X_enc, y, test_size=0.2, random_state=1) + +os.chdir("../env/") +train_X.to_csv("train_X.csv") +train_y.to_csv("train_y.csv") +test_X.to_csv("test_X.csv") +test_y.to_csv("test_y.csv") + +# feat1 workclass cat +# feat2 education cat +# patt1 (race) cat +# patt2 (age) num +# patt3 (ms) cat +# patt4 (relationship) cat +# patt5 (sex) cat +# feat3 occupation cat +# feat4 cap gain num +# feat5 cap loss num +# feat6 hours-per-week num +# feat7 
native country cat +# target income bin cat \ No newline at end of file diff --git a/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/scripts/read_only_files.txt b/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/scripts/read_only_files.txt new file mode 100644 index 0000000..5ee277f --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/scripts/read_only_files.txt @@ -0,0 +1,4 @@ +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/env/data_description.txt b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/env/data_description.txt new file mode 100644 index 0000000..068cb06 --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/env/data_description.txt @@ -0,0 +1,125 @@ +patt2: Integer + +feat1 : Categorical feature + Private + Self-emp-not-inc + Self-emp-inc + Federal-gov + Local-gov + State-gov + Without-pay + Never-worked. + +feat2 : Categorical + Education Level + Bachelors + Some-college + 11th, HS-grad + Prof-school + Assoc-acdm + Assoc-voc + 9th + 7th-8th + 12th + Masters + 1st-4th + 10th + Doctorate + 5th-6th + Preschool + +patt3 : Categorical + Married-civ-spouse, + Divorced, + Never-married, + Separated, + Widowed, + Married-spouse-absent, + Married-AF-spouse + +feat3 : Categorical + Tech-support, + Craft-repair, + Other-service, + Sales, + Exec-managerial, + Prof-specialty, + Handlers-cleaners, + Machine-op-inspct, + Adm-clerical, + Farming-fishing, + Transport-moving, + Priv-house-serv, + Protective-serv, + Armed-Forces + +patt4 : Categorical + Wife, + Own-child, + Husband, + Not-in-family, + Other-relative, + Unmarried + +patt1 : Categorical + White, + Asian-Pac-Islander, + Amer-Indian-Eskimo, + Other, + Black + +patt5 : Binary + Female + Male + +feat4 : Integer + +feat5 : Integer + +feat6 : Integer + +feat7 : Categorical + United-States, + Cambodia, + England, + Puerto-Rico, + Canada, + Germany, + Outlying-US(Guam-USVI-etc), + 
India, + Japan, + Greece, + South, + China, + Cuba, + Iran, + Honduras, + Philippines, + Italy, + Poland, + Jamaica, + Vietnam, + Mexico, + Portugal, + Ireland, + France, + Dominican-Republic, + Laos, Ecuador, + Taiwan, + Haiti, + Columbia, + Hungary, + Guatemala, + Nicaragua, + Scotland, + Thailand, + Yugoslavia, + El-Salvador, + Trinadad&Tobago, + Peru, + Hong, + Holand-Netherlands + +target : binary target + <=50K + >50K. \ No newline at end of file diff --git a/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/env/train.py b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/env/train.py new file mode 100644 index 0000000..f37344a --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/env/train.py @@ -0,0 +1,44 @@ +import numpy as np +import pandas as pd +from aif360.datasets import BinaryLabelDataset +from aif360.metrics import ClassificationMetric +from sklearn.linear_model import LogisticRegression + + +train_X = pd.read_csv("train_X.csv", index_col=0) +train_y = pd.read_csv("train_y.csv", index_col=0) +test_X = pd.read_csv("test_X.csv", index_col=0) +test_y = pd.read_csv("test_y.csv", index_col=0) + +lr = LogisticRegression(max_iter=1000) +lr.fit(train_X, train_y) + +pred_y = lr.predict(test_X) + +prot_attr = test_X['patt1'] + +output = pd.DataFrame({"Actual_y": test_y.values.ravel(), "Predicted_y": pred_y, "patt1" : prot_attr}) + +privileged_groups = [{'patt1': 1}] +unprivileged_groups = [{'patt1': 0}] + +binaryLabelDataset = BinaryLabelDataset(favorable_label=1, unfavorable_label=0, + df=output, label_names=['Actual_y'], + protected_attribute_names=['patt1']) + +binaryLabelDataset_pred = binaryLabelDataset.copy() +binaryLabelDataset_pred.labels = output['Predicted_y'].values.reshape(-1, 1) + +metric = ClassificationMetric(binaryLabelDataset, binaryLabelDataset_pred, + unprivileged_groups=unprivileged_groups, + privileged_groups=privileged_groups) + +accuracy = metric.accuracy() + +equal_opp_diff = 
metric.equal_opportunity_difference() + +print('accuracy score:', accuracy) +print('equal opportunity difference', equal_opp_diff) + +output.to_csv("submission.csv", index=False) + diff --git a/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/scripts/env_read_only_files.txt new file mode 100644 index 0000000..5ee277f --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/scripts/env_read_only_files.txt @@ -0,0 +1,4 @@ +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/scripts/eval.py b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/scripts/eval.py new file mode 100644 index 0000000..a79a1c4 --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/scripts/eval.py @@ -0,0 +1,45 @@ +import os +import numpy as np +import pandas as pd +from aif360.datasets import BinaryLabelDataset +from aif360.metrics import ClassificationMetric + +def get_score(agent_files): + submission_path = os.path.join(agent_files, "submission.csv") + submission = pd.read_csv(submission_path) + + privileged_groups = [{'patt1': 1}] + unprivileged_groups = [{'patt1': 0}] + + binaryLabelDataset = BinaryLabelDataset(favorable_label=1, unfavorable_label=0, + df=submission, label_names=['Actual_y'], + protected_attribute_names=['patt1']) + + binaryLabelDataset_pred = binaryLabelDataset.copy() + binaryLabelDataset_pred.labels = submission['Predicted_y'].values.reshape(-1, 1) + + metric = ClassificationMetric(binaryLabelDataset, binaryLabelDataset_pred, + unprivileged_groups=unprivileged_groups, + privileged_groups=privileged_groups) + + accuracy = metric.accuracy() + precision = metric.precision() + recall = metric.true_positive_rate() + + disparate_impact = metric.disparate_impact() + statistical_parity_diff = metric.statistical_parity_difference() + equal_opp_diff = 
metric.equal_opportunity_difference() + error_rate_diff = metric.error_rate_difference() + error_rate_ratio = metric.error_rate_ratio() + false_omission_rate_diff = metric.false_omission_rate_difference() + + performance = {"acc": accuracy, "precision": precision, "recall": recall, "di": disparate_impact, + "statistical_parity_diff": statistical_parity_diff, "equal_opp_diff": equal_opp_diff, + "error_rate_diff": error_rate_diff, "error_rate_ratio" : error_rate_ratio, + "false_omission_rate_diff": false_omission_rate_diff} + return performance + + +if __name__ == "__main__": + print(get_score()) + \ No newline at end of file diff --git a/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/scripts/prepare.py b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/scripts/prepare.py new file mode 100644 index 0000000..f798c75 --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/scripts/prepare.py @@ -0,0 +1,49 @@ +# prepared by user when setting up + +import os +import numpy as np +import pandas as pd +from ucimlrepo import fetch_ucirepo +from sklearn.model_selection import train_test_split + +adult = fetch_ucirepo(id=2) + +X = adult.data.features.copy() +y = adult.data.targets.copy() + +y["income"] = y["income"].str.rstrip('.') +y["income"] = y["income"].apply(lambda i: 1 if i == '>50K' else 0) +X["race"] = X["race"].apply(lambda i: 1 if i == 'White' else 0) +X = X.replace('?', np.nan) + +X = X.rename(columns={'age': 'patt2', 'workclass': 'feat1', 'education': 'feat2', 'marital-status': 'patt3', 'occupation': 'feat3','relationship': 'patt4', 'race': 'patt1', 'sex': 'patt5', 'capital-gain': 'feat4', 'capital-loss':'feat5', 'hours-per-week': 'feat6', 'native-country': 'feat7'}) + +y=y.rename(columns={'income':'target'}) + +categorical_columns = ["feat1", "feat2", "patt3", "feat3", "patt4", "patt5", "feat7"] + +X_enc = pd.get_dummies(X, columns=categorical_columns) +X_enc = X_enc.replace({True: 1, False: 0}) + + +train_X, test_X, train_y, test_y = 
train_test_split(X_enc, y, test_size=0.2, random_state=1) + +os.chdir("../env/") +train_X.to_csv("train_X.csv") +train_y.to_csv("train_y.csv") +test_X.to_csv("test_X.csv") +test_y.to_csv("test_y.csv") + +# feat1 workclass cat +# feat2 education cat +# patt1 (race) cat +# patt2 (age) num +# patt3 (ms) cat +# patt4 (relationship) cat +# patt5 (sex) cat +# feat3 occupation cat +# feat4 cap gain num +# feat5 cap loss num +# feat6 hours-per-week num +# feat7 native country cat +# target income bin cat \ No newline at end of file diff --git a/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/scripts/read_only_files.txt b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/scripts/read_only_files.txt new file mode 100644 index 0000000..5ee277f --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/scripts/read_only_files.txt @@ -0,0 +1,4 @@ +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/env/data_description.txt b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/env/data_description.txt new file mode 100644 index 0000000..068cb06 --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/env/data_description.txt @@ -0,0 +1,125 @@ +patt2: Integer + +feat1 : Categorical feature + Private + Self-emp-not-inc + Self-emp-inc + Federal-gov + Local-gov + State-gov + Without-pay + Never-worked. 
+ +feat2 : Categorical + Education Level + Bachelors + Some-college + 11th, HS-grad + Prof-school + Assoc-acdm + Assoc-voc + 9th + 7th-8th + 12th + Masters + 1st-4th + 10th + Doctorate + 5th-6th + Preschool + +patt3 : Categorical + Married-civ-spouse, + Divorced, + Never-married, + Separated, + Widowed, + Married-spouse-absent, + Married-AF-spouse + +feat3 : Categorical + Tech-support, + Craft-repair, + Other-service, + Sales, + Exec-managerial, + Prof-specialty, + Handlers-cleaners, + Machine-op-inspct, + Adm-clerical, + Farming-fishing, + Transport-moving, + Priv-house-serv, + Protective-serv, + Armed-Forces + +patt4 : Categorical + Wife, + Own-child, + Husband, + Not-in-family, + Other-relative, + Unmarried + +patt1 : Categorical + White, + Asian-Pac-Islander, + Amer-Indian-Eskimo, + Other, + Black + +patt5 : Binary + Female + Male + +feat4 : Integer + +feat5 : Integer + +feat6 : Integer + +feat7 : Categorical + United-States, + Cambodia, + England, + Puerto-Rico, + Canada, + Germany, + Outlying-US(Guam-USVI-etc), + India, + Japan, + Greece, + South, + China, + Cuba, + Iran, + Honduras, + Philippines, + Italy, + Poland, + Jamaica, + Vietnam, + Mexico, + Portugal, + Ireland, + France, + Dominican-Republic, + Laos, Ecuador, + Taiwan, + Haiti, + Columbia, + Hungary, + Guatemala, + Nicaragua, + Scotland, + Thailand, + Yugoslavia, + El-Salvador, + Trinadad&Tobago, + Peru, + Hong, + Holand-Netherlands + +target : binary target + <=50K + >50K. 
\ No newline at end of file diff --git a/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/env/train.py b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/env/train.py new file mode 100644 index 0000000..c475d01 --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/env/train.py @@ -0,0 +1,44 @@ +import numpy as np +import pandas as pd +from aif360.datasets import BinaryLabelDataset +from aif360.metrics import ClassificationMetric +from sklearn.linear_model import LogisticRegression + + +train_X = pd.read_csv("train_X.csv", index_col=0) +train_y = pd.read_csv("train_y.csv", index_col=0) +test_X = pd.read_csv("test_X.csv", index_col=0) +test_y = pd.read_csv("test_y.csv", index_col=0) + +lr = LogisticRegression(max_iter=1000) +lr.fit(train_X, train_y) + +pred_y = lr.predict(test_X) + +prot_attr = test_X['patt5'] + +output = pd.DataFrame({"Actual_y": test_y.values.ravel(), "Predicted_y": pred_y, "patt5" : prot_attr}) + +privileged_groups = [{'patt5': 1}] +unprivileged_groups = [{'patt5': 0}] + +binaryLabelDataset = BinaryLabelDataset(favorable_label=1, unfavorable_label=0, + df=output, label_names=['Actual_y'], + protected_attribute_names=['patt5']) + +binaryLabelDataset_pred = binaryLabelDataset.copy() +binaryLabelDataset_pred.labels = output['Predicted_y'].values.reshape(-1, 1) + +metric = ClassificationMetric(binaryLabelDataset, binaryLabelDataset_pred, + unprivileged_groups=unprivileged_groups, + privileged_groups=privileged_groups) + +accuracy = metric.accuracy() + +equal_opp_diff = metric.equal_opportunity_difference() + +print('accuracy score:', accuracy) +print('equal opportunity difference', equal_opp_diff) + +output.to_csv("submission.csv", index=False) + diff --git a/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/scripts/env_read_only_files.txt new file mode 100644 index 0000000..5ee277f --- /dev/null +++ 
b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/scripts/env_read_only_files.txt @@ -0,0 +1,4 @@ +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/scripts/eval.py b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/scripts/eval.py new file mode 100644 index 0000000..347f335 --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/scripts/eval.py @@ -0,0 +1,45 @@ +import os +import numpy as np +import pandas as pd +from aif360.datasets import BinaryLabelDataset +from aif360.metrics import ClassificationMetric + +def get_score(agent_files): + submission_path = os.path.join(agent_files, "submission.csv") + submission = pd.read_csv(submission_path) + + privileged_groups = [{'patt5': 1}] + unprivileged_groups = [{'patt5': 0}] + + binaryLabelDataset = BinaryLabelDataset(favorable_label=1, unfavorable_label=0, + df=submission, label_names=['Actual_y'], + protected_attribute_names=['patt5']) + + binaryLabelDataset_pred = binaryLabelDataset.copy() + binaryLabelDataset_pred.labels = submission['Predicted_y'].values.reshape(-1, 1) + + metric = ClassificationMetric(binaryLabelDataset, binaryLabelDataset_pred, + unprivileged_groups=unprivileged_groups, + privileged_groups=privileged_groups) + + accuracy = metric.accuracy() + precision = metric.precision() + recall = metric.true_positive_rate() + + disparate_impact = metric.disparate_impact() + statistical_parity_diff = metric.statistical_parity_difference() + equal_opp_diff = metric.equal_opportunity_difference() + error_rate_diff = metric.error_rate_difference() + error_rate_ratio = metric.error_rate_ratio() + false_omission_rate_diff = metric.false_omission_rate_difference() + + performance = {"acc": accuracy, "precision": precision, "recall": recall, "di": disparate_impact, + "statistical_parity_diff": statistical_parity_diff, "equal_opp_diff": equal_opp_diff, + "error_rate_diff": error_rate_diff, "error_rate_ratio" : 
error_rate_ratio, + "false_omission_rate_diff": false_omission_rate_diff} + return performance + + +if __name__ == "__main__": + print(get_score()) + \ No newline at end of file diff --git a/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/scripts/prepare.py b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/scripts/prepare.py new file mode 100644 index 0000000..1a3c316 --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/scripts/prepare.py @@ -0,0 +1,49 @@ +# prepared by user when setting up + +import os +import numpy as np +import pandas as pd +from ucimlrepo import fetch_ucirepo +from sklearn.model_selection import train_test_split + +adult = fetch_ucirepo(id=2) + +X = adult.data.features.copy() +y = adult.data.targets.copy() + +y["income"] = y["income"].str.rstrip('.') +y["income"] = y["income"].apply(lambda i: 1 if i == '>50K' else 0) +X["sex"] = X["sex"].apply(lambda i: 1 if i == 'Male' else 0) +X = X.replace('?', np.nan) + +X = X.rename(columns={'age': 'patt2', 'workclass': 'feat1', 'education': 'feat2', 'marital-status': 'patt3', 'occupation': 'feat3','relationship': 'patt4', 'race': 'patt1', 'sex': 'patt5', 'capital-gain': 'feat4', 'capital-loss':'feat5', 'hours-per-week': 'feat6', 'native-country': 'feat7'}) + +y=y.rename(columns={'income':'target'}) + +categorical_columns = ["feat1", "feat2", "patt3", "feat3", "patt4", "patt1", "feat7"] + +X_enc = pd.get_dummies(X, columns=categorical_columns) +X_enc = X_enc.replace({True: 1, False: 0}) + + +train_X, test_X, train_y, test_y = train_test_split(X_enc, y, test_size=0.2, random_state=1) + +os.chdir("../env/") +train_X.to_csv("train_X.csv") +train_y.to_csv("train_y.csv") +test_X.to_csv("test_X.csv") +test_y.to_csv("test_y.csv") + +# feat1 workclass cat +# feat2 education cat +# patt1 (race) cat +# patt2 (age) num +# patt3 (ms) cat +# patt4 (relationship) cat +# patt5 (sex) cat +# feat3 occupation cat +# feat4 cap gain num +# feat5 cap loss num +# feat6 hours-per-week num +# 
feat7 native country cat +# target income bin cat \ No newline at end of file diff --git a/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/scripts/read_only_files.txt b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/scripts/read_only_files.txt new file mode 100644 index 0000000..5ee277f --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/scripts/read_only_files.txt @@ -0,0 +1,4 @@ +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv From d8882098b958caf07e8ad7e373ab85f48c52b4d0 Mon Sep 17 00:00:00 2001 From: Surbhi Rathore Date: Sat, 17 Jan 2026 20:03:11 +0000 Subject: [PATCH 20/22] updated task.json with research problems wrt new health and non descriptive tasks --- fairnessBench/benchmarks/tasks.json | 129 +++++++++++++++++++++++++++- 1 file changed, 125 insertions(+), 4 deletions(-) diff --git a/fairnessBench/benchmarks/tasks.json b/fairnessBench/benchmarks/tasks.json index cf84b05..52d2b15 100644 --- a/fairnessBench/benchmarks/tasks.json +++ b/fairnessBench/benchmarks/tasks.json @@ -420,8 +420,129 @@ "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model that best balances accuracy and fairness with respect to disparate impact across race. Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and disparate impact. Do not inspect any CSV files directly. Use the execute script action to verify if the goal is achieved.", "benchmark_folder_name": "health-di-race" }, - "nondescriptive_di_balance-race": { - "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. 
Then, update the provided train.py script to train a model that best balances accuracy and fairness with respect to disparate impact across race. Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and disparate impact. Do not inspect any CSV files directly. Use the execute script action to verify if the goal is achieved.", - "benchmark_folder_name": "non_descriptive_ad-di-race" - } + "health_di_balance-sex": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model that best balances accuracy and fairness with respect to disparate impact across sex. Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and disparate impact. Do not inspect any CSV files directly. Use the execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "health-di-sex" + }, + "health_di_best-race": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model with a focus on making the model fair with respect to disparate impact. Iterate over different models, feature selection techniques, or fairness-oriented preprocessing steps to improve the disparate impact with respect to race. Do not inspect any CSV files directly. Use execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "health-di-race" + }, + "health_di_best-sex": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. 
Then, update the provided train.py script to train a model with a focus on making a model fair with respect to disparate impact. Iterate over different models, feature selection techniques, or fairness-oriented preprocessing steps to improve the disparate impact with respect to sex. Do not inspect any CSV files directly. Use execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "health-di-sex" + }, + "health_di_target10-race": { + "research_problem": "Review the data_description.txt file to gain an understanding of the data and its features. Make a note of key points and your progress in the research logs to maintain traceability. Next, modify the existing train.py script to train a model that aims to improve disparate impact with respect to race by 10% in health data. Explore and experiment with different models, feature selection strategies, or fairness-aware preprocessing techniques to achieve at least a 10% improvement in disparate impact. Avoid reading CSV files directly. After making your modifications, use execute script action to verify if the goal is achieved. Do not inspect any CSV files directly.", + "benchmark_folder_name": "health-di-race" + }, + "health_di_target10-sex": { + "research_problem": "Review the data_description.txt file to gain an understanding of the data and its features. Make a note of key points and your progress in the research logs to maintain traceability. Next, modify the existing train.py script to train a model that aims to improve disparate impact with respect to sex by 10% in health data. Explore and experiment with different models, feature selection strategies, or fairness-aware preprocessing techniques to achieve at least a 10% improvement in disparate impact. Avoid reading CSV files directly. After making your modifications, use execute script action to verify if the goal is achieved. 
Do not inspect any CSV files directly.", + "benchmark_folder_name": "health-di-sex" + }, + "health_di_implicit-race": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a best performing model for the health data. Your goal is to Iterate over different models and maximize predictive performance. use execute script action to verify if the goal is achieved. Do not inspect any CSV files directly.", + "benchmark_folder_name": "health-di-race" + }, + "health_di_implicit-sex": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a best performing model for the health data. Your goal is to Iterate over different models and maximize predictive performance. use execute script action to verify if the goal is achieved. Do not inspect any CSV files directly.", + "benchmark_folder_name": "health-di-sex" + }, + "health_eod_best-sex": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model with a focus on making a model fair with respect to equal opportunity difference. Iterate over different models, feature selection techniques, or fairness-oriented preprocessing steps to improve the equal opportunity difference with respect to sex. Do not inspect any CSV files directly. Use execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "health-eod-sex" + }, + "health_eod_best-race": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. 
Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model with a focus on making the model fair with respect to equal opportunity difference. Iterate over different models, feature selection techniques, or fairness-oriented preprocessing steps to improve the equal opportunity difference with respect to race. Do not inspect any CSV files directly. Use execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "health-eod-race" + }, + "health_eod_balance-sex": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model that best balances accuracy and fairness with respect to equal opportunity difference across sex. Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and equal opportunity difference. Do not inspect any CSV files directly. Use the execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "health-eod-sex" + }, + "health_eod_balance-race": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model that best balances accuracy and fairness with respect to equal opportunity difference across race. Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and equal opportunity difference. Do not inspect any CSV files directly. 
Use the execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "health-eod-race" + }, + "health_eod_target10-sex": { + "research_problem": "Review the data_description.txt file to gain an understanding of the data and its features. Make a note of key points and your progress in the research logs to maintain traceability. Next, modify the existing train.py script to train a model that aims to improve equal opportunity difference with respect to sex by 10% in health data. Explore and experiment with different models, feature selection strategies, or fairness-aware preprocessing techniques to achieve at least a 10% improvement in equal opportunity difference. Avoid reading CSV files directly. After making your modifications, use execute script action to verify if the goal is achieved. Do not inspect any CSV files directly.", + "benchmark_folder_name": "health-eod-sex" + }, + "health_eod_target10-race": { + "research_problem": "Review the data_description.txt file to gain an understanding of the data and its features. Make a note of key points and your progress in the research logs to maintain traceability. Next, modify the existing train.py script to train a model that aims to improve equal opportunity difference with respect to race by 10% in health data. Explore and experiment with different models, feature selection strategies, or fairness-aware preprocessing techniques to achieve at least a 10% improvement in equal opportunity difference. Avoid reading CSV files directly. After making your modifications, use execute script action to verify if the goal is achieved. Do not inspect any CSV files directly.", + "benchmark_folder_name": "health-eod-race" + }, + "health_eod_implicit-sex": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. 
Then, update the provided train.py script to train a best performing model for the health data. Your goal is to Iterate over different models and maximize predictive performance. Use execute script action to verify if the goal is achieved. Do not inspect any CSV files directly.", + "benchmark_folder_name": "health-eod-sex" + }, + "health_eod_implicit-race": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a best performing model for the health data. Your goal is to Iterate over different models and maximize predictive performance. Use execute script action to verify if the goal is achieved. Do not inspect any CSV files directly.", + "benchmark_folder_name": "health-eod-race" + }, + "nondescriptive_di_balance-patt1": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model that best balances accuracy and fairness with respect to disparate impact across patt1. Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and disparate impact. Do not inspect any CSV files directly. Use the execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "non_descriptive_ad-di-patt1" + }, + "nondescriptive_di_balance-patt5": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model that best balances accuracy and fairness with respect to disparate impact across patt5. 
Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and disparate impact. Do not inspect any CSV files directly. Use the execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "non_descriptive_ad-di-patt5" + }, + "nondescriptive_di_best-patt1": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model with a focus on making the model fair with respect to disparate impact. Iterate over different models, feature selection techniques, or fairness-oriented preprocessing steps to improve the disparate impact with respect to patt1. Do not inspect any CSV files directly. Use execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "non_descriptive_ad-di-patt1" + }, + "nondescriptive_di_best-patt5": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model with a focus on making a model fair with respect to disparate impact. Iterate over different models, feature selection techniques, or fairness-oriented preprocessing steps to improve the disparate impact with respect to patt5. Do not inspect any CSV files directly. Use execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "non_descriptive_ad-di-patt5" + }, + "nondescriptive_di_target10-patt1": { + "research_problem": "Review the data_description.txt file to gain an understanding of the data and its features. Make a note of key points and your progress in the research logs to maintain traceability. 
Next, modify the existing train.py script to train a model that aims to improve disparate impact with respect to patt1 by 10% in non-descriptive data. Explore and experiment with different models, feature selection strategies, or fairness-aware preprocessing techniques to achieve at least a 10% improvement in disparate impact. Avoid reading CSV files directly. After making your modifications, use execute script action to verify if the goal is achieved. Do not inspect any CSV files directly.", + "benchmark_folder_name": "non_descriptive_ad-di-patt1" + }, + "nondescriptive_di_target10-patt5": { + "research_problem": "Review the data_description.txt file to gain an understanding of the data and its features. Make a note of key points and your progress in the research logs to maintain traceability. Next, modify the existing train.py script to train a model that aims to improve disparate impact with respect to patt5 by 10% in non-descriptive data. Explore and experiment with different models, feature selection strategies, or fairness-aware preprocessing techniques to achieve at least a 10% improvement in disparate impact. Avoid reading CSV files directly. After making your modifications, use execute script action to verify if the goal is achieved. Do not inspect any CSV files directly.", + "benchmark_folder_name": "non_descriptive_ad-di-patt5" + }, + "nondescriptive_di_implicit-patt1": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a best performing model for the non-descriptive data. Your goal is to Iterate over different models and maximize predictive performance. use execute script action to verify if the goal is achieved. 
Do not inspect any CSV files directly.", + "benchmark_folder_name": "non_descriptive_ad-di-patt1" + }, + "nondescriptive_di_implicit-patt5": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a best performing model for the non-descriptive data. Your goal is to Iterate over different models and maximize predictive performance. use execute script action to verify if the goal is achieved. Do not inspect any CSV files directly.", + "benchmark_folder_name": "non_descriptive_ad-di-patt5" + }, + "nondescriptive_eod_best-patt5": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model with a focus on making a model fair with respect to equal opportunity difference. Iterate over different models, feature selection techniques, or fairness-oriented preprocessing steps to improve the equal opportunity difference with respect to patt5. Do not inspect any CSV files directly. Use execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "non_descriptive_ad-eod-patt5" + }, + "nondescriptive_eod_best-patt1": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model with a focus on making the model fair with respect to equal opportunity difference. Iterate over different models, feature selection techniques, or fairness-oriented preprocessing steps to improve the equal opportunity difference with respect to patt1. Do not inspect any CSV files directly. 
Use execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "non_descriptive_ad-eod-patt1" + }, + "nondescriptive_eod_balance-patt5": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model that best balances accuracy and fairness with respect to equal opportunity difference across patt5. Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and equal opportunity difference. Do not inspect any CSV files directly. Use the execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "non_descriptive_ad-eod-patt5" + }, + "nondescriptive_eod_balance-patt1": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model that best balances accuracy and fairness with respect to equal opportunity difference across patt1. Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and equal opportunity difference. Do not inspect any CSV files directly. Use the execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "non_descriptive_ad-eod-patt1" + }, + "nondescriptive_eod_target10-patt5": { + "research_problem": "Review the data_description.txt file to gain an understanding of the data and its features. Make a note of key points and your progress in the research logs to maintain traceability. Next, modify the existing train.py script to train a model that aims to improve equal opportunity difference with respect to patt5 by 10% in non-descriptive data. 
Explore and experiment with different models, feature selection strategies, or fairness-aware preprocessing techniques to achieve at least a 10% improvement in equal opportunity difference. Avoid reading CSV files directly. After making your modifications, use execute script action to verify if the goal is achieved. Do not inspect any CSV files directly.", + "benchmark_folder_name": "non_descriptive_ad-eod-patt5" + }, + "nondescriptive_eod_target10-patt1": { + "research_problem": "Review the data_description.txt file to gain an understanding of the data and its features. Make a note of key points and your progress in the research logs to maintain traceability. Next, modify the existing train.py script to train a model that aims to improve equal opportunity difference with respect to patt1 by 10% in non-descriptive data. Explore and experiment with different models, feature selection strategies, or fairness-aware preprocessing techniques to achieve at least a 10% improvement in equal opportunity difference. Avoid reading CSV files directly. After making your modifications, use execute script action to verify if the goal is achieved. Do not inspect any CSV files directly.", + "benchmark_folder_name": "non_descriptive_ad-eod-patt1" + }, + "nondescriptive_eod_implicit-patt5": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a best performing model for the non-descriptive data. Your goal is to Iterate over different models and maximize predictive performance. Use execute script action to verify if the goal is achieved. Do not inspect any CSV files directly.", + "benchmark_folder_name": "non_descriptive_ad-eod-patt5" + }, + "nondescriptive_eod_implicit-patt1": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. 
Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a best performing model for the non-descriptive data. Your goal is to Iterate over different models and maximize predictive performance. Use execute script action to verify if the goal is achieved. Do not inspect any CSV files directly.", + "benchmark_folder_name": "non_descriptive_ad-eod-patt1" + } + } From 41661c5be6c4c5e57da5e1e78770ff1531d84afc Mon Sep 17 00:00:00 2001 From: AymanBx Date: Mon, 19 Jan 2026 20:13:29 +0000 Subject: [PATCH 21/22] tasks.json: Removed empty line after the final object --- fairnessBench/benchmarks/tasks.json | 1 - 1 file changed, 1 deletion(-) diff --git a/fairnessBench/benchmarks/tasks.json b/fairnessBench/benchmarks/tasks.json index 52d2b15..f19bf53 100644 --- a/fairnessBench/benchmarks/tasks.json +++ b/fairnessBench/benchmarks/tasks.json @@ -544,5 +544,4 @@ "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a best performing model for the non-descriptive data. Your goal is to Iterate over different models and maximize predictive performance. Use execute script action to verify if the goal is achieved. 
Do not inspect any CSV files directly.", "benchmark_folder_name": "non_descriptive_ad-eod-patt1" } - } From 596088d636ddd457566d792d315c53f39199b167 Mon Sep 17 00:00:00 2001 From: Surbhi Rathore Date: Tue, 20 Jan 2026 18:35:50 +0000 Subject: [PATCH 22/22] updated prompt sensitivity problems in task.json --- fairnessBench/benchmarks/tasks.json | 52 +++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/fairnessBench/benchmarks/tasks.json b/fairnessBench/benchmarks/tasks.json index f19bf53..06c4026 100644 --- a/fairnessBench/benchmarks/tasks.json +++ b/fairnessBench/benchmarks/tasks.json @@ -543,5 +543,57 @@ "nondescriptive_eod_implicit-patt1": { "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a best performing model for the non-descriptive data. Your goal is to Iterate over different models and maximize predictive performance. Use execute script action to verify if the goal is achieved. Do not inspect any CSV files directly.", "benchmark_folder_name": "non_descriptive_ad-eod-patt1" + }, + "adult_balance-eod-sex": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model that best balances accuracy and fairness with respect to equal opportunity difference across sex. Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and equal opportunity difference. Do not inspect any CSV files directly. 
Use the execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "adult-eod-sex" + }, + "adult_balance-eod-nosuccess-sex": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model that best balances accuracy and fairness with respect to equal opportunity difference across sex. Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and equal opportunity difference. Do not inspect any CSV files directly.", + "benchmark_folder_name": "adult-eod-sex" + }, + "adult_balance-eod-shortgoal-sex": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. To achieve the best accuracy-fairness compromise regarding equal opportunity difference, systematically test various algorithms, variable selection approaches, and bias-mitigation preprocessing techniques. Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and equal opportunity difference. Do not inspect any CSV files directly. Use the execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "adult-eod-sex" + }, + "adult_balance-eod-noreq-sex": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model that best balances accuracy and fairness with respect to disparate impact across sex. 
Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and equal opportunity difference. Refine train.py to balance performance and sex-based fairness metrics. Use the execute script action to verify if the goal is achieved.", +            "benchmark_folder_name": "adult-eod-sex" +        }, +        "adult_balance-eod-nochange-sex": { +            "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model that best balances accuracy and fairness with respect to equal opportunity difference across sex. Do not inspect any CSV files directly. Use the execute script action to verify if the goal is achieved.", +            "benchmark_folder_name": "adult-eod-sex" +        }, +        "adult_balance-eod-nohow-sex": { +            "research_problem": "Go through the data_description.txt file to understand the data and its features. Then, update the provided train.py script to train a model that best balances accuracy and fairness with respect to equal opportunity difference across sex. Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and equal opportunity difference. Do not inspect any CSV files directly. Use the execute script action to verify if the goal is achieved.", +            "benchmark_folder_name": "adult-eod-sex" +        }, +        "adult_balance-eod-rephrased01-sex": { +            "research_problem": "Review the data_description.txt file to get familiar with the dataset and its features and note down essential points in your research logs. Modify the train.py script to train a model that optimizes both accuracy and fairness, focusing on reducing equal opportunity difference with respect to sex. 
Experiment with various models, feature selection methods, and fairness-aware preprocessing to balance model performance and disparate impact. Do not look at CSV files. Use the execute script to check the results.", +            "benchmark_folder_name": "adult-eod-sex" +        }, +        "adult_balance-eod-rephrased10-sex": { +            "research_problem": "Several methods for analyzing data_description.txt should be explored to interpret the variables and dataset structure. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model that best balances accuracy and fairness with respect to equal opportunity difference across sex. Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and equal opportunity difference. Do not inspect any CSV files directly. Use the execute script action to verify if the goal is achieved.", +            "benchmark_folder_name": "adult-eod-sex" +        }, +        "adult_balance-eod-rephrased06-sex": { +            "research_problem": "The data_description.txt file contains feature information - outline multiple techniques to analyze it and understand the data. To track your activities, condense the main insights within your research documentation. When updating train.py, ensure the model training process balances precision and impartiality with respect to differential effects across gender. Test various models, feature selectors, and fairness preprocessing to optimize the accuracy-equal opportunity difference trade-off. Bypass direct analysis of any comma-separated value documents. Validating whether the objective is achieved involves running the script action.", +            "benchmark_folder_name": "adult-eod-sex" +        }, +        "adult_balance-eod-rephrased05-sex": { +            "research_problem": "Demonstrate multiple techniques for parsing data_description.txt so I can grasp the information and its characteristics. 
Document essential findings in your research records to monitor your progress. Modify train.py (the provided script) to ensure the trained model achieves an optimal trade-off between performance and equity regarding sex-based equal opportunity difference. To achieve the best accuracy-fairness compromise regarding equal opportunity difference, systematically test various algorithms, variable selection approaches, and bias-mitigation preprocessing techniques. CSV files should not be opened or examined directly. Use the execute script action to verify if the goal is achieved.", +            "benchmark_folder_name": "adult-eod-sex" +        }, +        "adult_balance-eod-rephrased03-sex": { +            "research_problem": "Use the data_description.txt file to learn about the dataset and what each feature represents. Then, redo the training process in train.py to ensure that the resulting model maintains high accuracy while minimizing equal opportunity difference across sex. Explore different machine learning algorithms, ways to select relevant features, and preprocessing strategies designed to minimize bias and maximize accuracy. Do not inspect CSVs directly. Use the execute script to check if the desired outcome was reached.", +            "benchmark_folder_name": "adult-eod-sex" +        }, +        "adult_balance-eod-altmetricdetail-sex": { +            "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Multiple models, attribute selection techniques, and equity-oriented preprocessing approaches should be tested to identify the best balance between correctness and disparity in recall. Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and equal opportunity difference. Do not inspect any CSV files directly. 
Use the execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "adult-eod-sex" + }, + "adult_balance-eod-altmetricdetail2-sex": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. By exploring different models along with feature engineering strategies and equity-focused data preparation methods, you'll determine the ideal balance between correct predictions and groupwise differences in sensitivity. Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and equal opportunity difference. Do not inspect any CSV files directly. Use the execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "adult-eod-sex" } }