diff --git a/fairnessBench/LLM.py b/fairnessBench/LLM.py index df3b020..83a1f14 100644 --- a/fairnessBench/LLM.py +++ b/fairnessBench/LLM.py @@ -18,22 +18,6 @@ # AS: Setup llama loaded_hf_models = {} -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -try: - # Need export HF_HOME=/datasets/ai/llama3 - # llama_= "meta-llama/Llama-3.3-70B-Instruct" # Gave us decent results. - # llama_= "meta-llama/Llama-3.1-405B-Instruct" # Terrible hallusinations - # llama_= "meta-llama/Llama-3.1-8B-Instruct" # Trying smaller models for test runs - - tokenizer = AutoTokenizer.from_pretrained(llama_) - quant_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16) - model = AutoModelForCausalLM.from_pretrained(llama_, quantization_config = quant_config, device_map="auto",torch_dtype=torch.float16) - loaded_hf_models = {"llama": (model, tokenizer)} - print(f"Loaded local {llama_} successfuly using device: {model.device}.") -except Exception as e: - print(f"Failed to load local llama - Current device:{device}\nIssue: {e}") - - def complete_text_hf(prompt, stop_sequences=[], model="llama", max_tokens_to_sample = 2500, temperature=0.5, log_file=None, device=0, **kwargs): if model in loaded_hf_models: hf_model, tokenizer = loaded_hf_models[model] @@ -606,13 +590,13 @@ def complete_text(prompt, log_file, model, device=0, **kwargs): if model.startswith("claude"): # use anthropic API - completion = complete_text_claude(prompt, stop_sequences=[anthropic.HUMAN_PROMPT,"Observation:", "Observation"], log_file=log_file, model=model, **kwargs) + completion = complete_text_claude(prompt, stop_sequences=[anthropic.HUMAN_PROMPT,"Observation:"], log_file=log_file, model=model, **kwargs) elif model.startswith("gemini"): - completion = complete_text_gemini(prompt, stop_sequences=["Observation:", "Observation"], log_file=log_file, model=model, **kwargs) + completion = complete_text_gemini(prompt, stop_sequences=["Observation:"], log_file=log_file, model=model, **kwargs) elif model.startswith("llama"): - completion = complete_text_hf(prompt, stop_sequences=["Observation:", "Observation"], log_file=log_file, model=model, device=device, **kwargs) + completion = complete_text_hf(prompt, stop_sequences=["Observation:"], log_file=log_file, model=model, device=device, **kwargs) elif model.startswith("qwen"): - completion = complete_text_qwen(prompt, stop_sequences=["Observation:", "Observation"], log_file=log_file, model=model, device=device, **kwargs) + completion = complete_text_qwen(prompt, stop_sequences=["Observation:"], log_file=log_file, model=model, device=device, **kwargs) elif model.startswith("granite"): completion = complete_text_granite(prompt, stop_sequences=["}"], log_file=log_file, model=model, device=device, **kwargs) elif model.startswith("deepseek"): @@ -621,10 +605,10 @@ def complete_text(prompt, log_file, model, device=0, **kwargs): completion = complete_text_gemma(prompt, stop_sequences=["}"], log_file=log_file, model=model, device=device, **kwargs) elif "/" in model: # use CRFM API since this specifies organization like "openai/..." 
- completion = complete_text_crfm(prompt, stop_sequences=["Observation:", "Observation"], log_file=log_file, model=model, **kwargs) + completion = complete_text_crfm(prompt, stop_sequences=["Observation:"], log_file=log_file, model=model, **kwargs) else: # use OpenAI API - completion = complete_text_openai(prompt, stop_sequences=["Observation:", "Observation"], log_file=log_file, model=model, **kwargs) + completion = complete_text_openai(prompt, stop_sequences=["Observation:"], log_file=log_file, model=model, **kwargs) return completion diff --git a/fairnessBench/benchmarks/adult-di-race/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/adult-di-race/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/adult-di-race/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/adult-di-race/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/adult-di-sex/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/adult-di-sex/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/adult-di-sex/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/adult-di-sex/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/adult-eod-race/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/adult-eod-race/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/adult-eod-race/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/adult-eod-race/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/adult-eod-sex/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/adult-eod-sex/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/adult-eod-sex/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/adult-eod-sex/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/adult-erd-race/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/adult-erd-race/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/adult-erd-race/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/adult-erd-race/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/adult-erd-sex/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/adult-erd-sex/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/adult-erd-sex/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/adult-erd-sex/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/adult-err-race/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/adult-err-race/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- 
a/fairnessBench/benchmarks/adult-err-race/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/adult-err-race/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/adult-err-sex/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/adult-err-sex/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/adult-err-sex/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/adult-err-sex/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/adult-ford-race/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/adult-ford-race/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/adult-ford-race/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/adult-ford-race/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/adult-ford-sex/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/adult-ford-sex/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/adult-ford-sex/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/adult-ford-sex/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/adult-spd-race/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/adult-spd-race/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/adult-spd-race/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/adult-spd-race/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/adult-spd-sex/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/adult-spd-sex/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/adult-spd-sex/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/adult-spd-sex/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/adultrecon-allmetric-gender/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/adultrecon-allmetric-gender/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/adultrecon-allmetric-gender/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/adultrecon-allmetric-gender/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/adultrecon-allmetric-race/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/adultrecon-allmetric-race/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/adultrecon-allmetric-race/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/adultrecon-allmetric-race/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ 
-train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/creditdefault-di-gender/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/creditdefault-di-gender/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/creditdefault-di-gender/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/creditdefault-di-gender/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/creditdefault-eod-gender/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/creditdefault-eod-gender/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/creditdefault-eod-gender/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/creditdefault-eod-gender/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/dollarstreet-income/env/train.py b/fairnessBench/benchmarks/dollarstreet-income/env/train.py index f060560..f56a1cf 100644 --- a/fairnessBench/benchmarks/dollarstreet-income/env/train.py +++ b/fairnessBench/benchmarks/dollarstreet-income/env/train.py @@ -45,8 +45,8 @@ def __getitem__(self, idx): train_csv = "./images_v2_imagenet_train.csv" test_csv = "./images_v2_imagenet_test.csv" -train_dataset = DollarStreetDataset(train_csv, ".", transform) -test_dataset = DollarStreetDataset(test_csv, ".", transform) +train_dataset = DollarStreetDataset(csv_path=train_csv, root_dir=".", transform=transform) +test_dataset = DollarStreetDataset(csv_path=test_csv, root_dir=".", transform=transform) num_classes = len(train_dataset.synset2idx) train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4) test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=4) diff --git a/fairnessBench/benchmarks/dollarstreet-income/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/dollarstreet-income/scripts/env_read_only_files.txt index e69de29..9a933bd 100644 --- a/fairnessBench/benchmarks/dollarstreet-income/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/dollarstreet-income/scripts/env_read_only_files.txt @@ -0,0 +1,6 @@ +assets/* +./images_v2_imagenet_test.csv +./topics_to_imagenet_classes_map.json +./README.md +./images_v2.csv +./images_v2_imagenet_train.csv diff --git a/fairnessBench/benchmarks/dollarstreet-income/scripts/eval.py b/fairnessBench/benchmarks/dollarstreet-income/scripts/eval.py index 4d5c13c..95732fc 100644 --- a/fairnessBench/benchmarks/dollarstreet-income/scripts/eval.py +++ b/fairnessBench/benchmarks/dollarstreet-income/scripts/eval.py @@ -5,18 +5,19 @@ def get_score(agent_files): submission_path = os.path.join(agent_files, "submission.csv") - submission_df= pd.read_csv(submission_path, index_col=0) + submission_df = pd.read_csv(submission_path) test_df = pd.read_csv(os.path.join("../env", "images_v2_imagenet_test.csv")) test_df["imagenet_sysnet_id"] = test_df["imagenet_sysnet_id"].apply(ast.literal_eval) test_df["synset"] = test_df["imagenet_sysnet_id"].apply(lambda x: x[0]) + all_synsets = sorted(set(test_df["synset"])) synset2idx = {syn: idx for idx, syn in enumerate(all_synsets)} test_df["true_label_idx"] = test_df["synset"].map(synset2idx) - submission_df["predicted_label"] = 
submission_df.values.argmax(axis=1) - test_df["predicted_label"] = submission_df["predicted_label"] + test_df["predicted_label"] = submission_df.values.argmax(axis=1) + test_df["correct"] = test_df["predicted_label"] == test_df["true_label_idx"] test_df["income_q"] = pd.qcut(test_df["income"], q=4, labels=["Q1", "Q2", "Q3", "Q4"]) test_df["income_group"] = test_df["income_q"].apply(lambda q: "Disadvantaged" if q == "Q1" else "Advantaged") diff --git a/fairnessBench/benchmarks/dollarstreet-income/scripts/read_only_files.txt b/fairnessBench/benchmarks/dollarstreet-income/scripts/read_only_files.txt index 4f3d041..9a933bd 100644 --- a/fairnessBench/benchmarks/dollarstreet-income/scripts/read_only_files.txt +++ b/fairnessBench/benchmarks/dollarstreet-income/scripts/read_only_files.txt @@ -1,6 +1,6 @@ -./assets/* -./images_v2_imagenet_test.csv +assets/* +./images_v2_imagenet_test.csv ./topics_to_imagenet_classes_map.json ./README.md ./images_v2.csv -./images_v2_imagenet_train.csv \ No newline at end of file +./images_v2_imagenet_train.csv diff --git a/fairnessBench/benchmarks/germancredit-di-sex/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/germancredit-di-sex/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/germancredit-di-sex/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/germancredit-di-sex/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/germancredit-eod-sex/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/germancredit-eod-sex/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/germancredit-eod-sex/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/germancredit-eod-sex/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/health-di-race/env/data_description.txt b/fairnessBench/benchmarks/health-di-race/env/data_description.txt new file mode 100644 index 0000000..1ef9d2f --- /dev/null +++ b/fairnessBench/benchmarks/health-di-race/env/data_description.txt @@ -0,0 +1,125 @@ +patient_age: Integer + +lifestyle: Categorical feature + Sedentary + Moderately-active + Active + Highly-active + Unhealthy + Healthy + At-risk + +insurance_provider : Categorical + UnitedHealthcare + Blue Cross Blue Shield + Aetna + Cigna + Humana + Kaiser Permanente + Anthem + Molina Healthcare + Centene Corporation + Highmark Health + Health Net + WellCare Health Plans + CareFirst + Independence Blue Cross + Fidelis Care + Oscar Health + +marital-status: Categorical + Married-civ-spouse, + Divorced, + Never-married, + Separated, + Widowed, + Married-spouse-absent, + Married-AF-spouse + +procedure_on_the_visit : Categorical + Administrative-check + Specialist-consultation + Minor-procedure + Major-surgery + General-checkup + Diagnostic-test + Rehabilitation + Emergency + Preventive-screening + Therapy-session + Follow-up + Mental-health-consultation + Chronic-disease-management + Vaccination + +relationship : Categorical + Wife, + Own-child, + Husband, + Not-in-family, + Other-relative, + Unmarried + +race : Categorical + White, + Asian-Pac-Islander, + Amer-Indian-Eskimo, + Other, + Black + +gender : Binary + Female + Male + +cholesterol : Integer + +blood-pressure : Integer + +daily-activity-hours : Integer 
+ +reason-for-visit : Categorical + General-checkup + Routine-follow-up + New-symptom + Fever + Cold-Flu-Cough + Headache-migraine + Heart-concerns + Respiratory-issue + Digestive-issue + Pain-related + Skin-concern + Eye + ENT + Mental-health-concerns + Stress + Fatigue-weakness + Weight-concerns + Diabetes/Bloodsugar + BP-concern + Cholesterol-lipid-concern + Allergy-symptoms + Prenatal + Gynecological-concern + Pediatrics + Age-related + Chronic-disease + Medication-side-effect + Injury + Trauma + Preventive-counseling + Genetic-risks + Vaccination-inquiry + Health-counseling + Substance-concern + Surgical-consultation + Specialist-referral + Discharge + Lab-result-discussion + Second-opinion + Administrative-inquiry + Other + +health_risk : binary target + 1 + 0 \ No newline at end of file diff --git a/fairnessBench/benchmarks/health-di-race/env/train.py b/fairnessBench/benchmarks/health-di-race/env/train.py new file mode 100644 index 0000000..578e4c3 --- /dev/null +++ b/fairnessBench/benchmarks/health-di-race/env/train.py @@ -0,0 +1,44 @@ +import numpy as np +import pandas as pd +from aif360.datasets import BinaryLabelDataset +from aif360.metrics import ClassificationMetric +from sklearn.linear_model import LogisticRegression + + +train_X = pd.read_csv("train_X.csv", index_col=0) +train_y = pd.read_csv("train_y.csv", index_col=0) +test_X = pd.read_csv("test_X.csv", index_col=0) +test_y = pd.read_csv("test_y.csv", index_col=0) + +lr = LogisticRegression(max_iter=1000) +lr.fit(train_X, train_y) + +pred_y = lr.predict(test_X) + +prot_attr = test_X['race'] + +output = pd.DataFrame({"Actual_y": test_y.values.ravel(), "Predicted_y": pred_y, "race" : prot_attr}) + +privileged_groups = [{'race': 1}] +unprivileged_groups = [{'race': 0}] + +binaryLabelDataset = BinaryLabelDataset(favorable_label=1, unfavorable_label=0, + df=output, label_names=['Actual_y'], + protected_attribute_names=['race']) + +binaryLabelDataset_pred = binaryLabelDataset.copy() +binaryLabelDataset_pred.labels = output['Predicted_y'].values.reshape(-1, 1) + +metric = ClassificationMetric(binaryLabelDataset, binaryLabelDataset_pred, + unprivileged_groups=unprivileged_groups, + privileged_groups=privileged_groups) + +accuracy = metric.accuracy() + +disparate_impact = metric.disparate_impact() + +print('accuracy score:', accuracy) +print('disparate impact ratio', disparate_impact) + +output.to_csv("submission.csv", index=False) + diff --git a/fairnessBench/benchmarks/health-di-race/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/health-di-race/scripts/env_read_only_files.txt new file mode 100644 index 0000000..5ee277f --- /dev/null +++ b/fairnessBench/benchmarks/health-di-race/scripts/env_read_only_files.txt @@ -0,0 +1,4 @@ +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/health-di-race/scripts/eval.py b/fairnessBench/benchmarks/health-di-race/scripts/eval.py new file mode 100644 index 0000000..88dd899 --- /dev/null +++ b/fairnessBench/benchmarks/health-di-race/scripts/eval.py @@ -0,0 +1,45 @@ +import os +import numpy as np +import pandas as pd +from aif360.datasets import BinaryLabelDataset +from aif360.metrics import ClassificationMetric + +def get_score(agent_files): + submission_path = os.path.join(agent_files, "submission.csv") + submission = pd.read_csv(submission_path) + + privileged_groups = [{'race': 1}] + unprivileged_groups = [{'race': 0}] + + binaryLabelDataset = BinaryLabelDataset(favorable_label=1, unfavorable_label=0, + df=submission, 
label_names=['Actual_y'], + protected_attribute_names=['race']) + + binaryLabelDataset_pred = binaryLabelDataset.copy() + binaryLabelDataset_pred.labels = submission['Predicted_y'].values.reshape(-1, 1) + + metric = ClassificationMetric(binaryLabelDataset, binaryLabelDataset_pred, + unprivileged_groups=unprivileged_groups, + privileged_groups=privileged_groups) + + accuracy = metric.accuracy() + precision = metric.precision() + recall = metric.true_positive_rate() + + disparate_impact = metric.disparate_impact() + statistical_parity_diff = metric.statistical_parity_difference() + equal_opp_diff = metric.equal_opportunity_difference() + error_rate_diff = metric.error_rate_difference() + error_rate_ratio = metric.error_rate_ratio() + false_omission_rate_diff = metric.false_omission_rate_difference() + + performance = {"acc": accuracy, "precision": precision, "recall": recall, "di": disparate_impact, + "statistical_parity_diff": statistical_parity_diff, "equal_opp_diff": equal_opp_diff, + "error_rate_diff": error_rate_diff, "error_rate_ratio" : error_rate_ratio, + "false_omission_rate_diff": false_omission_rate_diff} + return performance + + +if __name__ == "__main__": + print(get_score("."))  # "." assumes submission.csv sits in the current directory; the harness passes the real agent files path + \ No newline at end of file diff --git a/fairnessBench/benchmarks/health-di-race/scripts/prepare.py b/fairnessBench/benchmarks/health-di-race/scripts/prepare.py new file mode 100644 index 0000000..d79ea06 --- /dev/null +++ b/fairnessBench/benchmarks/health-di-race/scripts/prepare.py @@ -0,0 +1,104 @@ +# prepared by the user when setting up + +import os +import numpy as np +import pandas as pd +from ucimlrepo import fetch_ucirepo +from sklearn.preprocessing import MinMaxScaler +from sklearn.preprocessing import LabelEncoder +from sklearn.model_selection import train_test_split + +adult = fetch_ucirepo(id=2) + +X = adult.data.features.copy() +y = adult.data.targets.copy() + +y["income"] = y["income"].str.rstrip('.') +y["income"] = y["income"].apply(lambda i: 1 if i == '>50K' else 0) +X["race"] = X["race"].apply(lambda i: 1 if i == 'White' else 0) +X = X.replace('?', np.nan) + +ad = pd.concat([X, y], axis=1) +ad = ad.dropna() + +# SR: scaling the values in the capital-gain, capital-loss and hours-per-week cols into a realistic range. This preserves relative ordering while only changing the scale of the respective cols. + +cholesterol_scale = MinMaxScaler(feature_range=(100, 300)) +ad['cholesterol'] = cholesterol_scale.fit_transform(ad[['capital-gain']]) + +bp_scale = MinMaxScaler(feature_range=(80, 180)) +ad['blood-pressure'] = bp_scale.fit_transform(ad[['capital-loss']]) + +activity_scale = MinMaxScaler(feature_range=(0, 24)) +ad['daily-activity-hours'] = activity_scale.fit_transform(ad[['hours-per-week']]) + +# dropping unwanted cols; 'fnlwgt' and 'education-num' have never been used in any analysis with the adult data so far.
+ad.drop(['capital-gain', 'capital-loss', 'hours-per-week', 'fnlwgt', 'education-num'], axis=1, inplace=True) + +# column values mapping eg: occupation = Adm-clerical is mapped to procedure = Administrative-check +occupation_to_procedure = {'Adm-clerical': 'Administrative-check', 'Exec-managerial': 'Specialist-consultation', + 'Handlers-cleaners': 'Minor-procedure', 'Prof-specialty': 'Major-surgery', + 'Other-service': 'General-checkup', 'Sales': 'Diagnostic-test', + 'Craft-repair': 'Rehabilitation', 'Transport-moving': 'Emergency', + 'Farming-fishing': 'Preventive-screening', 'Machine-op-inspct': 'Therapy-session', + 'Tech-support': 'Follow-up', 'Protective-serv': 'Mental-health-consultation', + 'Armed-Forces': 'Chronic-disease-management', 'Priv-house-serv': 'Vaccination'} + + +# column values mapping eg: country = United-States is mapped to reason = General-checkup +country_to_reason = {'United-States': 'General-checkup', 'Cuba': 'Routine-follow-up', 'Jamaica': 'New-symptom', + 'India': 'Fever', 'Mexico': 'Cold-Flu-Cough', 'South': 'Headache-migraine', + 'Puerto-Rico': 'Heart-concerns', 'Honduras': 'Respiratory-issue', 'England': 'Digestive-issue', + 'Canada': 'Pain-related', 'Germany': 'Skin-concern', 'Iran': 'Eye', 'Philippines': 'ENT', + 'Italy': 'Mental-health-concerns', 'Poland': 'Stress', 'Columbia': 'Fatigue-weakness', + 'Cambodia': 'Weight-concerns', 'Thailand': 'Diabetes/Bloodsugar', 'Ecuador': 'BP-concern', + 'Laos': 'Cholesterol-lipid-concern', 'Taiwan': 'Allergy-symptoms', 'Haiti': 'Prenatal', + 'Portugal': 'Gynecological-concern', 'Dominican-Republic': 'Pediatrics', + 'El-Salvador': 'Age-related', 'France': 'Chronic-disease', 'Guatemala': 'Medication-side-effect', + 'China': 'Injury', 'Japan': 'Trauma', 'Yugoslavia': 'Preventive-counseling', 'Peru': 'Genetic-risks', + 'Outlying-US(Guam-USVI-etc)': 'Vaccination-inquiry', 'Scotland': 'Health-counseling', + 'Trinadad&Tobago': 'Substance-concern', 'Greece': 'Surgical-consultation', + 'Nicaragua': 'Specialist-referral', 'Vietnam': 'Discharge', 'Hong': 'Lab-result-discussion', + 'Ireland': 'Second-opinion', 'Hungary': 'Administrative-inquiry', 'Holand-Netherlands': 'Other'} + +# column values mapping eg: workclass = Private is mapped to lifestyle = Sedentary +workclass_to_lifestyle = { 'Private':'Sedentary', 'Self-emp-not-inc':'Moderately-active', 'Local-gov':'Active', + 'State-gov': 'Highly-active', 'Self-emp-inc': 'Unhealthy', 'Federal-gov': 'Healthy', + 'Without-pay': 'At-risk'} + +# column values mapping eg: education = HS-grad is mapped to insurance_provider = UnitedHealthcare +education_to_insurance_provider = {'HS-grad':'UnitedHealthcare', 'Some-college': 'Blue Cross Blue Shield', 'Bachelors': 'Aetna', + 'Masters': 'Cigna', 'Assoc-voc': 'Humana', '11th': 'Kaiser Permanente', 'Assoc-acdm': 'Anthem', + '10th': 'Molina Healthcare', '7th-8th': 'Centene Corporation', 'Prof-school': 'Highmark Health', + '9th': 'Health Net', '12th':'WellCare Health Plans', 'Doctorate':'CareFirst', + '5th-6th': 'Independence Blue Cross', '1st-4th': 'Fidelis Care', 'Preschool': 'Oscar Health'} + +new_col_list = [] + +new_col_list.append(pd.Series(ad['occupation'].map(occupation_to_procedure),name='procedure_on_the_day')) +new_col_list.append(pd.Series(ad['native-country'].map(country_to_reason),name='reason_for_visit')) +new_col_list.append(pd.Series(ad['workclass'].map(workclass_to_lifestyle), name='lifestyle')) +new_col_list.append(pd.Series(ad['education'].map(education_to_insurance_provider),name='insurance_provider')) + +ad = pd.concat([ad] + 
new_col_list, axis=1) + +ad = ad.drop(['occupation', 'native-country', 'workclass', 'education'], axis=1) +ad = ad.rename(columns={'age': 'patient_age', 'sex': 'gender', 'income': 'health_risk'}) + +cat_cols = ['marital-status', 'relationship', 'gender', 'lifestyle', + 'insurance_provider', 'procedure_on_the_day', 'reason_for_visit'] + + +ad_enc = pd.get_dummies(ad, columns=cat_cols) +ad_enc = ad_enc.astype(int) + +adX = ad_enc.drop(columns=['health_risk']) +ady = ad_enc['health_risk'] + +train_X, test_X, train_y, test_y = train_test_split(adX, ady, test_size=0.2, random_state=1) + +os.chdir("../env/") +train_X.to_csv("train_X.csv") +train_y.to_csv("train_y.csv") +test_X.to_csv("test_X.csv") +test_y.to_csv("test_y.csv") \ No newline at end of file diff --git a/fairnessBench/benchmarks/health-di-race/scripts/read_only_files.txt b/fairnessBench/benchmarks/health-di-race/scripts/read_only_files.txt new file mode 100644 index 0000000..5ee277f --- /dev/null +++ b/fairnessBench/benchmarks/health-di-race/scripts/read_only_files.txt @@ -0,0 +1,4 @@ +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/health-di-sex/env/data_description.txt b/fairnessBench/benchmarks/health-di-sex/env/data_description.txt new file mode 100644 index 0000000..1ef9d2f --- /dev/null +++ b/fairnessBench/benchmarks/health-di-sex/env/data_description.txt @@ -0,0 +1,125 @@ +patient_age: Integer + +lifestyle: Categorical feature + Sedentary + Moderately-active + Active + Highly-active + Unhealthy + Healthy + At-risk + +insurance_provider : Categorical + UnitedHealthcare + Blue Cross Blue Shield + Aetna + Cigna + Humana + Kaiser Permanente + Anthem + Molina Healthcare + Centene Corporation + Highmark Health + Health Net + WellCare Health Plans + CareFirst + Independence Blue Cross + Fidelis Care + Oscar Health + +marital-status: Categorical + Married-civ-spouse, + Divorced, + Never-married, + Separated, + Widowed, + Married-spouse-absent, + Married-AF-spouse + +procedure_on_the_visit : Categorical + Administrative-check + Specialist-consultation + Minor-procedure + Major-surgery + General-checkup + Diagnostic-test + Rehabilitation + Emergency + Preventive-screening + Therapy-session + Follow-up + Mental-health-consultation + Chronic-disease-management + Vaccination + +relationship : Categorical + Wife, + Own-child, + Husband, + Not-in-family, + Other-relative, + Unmarried + +race : Categorical + White, + Asian-Pac-Islander, + Amer-Indian-Eskimo, + Other, + Black + +gender : Binary + Female + Male + +cholesterol : Integer + +blood-pressure : Integer + +daily-activity-hours : Integer + +reason-for-visit : Categorical + General-checkup + Routine-follow-up + New-symptom + Fever + Cold-Flu-Cough + Headache-migraine + Heart-concerns + Respiratory-issue + Digestive-issue + Pain-related + Skin-concern + Eye + ENT + Mental-health-concerns + Stress + Fatigue-weakness + Weight-concerns + Diabetes/Bloodsugar + BP-concern + Cholesterol-lipid-concern + Allergy-symptoms + Prenatal + Gynecological-concern + Pediatrics + Age-related + Chronic-disease + Medication-side-effect + Injury + Trauma + Preventive-counseling + Genetic-risks + Vaccination-inquiry + Health-counseling + Substance-concern + Surgical-consultation + Specialist-referral + Discharge + Lab-result-discussion + Second-opinion + Administrative-inquiry + Other + +health_risk : binary target + 1 + 0 \ No newline at end of file diff --git a/fairnessBench/benchmarks/health-di-sex/env/train.py 
b/fairnessBench/benchmarks/health-di-sex/env/train.py new file mode 100644 index 0000000..bc908e4 --- /dev/null +++ b/fairnessBench/benchmarks/health-di-sex/env/train.py @@ -0,0 +1,44 @@ +import numpy as np +import pandas as pd +from aif360.datasets import BinaryLabelDataset +from aif360.metrics import ClassificationMetric +from sklearn.linear_model import LogisticRegression + + +train_X = pd.read_csv("train_X.csv", index_col=0) +train_y = pd.read_csv("train_y.csv", index_col=0) +test_X = pd.read_csv("test_X.csv", index_col=0) +test_y = pd.read_csv("test_y.csv", index_col=0) + +lr = LogisticRegression(max_iter=1000) +lr.fit(train_X, train_y) + +pred_y = lr.predict(test_X) + +prot_attr = test_X['gender'] + +output = pd.DataFrame({"Actual_y": test_y.values.ravel(), "Predicted_y": pred_y, "gender" : prot_attr}) + +privileged_groups = [{'gender': 1}] +unprivileged_groups = [{'gender': 0}] + +binaryLabelDataset = BinaryLabelDataset(favorable_label=1, unfavorable_label=0, + df=output, label_names=['Actual_y'], + protected_attribute_names=['gender']) + +binaryLabelDataset_pred = binaryLabelDataset.copy() +binaryLabelDataset_pred.labels = output['Predicted_y'].values.reshape(-1, 1) + +metric = ClassificationMetric(binaryLabelDataset, binaryLabelDataset_pred, + unprivileged_groups=unprivileged_groups, + privileged_groups=privileged_groups) + +accuracy = metric.accuracy() + +disparate_impact = metric.disparate_impact() + +print('accuracy score:', accuracy) +print('disparate impact ratio', disparate_impact) + +output.to_csv("submission.csv", index=False) + diff --git a/fairnessBench/benchmarks/health-di-sex/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/health-di-sex/scripts/env_read_only_files.txt new file mode 100644 index 0000000..5ee277f --- /dev/null +++ b/fairnessBench/benchmarks/health-di-sex/scripts/env_read_only_files.txt @@ -0,0 +1,4 @@ +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/health-di-sex/scripts/eval.py b/fairnessBench/benchmarks/health-di-sex/scripts/eval.py new file mode 100644 index 0000000..ede1df7 --- /dev/null +++ b/fairnessBench/benchmarks/health-di-sex/scripts/eval.py @@ -0,0 +1,45 @@ +import os +import numpy as np +import pandas as pd +from aif360.datasets import BinaryLabelDataset +from aif360.metrics import ClassificationMetric + +def get_score(agent_files): + submission_path = os.path.join(agent_files, "submission.csv") + submission = pd.read_csv(submission_path) + + privileged_groups = [{'gender': 1}] + unprivileged_groups = [{'gender': 0}] + + binaryLabelDataset = BinaryLabelDataset(favorable_label=1, unfavorable_label=0, + df=submission, label_names=['Actual_y'], + protected_attribute_names=['gender']) + + binaryLabelDataset_pred = binaryLabelDataset.copy() + binaryLabelDataset_pred.labels = submission['Predicted_y'].values.reshape(-1, 1) + + metric = ClassificationMetric(binaryLabelDataset, binaryLabelDataset_pred, + unprivileged_groups=unprivileged_groups, + privileged_groups=privileged_groups) + + accuracy = metric.accuracy() + precision = metric.precision() + recall = metric.true_positive_rate() + + disparate_impact = metric.disparate_impact() + statistical_parity_diff = metric.statistical_parity_difference() + equal_opp_diff = metric.equal_opportunity_difference() + error_rate_diff = metric.error_rate_difference() + error_rate_ratio = metric.error_rate_ratio() + false_omission_rate_diff = metric.false_omission_rate_difference() + + performance = {"acc": accuracy, "precision": precision, "recall": 
recall, "di": disparate_impact, + "statistical_parity_diff": statistical_parity_diff, "equal_opp_diff": equal_opp_diff, + "error_rate_diff": error_rate_diff, "error_rate_ratio" : error_rate_ratio, + "false_omission_rate_diff": false_omission_rate_diff} + return performance + + +if __name__ == "__main__": + print(get_score("."))  # "." assumes submission.csv sits in the current directory; the harness passes the real agent files path + \ No newline at end of file diff --git a/fairnessBench/benchmarks/health-di-sex/scripts/prepare.py b/fairnessBench/benchmarks/health-di-sex/scripts/prepare.py new file mode 100644 index 0000000..b86b911 --- /dev/null +++ b/fairnessBench/benchmarks/health-di-sex/scripts/prepare.py @@ -0,0 +1,104 @@ +# prepared by the user when setting up + +import os +import numpy as np +import pandas as pd +from ucimlrepo import fetch_ucirepo +from sklearn.preprocessing import MinMaxScaler +from sklearn.preprocessing import LabelEncoder +from sklearn.model_selection import train_test_split + +adult = fetch_ucirepo(id=2) + +X = adult.data.features.copy() +y = adult.data.targets.copy() + +y["income"] = y["income"].str.rstrip('.') +y["income"] = y["income"].apply(lambda i: 1 if i == '>50K' else 0) +X["sex"] = X["sex"].apply(lambda i: 1 if i == 'Male' else 0) +X = X.replace('?', np.nan) + +ad = pd.concat([X, y], axis=1) +ad = ad.dropna() + +# SR: scaling the values in the capital-gain, capital-loss and hours-per-week cols into a realistic range. This preserves relative ordering while only changing the scale of the respective cols. + +cholesterol_scale = MinMaxScaler(feature_range=(100, 300)) +ad['cholesterol'] = cholesterol_scale.fit_transform(ad[['capital-gain']]) + +bp_scale = MinMaxScaler(feature_range=(80, 180)) +ad['blood-pressure'] = bp_scale.fit_transform(ad[['capital-loss']]) + +activity_scale = MinMaxScaler(feature_range=(0, 24)) +ad['daily-activity-hours'] = activity_scale.fit_transform(ad[['hours-per-week']]) + +# dropping unwanted cols; 'fnlwgt' and 'education-num' have never been used in any analysis with the adult data so far.
+ad.drop(['capital-gain', 'capital-loss', 'hours-per-week', 'fnlwgt', 'education-num'], axis=1, inplace=True) + +# column values mapping eg: occupation = Adm-clerical is mapped to procedure = Administrative-check +occupation_to_procedure = {'Adm-clerical': 'Administrative-check', 'Exec-managerial': 'Specialist-consultation', + 'Handlers-cleaners': 'Minor-procedure', 'Prof-specialty': 'Major-surgery', + 'Other-service': 'General-checkup', 'Sales': 'Diagnostic-test', + 'Craft-repair': 'Rehabilitation', 'Transport-moving': 'Emergency', + 'Farming-fishing': 'Preventive-screening', 'Machine-op-inspct': 'Therapy-session', + 'Tech-support': 'Follow-up', 'Protective-serv': 'Mental-health-consultation', + 'Armed-Forces': 'Chronic-disease-management', 'Priv-house-serv': 'Vaccination'} + + +# column values mapping eg: country = United-States is mapped to reason = General-checkup +country_to_reason = {'United-States': 'General-checkup', 'Cuba': 'Routine-follow-up', 'Jamaica': 'New-symptom', + 'India': 'Fever', 'Mexico': 'Cold-Flu-Cough', 'South': 'Headache-migraine', + 'Puerto-Rico': 'Heart-concerns', 'Honduras': 'Respiratory-issue', 'England': 'Digestive-issue', + 'Canada': 'Pain-related', 'Germany': 'Skin-concern', 'Iran': 'Eye', 'Philippines': 'ENT', + 'Italy': 'Mental-health-concerns', 'Poland': 'Stress', 'Columbia': 'Fatigue-weakness', + 'Cambodia': 'Weight-concerns', 'Thailand': 'Diabetes/Bloodsugar', 'Ecuador': 'BP-concern', + 'Laos': 'Cholesterol-lipid-concern', 'Taiwan': 'Allergy-symptoms', 'Haiti': 'Prenatal', + 'Portugal': 'Gynecological-concern', 'Dominican-Republic': 'Pediatrics', + 'El-Salvador': 'Age-related', 'France': 'Chronic-disease', 'Guatemala': 'Medication-side-effect', + 'China': 'Injury', 'Japan': 'Trauma', 'Yugoslavia': 'Preventive-counseling', 'Peru': 'Genetic-risks', + 'Outlying-US(Guam-USVI-etc)': 'Vaccination-inquiry', 'Scotland': 'Health-counseling', + 'Trinadad&Tobago': 'Substance-concern', 'Greece': 'Surgical-consultation', + 'Nicaragua': 'Specialist-referral', 'Vietnam': 'Discharge', 'Hong': 'Lab-result-discussion', + 'Ireland': 'Second-opinion', 'Hungary': 'Administrative-inquiry', 'Holand-Netherlands': 'Other'} + +# column values mapping eg: workclass = Private is mapped to lifestyle = Sedentary +workclass_to_lifestyle = { 'Private':'Sedentary', 'Self-emp-not-inc':'Moderately-active', 'Local-gov':'Active', + 'State-gov': 'Highly-active', 'Self-emp-inc': 'Unhealthy', 'Federal-gov': 'Healthy', + 'Without-pay': 'At-risk'} + +# column values mapping eg: education = HS-grad is mapped to insurance_provider = UnitedHealthcare +education_to_insurance_provider = {'HS-grad':'UnitedHealthcare', 'Some-college': 'Blue Cross Blue Shield', 'Bachelors': 'Aetna', + 'Masters': 'Cigna', 'Assoc-voc': 'Humana', '11th': 'Kaiser Permanente', 'Assoc-acdm': 'Anthem', + '10th': 'Molina Healthcare', '7th-8th': 'Centene Corporation', 'Prof-school': 'Highmark Health', + '9th': 'Health Net', '12th':'WellCare Health Plans', 'Doctorate':'CareFirst', + '5th-6th': 'Independence Blue Cross', '1st-4th': 'Fidelis Care', 'Preschool': 'Oscar Health'} + +new_col_list = [] + +new_col_list.append(pd.Series(ad['occupation'].map(occupation_to_procedure),name='procedure_on_the_day')) +new_col_list.append(pd.Series(ad['native-country'].map(country_to_reason),name='reason_for_visit')) +new_col_list.append(pd.Series(ad['workclass'].map(workclass_to_lifestyle), name='lifestyle')) +new_col_list.append(pd.Series(ad['education'].map(education_to_insurance_provider),name='insurance_provider')) + +ad = pd.concat([ad] + 
new_col_list, axis=1) + +ad = ad.drop(['occupation', 'native-country', 'workclass', 'education'], axis=1) +ad = ad.rename(columns={'age': 'patient_age', 'sex': 'gender', 'income': 'health_risk'}) + +cat_cols = ['marital-status', 'relationship', 'race', 'lifestyle', + 'insurance_provider', 'procedure_on_the_day', 'reason_for_visit'] + + +ad_enc = pd.get_dummies(ad, columns=cat_cols) +ad_enc = ad_enc.astype(int) + +adX = ad_enc.drop(columns=['health_risk']) +ady = ad_enc['health_risk'] + +train_X, test_X, train_y, test_y = train_test_split(adX, ady, test_size=0.2, random_state=1) + +os.chdir("../env/") +train_X.to_csv("train_X.csv") +train_y.to_csv("train_y.csv") +test_X.to_csv("test_X.csv") +test_y.to_csv("test_y.csv") \ No newline at end of file diff --git a/fairnessBench/benchmarks/health-di-sex/scripts/read_only_files.txt b/fairnessBench/benchmarks/health-di-sex/scripts/read_only_files.txt new file mode 100644 index 0000000..5ee277f --- /dev/null +++ b/fairnessBench/benchmarks/health-di-sex/scripts/read_only_files.txt @@ -0,0 +1,4 @@ +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/health-eod-race/env/data_description.txt b/fairnessBench/benchmarks/health-eod-race/env/data_description.txt new file mode 100644 index 0000000..1ef9d2f --- /dev/null +++ b/fairnessBench/benchmarks/health-eod-race/env/data_description.txt @@ -0,0 +1,125 @@ +patient_age: Integer + +lifestyle: Categorical feature + Sedentary + Moderately-active + Active + Highly-active + Unhealthy + Healthy + At-risk + +insurance_provider : Categorical + UnitedHealthcare + Blue Cross Blue Shield + Aetna + Cigna + Humana + Kaiser Permanente + Anthem + Molina Healthcare + Centene Corporation + Highmark Health + Health Net + WellCare Health Plans + CareFirst + Independence Blue Cross + Fidelis Care + Oscar Health + +marital-status: Categorical + Married-civ-spouse, + Divorced, + Never-married, + Separated, + Widowed, + Married-spouse-absent, + Married-AF-spouse + +procedure_on_the_visit : Categorical + Administrative-check + Specialist-consultation + Minor-procedure + Major-surgery + General-checkup + Diagnostic-test + Rehabilitation + Emergency + Preventive-screening + Therapy-session + Follow-up + Mental-health-consultation + Chronic-disease-management + Vaccination + +relationship : Categorical + Wife, + Own-child, + Husband, + Not-in-family, + Other-relative, + Unmarried + +race : Categorical + White, + Asian-Pac-Islander, + Amer-Indian-Eskimo, + Other, + Black + +gender : Binary + Female + Male + +cholesterol : Integer + +blood-pressure : Integer + +daily-activity-hours : Integer + +reason-for-visit : Categorical + General-checkup + Routine-follow-up + New-symptom + Fever + Cold-Flu-Cough + Headache-migraine + Heart-concerns + Respiratory-issue + Digestive-issue + Pain-related + Skin-concern + Eye + ENT + Mental-health-concerns + Stress + Fatigue-weakness + Weight-concerns + Diabetes/Bloodsugar + BP-concern + Cholesterol-lipid-concern + Allergy-symptoms + Prenatal + Gynecological-concern + Pediatrics + Age-related + Chronic-disease + Medication-side-effect + Injury + Trauma + Preventive-counseling + Genetic-risks + Vaccination-inquiry + Health-counseling + Substance-concern + Surgical-consultation + Specialist-referral + Discharge + Lab-result-discussion + Second-opinion + Administrative-inquiry + Other + +health_risk : binary target + 1 + 0 \ No newline at end of file diff --git a/fairnessBench/benchmarks/health-eod-race/env/train.py 
b/fairnessBench/benchmarks/health-eod-race/env/train.py new file mode 100644 index 0000000..8ba7f61 --- /dev/null +++ b/fairnessBench/benchmarks/health-eod-race/env/train.py @@ -0,0 +1,44 @@ +import numpy as np +import pandas as pd +from aif360.datasets import BinaryLabelDataset +from aif360.metrics import ClassificationMetric +from sklearn.linear_model import LogisticRegression + + +train_X = pd.read_csv("train_X.csv", index_col=0) +train_y = pd.read_csv("train_y.csv", index_col=0) +test_X = pd.read_csv("test_X.csv", index_col=0) +test_y = pd.read_csv("test_y.csv", index_col=0) + +lr = LogisticRegression(max_iter=1000) +lr.fit(train_X, train_y) + +pred_y = lr.predict(test_X) + +prot_attr = test_X['race'] + +output = pd.DataFrame({"Actual_y": test_y.values.ravel(), "Predicted_y": pred_y, "race" : prot_attr}) + +privileged_groups = [{'race': 1}] +unprivileged_groups = [{'race': 0}] + +binaryLabelDataset = BinaryLabelDataset(favorable_label=1, unfavorable_label=0, + df=output, label_names=['Actual_y'], + protected_attribute_names=['race']) + +binaryLabelDataset_pred = binaryLabelDataset.copy() +binaryLabelDataset_pred.labels = output['Predicted_y'].values.reshape(-1, 1) + +metric = ClassificationMetric(binaryLabelDataset, binaryLabelDataset_pred, + unprivileged_groups=unprivileged_groups, + privileged_groups=privileged_groups) + +accuracy = metric.accuracy() + +equal_opp_diff = metric.equal_opportunity_difference() + +print('accuracy score:', accuracy) +print('equal opportunity difference', equal_opp_diff) + +output.to_csv("submission.csv", index=False) + diff --git a/fairnessBench/benchmarks/health-eod-race/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/health-eod-race/scripts/env_read_only_files.txt new file mode 100644 index 0000000..5ee277f --- /dev/null +++ b/fairnessBench/benchmarks/health-eod-race/scripts/env_read_only_files.txt @@ -0,0 +1,4 @@ +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/health-eod-race/scripts/eval.py b/fairnessBench/benchmarks/health-eod-race/scripts/eval.py new file mode 100644 index 0000000..88dd899 --- /dev/null +++ b/fairnessBench/benchmarks/health-eod-race/scripts/eval.py @@ -0,0 +1,45 @@ +import os +import numpy as np +import pandas as pd +from aif360.datasets import BinaryLabelDataset +from aif360.metrics import ClassificationMetric + +def get_score(agent_files): + submission_path = os.path.join(agent_files, "submission.csv") + submission = pd.read_csv(submission_path) + + privileged_groups = [{'race': 1}] + unprivileged_groups = [{'race': 0}] + + binaryLabelDataset = BinaryLabelDataset(favorable_label=1, unfavorable_label=0, + df=submission, label_names=['Actual_y'], + protected_attribute_names=['race']) + + binaryLabelDataset_pred = binaryLabelDataset.copy() + binaryLabelDataset_pred.labels = submission['Predicted_y'].values.reshape(-1, 1) + + metric = ClassificationMetric(binaryLabelDataset, binaryLabelDataset_pred, + unprivileged_groups=unprivileged_groups, + privileged_groups=privileged_groups) + + accuracy = metric.accuracy() + precision = metric.precision() + recall = metric.true_positive_rate() + + disparate_impact = metric.disparate_impact() + statistical_parity_diff = metric.statistical_parity_difference() + equal_opp_diff = metric.equal_opportunity_difference() + error_rate_diff = metric.error_rate_difference() + error_rate_ratio = metric.error_rate_ratio() + false_omission_rate_diff = metric.false_omission_rate_difference() + + performance = {"acc": accuracy, "precision": precision, 
"recall": recall, "di": disparate_impact, + "statistical_parity_diff": statistical_parity_diff, "equal_opp_diff": equal_opp_diff, + "error_rate_diff": error_rate_diff, "error_rate_ratio" : error_rate_ratio, + "false_omission_rate_diff": false_omission_rate_diff} + return performance + + +if __name__ == "__main__": + print(get_score()) + \ No newline at end of file diff --git a/fairnessBench/benchmarks/health-eod-race/scripts/prepare.py b/fairnessBench/benchmarks/health-eod-race/scripts/prepare.py new file mode 100644 index 0000000..d79ea06 --- /dev/null +++ b/fairnessBench/benchmarks/health-eod-race/scripts/prepare.py @@ -0,0 +1,104 @@ +# prepared by user when setting up + +import os +import numpy as np +import pandas as pd +from ucimlrepo import fetch_ucirepo +from sklearn.preprocessing import MinMaxScaler +from sklearn.preprocessing import LabelEncoder +from sklearn.model_selection import train_test_split + +adult = fetch_ucirepo(id=2) + +X = adult.data.features.copy() +y = adult.data.targets.copy() + +y["income"] = y["income"].str.rstrip('.') +y["income"] = y["income"].apply(lambda i: 1 if i == '>50K' else 0) +X["race"] = X["race"].apply(lambda i: 1 if i == 'White' else 0) +X = X.replace('?', np.nan) + +ad = pd.concat([X, y], axis=1) +ad = ad.dropna() + +# SR: scaling the values in cols capital-gain, capital-loss and hours-per-week into a realistic range. It preserves relative ordering while only changing the scale for respective cols. + +cholesterol_scale = MinMaxScaler(feature_range=(100, 300)) +ad['cholesterol'] = cholesterol_scale.fit_transform(ad[['capital-gain']]) + +bp_scale = MinMaxScaler(feature_range=(80, 180)) +ad['blood-pressure'] = bp_scale.fit_transform(ad[['capital-loss']]) + +activity_scale = MinMaxScaler(feature_range=(0, 24)) +ad['daily-activity-hours'] = activity_scale.fit_transform(ad[['hours-per-week']]) + +# dropping unwanted cols, we have never used these two cols ('fnlwgt', 'education-num') in any analysis so far with adult data. 
+ad.drop(['capital-gain', 'capital-loss', 'hours-per-week', 'fnlwgt', 'education-num'], axis=1, inplace=True) + +# column values mapping eg: occupation = Adm-clerical is mapped to procedure = Administrative-check +occupation_to_procedure = {'Adm-clerical': 'Administrative-check', 'Exec-managerial': 'Specialist-consultation', + 'Handlers-cleaners': 'Minor-procedure', 'Prof-specialty': 'Major-surgery', + 'Other-service': 'General-checkup', 'Sales': 'Diagnostic-test', + 'Craft-repair': 'Rehabilitation', 'Transport-moving': 'Emergency', + 'Farming-fishing': 'Preventive-screening', 'Machine-op-inspct': 'Therapy-session', + 'Tech-support': 'Follow-up', 'Protective-serv': 'Mental-health-consultation', + 'Armed-Forces': 'Chronic-disease-management', 'Priv-house-serv': 'Vaccination'} + + +# column values mapping eg: country = United-States is mapped to reason = General-checkup +country_to_reason = {'United-States': 'General-checkup', 'Cuba': 'Routine-follow-up', 'Jamaica': 'New-symptom', + 'India': 'Fever', 'Mexico': 'Cold-Flu-Cough', 'South': 'Headache-migraine', + 'Puerto-Rico': 'Heart-concerns', 'Honduras': 'Respiratory-issue', 'England': 'Digestive-issue', + 'Canada': 'Pain-related', 'Germany': 'Skin-concern', 'Iran': 'Eye', 'Philippines': 'ENT', + 'Italy': 'Mental-health-concerns', 'Poland': 'Stress', 'Columbia': 'Fatigue-weakness', + 'Cambodia': 'Weight-concerns', 'Thailand': 'Diabetes/Bloodsugar', 'Ecuador': 'BP-concern', + 'Laos': 'Cholesterol-lipid-concern', 'Taiwan': 'Allergy-symptoms', 'Haiti': 'Prenatal', + 'Portugal': 'Gynecological-concern', 'Dominican-Republic': 'Pediatrics', + 'El-Salvador': 'Age-related', 'France': 'Chronic-disease', 'Guatemala': 'Medication-side-effect', + 'China': 'Injury', 'Japan': 'Trauma', 'Yugoslavia': 'Preventive-counseling', 'Peru': 'Genetic-risks', + 'Outlying-US(Guam-USVI-etc)': 'Vaccination-inquiry', 'Scotland': 'Health-counseling', + 'Trinadad&Tobago': 'Substance-concern', 'Greece': 'Surgical-consultation', + 'Nicaragua': 'Specialist-referral', 'Vietnam': 'Discharge', 'Hong': 'Lab-result-discussion', + 'Ireland': 'Second-opinion', 'Hungary': 'Administrative-inquiry', 'Holand-Netherlands': 'Other'} + +# column values mapping eg: workclass = Private is mapped to lifestyle = Sedentary +workclass_to_lifestyle = { 'Private':'Sedentary', 'Self-emp-not-inc':'Moderately-active', 'Local-gov':'Active', + 'State-gov': 'Highly-active', 'Self-emp-inc': 'Unhealthy', 'Federal-gov': 'Healthy', + 'Without-pay': 'At-risk'} + +# column values mapping eg: education = HS-grad is mapped to insurance_provider = UnitedHealthcare +education_to_insurance_provider = {'HS-grad':'UnitedHealthcare', 'Some-college': 'Blue Cross Blue Shield', 'Bachelors': 'Aetna', + 'Masters': 'Cigna', 'Assoc-voc': 'Humana', '11th': 'Kaiser Permanente', 'Assoc-acdm': 'Anthem', + '10th': 'Molina Healthcare', '7th-8th': 'Centene Corporation', 'Prof-school': 'Highmark Health', + '9th': 'Health Net', '12th':'WellCare Health Plans', 'Doctorate':'CareFirst', + '5th-6th': 'Independence Blue Cross', '1st-4th': 'Fidelis Care', 'Preschool': 'Oscar Health'} + +new_col_list = [] + +new_col_list.append(pd.Series(ad['occupation'].map(occupation_to_procedure),name='procedure_on_the_day')) +new_col_list.append(pd.Series(ad['native-country'].map(country_to_reason),name='reason_for_visit')) +new_col_list.append(pd.Series(ad['workclass'].map(workclass_to_lifestyle), name='lifestyle')) +new_col_list.append(pd.Series(ad['education'].map(education_to_insurance_provider),name='insurance_provider')) + +ad = pd.concat([ad] + 
new_col_list, axis=1) + +ad = ad.drop(['occupation', 'native-country', 'workclass', 'education'], axis=1) +ad = ad.rename(columns={'age': 'patient_age', 'sex': 'gender', 'income': 'health_risk'}) + +cat_cols = ['marital-status', 'relationship', 'gender', 'lifestyle', + 'insurance_provider', 'procedure_on_the_day', 'reason_for_visit'] + + +ad_enc = pd.get_dummies(ad, columns=cat_cols) +ad_enc = ad_enc.astype(int) + +adX = ad_enc.drop(columns=['health_risk']) +ady = ad_enc['health_risk'] + +train_X, test_X, train_y, test_y = train_test_split(adX, ady, test_size=0.2, random_state=1) + +os.chdir("../env/") +train_X.to_csv("train_X.csv") +train_y.to_csv("train_y.csv") +test_X.to_csv("test_X.csv") +test_y.to_csv("test_y.csv") \ No newline at end of file diff --git a/fairnessBench/benchmarks/health-eod-race/scripts/read_only_files.txt b/fairnessBench/benchmarks/health-eod-race/scripts/read_only_files.txt new file mode 100644 index 0000000..5ee277f --- /dev/null +++ b/fairnessBench/benchmarks/health-eod-race/scripts/read_only_files.txt @@ -0,0 +1,4 @@ +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/health-eod-sex/env/data_description.txt b/fairnessBench/benchmarks/health-eod-sex/env/data_description.txt new file mode 100644 index 0000000..1ef9d2f --- /dev/null +++ b/fairnessBench/benchmarks/health-eod-sex/env/data_description.txt @@ -0,0 +1,125 @@ +patient_age: Integer + +lifestyle: Categorical feature + Sedentary + Moderately-active + Active + Highly-active + Unhealthy + Healthy + At-risk + +insurance_provider : Categorical + UnitedHealthcare + Blue Cross Blue Shield + Aetna + Cigna + Humana + Kaiser Permanente + Anthem + Molina Healthcare + Centene Corporation + Highmark Health + Health Net + WellCare Health Plans + CareFirst + Independence Blue Cross + Fidelis Care + Oscar Health + +marital-status: Categorical + Married-civ-spouse, + Divorced, + Never-married, + Separated, + Widowed, + Married-spouse-absent, + Married-AF-spouse + +procedure_on_the_visit : Categorical + Administrative-check + Specialist-consultation + Minor-procedure + Major-surgery + General-checkup + Diagnostic-test + Rehabilitation + Emergency + Preventive-screening + Therapy-session + Follow-up + Mental-health-consultation + Chronic-disease-management + Vaccination + +relationship : Categorical + Wife, + Own-child, + Husband, + Not-in-family, + Other-relative, + Unmarried + +race : Categorical + White, + Asian-Pac-Islander, + Amer-Indian-Eskimo, + Other, + Black + +gender : Binary + Female + Male + +cholesterol : Integer + +blood-pressure : Integer + +daily-activity-hours : Integer + +reason-for-visit : Categorical + General-checkup + Routine-follow-up + New-symptom + Fever + Cold-Flu-Cough + Headache-migraine + Heart-concerns + Respiratory-issue + Digestive-issue + Pain-related + Skin-concern + Eye + ENT + Mental-health-concerns + Stress + Fatigue-weakness + Weight-concerns + Diabetes/Bloodsugar + BP-concern + Cholesterol-lipid-concern + Allergy-symptoms + Prenatal + Gynecological-concern + Pediatrics + Age-related + Chronic-disease + Medication-side-effect + Injury + Trauma + Preventive-counseling + Genetic-risks + Vaccination-inquiry + Health-counseling + Substance-concern + Surgical-consultation + Specialist-referral + Discharge + Lab-result-discussion + Second-opinion + Administrative-inquiry + Other + +health_risk : binary target + 1 + 0 \ No newline at end of file diff --git a/fairnessBench/benchmarks/health-eod-sex/env/train.py 
b/fairnessBench/benchmarks/health-eod-sex/env/train.py new file mode 100644 index 0000000..2888465 --- /dev/null +++ b/fairnessBench/benchmarks/health-eod-sex/env/train.py @@ -0,0 +1,44 @@ +import numpy as np +import pandas as pd +from aif360.datasets import BinaryLabelDataset +from aif360.metrics import ClassificationMetric +from sklearn.linear_model import LogisticRegression + + +train_X = pd.read_csv("train_X.csv", index_col=0) +train_y = pd.read_csv("train_y.csv", index_col=0) +test_X = pd.read_csv("test_X.csv", index_col=0) +test_y = pd.read_csv("test_y.csv", index_col=0) + +lr = LogisticRegression(max_iter=1000) +lr.fit(train_X, train_y) + +pred_y = lr.predict(test_X) + +prot_attr = test_X['gender'] + +output = pd.DataFrame({"Actual_y": test_y.values.ravel(), "Predicted_y": pred_y, "gender" : prot_attr}) + +privileged_groups = [{'gender': 1}] +unprivileged_groups = [{'gender': 0}] + +binaryLabelDataset = BinaryLabelDataset(favorable_label=1, unfavorable_label=0, + df=output, label_names=['Actual_y'], + protected_attribute_names=['gender']) + +binaryLabelDataset_pred = binaryLabelDataset.copy() +binaryLabelDataset_pred.labels = output['Predicted_y'].values.reshape(-1, 1) + +metric = ClassificationMetric(binaryLabelDataset, binaryLabelDataset_pred, + unprivileged_groups=unprivileged_groups, + privileged_groups=privileged_groups) + +accuracy = metric.accuracy() + +equal_opp_diff = metric.equal_opportunity_difference() + +print('accuracy score:', accuracy) +print('equal opportunity difference', equal_opp_diff) + +output.to_csv("submission.csv", index=False) + diff --git a/fairnessBench/benchmarks/health-eod-sex/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/health-eod-sex/scripts/env_read_only_files.txt new file mode 100644 index 0000000..5ee277f --- /dev/null +++ b/fairnessBench/benchmarks/health-eod-sex/scripts/env_read_only_files.txt @@ -0,0 +1,4 @@ +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/health-eod-sex/scripts/eval.py b/fairnessBench/benchmarks/health-eod-sex/scripts/eval.py new file mode 100644 index 0000000..ede1df7 --- /dev/null +++ b/fairnessBench/benchmarks/health-eod-sex/scripts/eval.py @@ -0,0 +1,45 @@ +import os +import numpy as np +import pandas as pd +from aif360.datasets import BinaryLabelDataset +from aif360.metrics import ClassificationMetric + +def get_score(agent_files): + submission_path = os.path.join(agent_files, "submission.csv") + submission = pd.read_csv(submission_path) + + privileged_groups = [{'gender': 1}] + unprivileged_groups = [{'gender': 0}] + + binaryLabelDataset = BinaryLabelDataset(favorable_label=1, unfavorable_label=0, + df=submission, label_names=['Actual_y'], + protected_attribute_names=['gender']) + + binaryLabelDataset_pred = binaryLabelDataset.copy() + binaryLabelDataset_pred.labels = submission['Predicted_y'].values.reshape(-1, 1) + + metric = ClassificationMetric(binaryLabelDataset, binaryLabelDataset_pred, + unprivileged_groups=unprivileged_groups, + privileged_groups=privileged_groups) + + accuracy = metric.accuracy() + precision = metric.precision() + recall = metric.true_positive_rate() + + disparate_impact = metric.disparate_impact() + statistical_parity_diff = metric.statistical_parity_difference() + equal_opp_diff = metric.equal_opportunity_difference() + error_rate_diff = metric.error_rate_difference() + error_rate_ratio = metric.error_rate_ratio() + false_omission_rate_diff = metric.false_omission_rate_difference() + + performance = {"acc": accuracy, "precision": 
precision, "recall": recall, "di": disparate_impact, + "statistical_parity_diff": statistical_parity_diff, "equal_opp_diff": equal_opp_diff, + "error_rate_diff": error_rate_diff, "error_rate_ratio": error_rate_ratio, + "false_omission_rate_diff": false_omission_rate_diff} + return performance + + +if __name__ == "__main__": + print(get_score("."))  # assumption: run from the directory that contains submission.csv + \ No newline at end of file diff --git a/fairnessBench/benchmarks/health-eod-sex/scripts/prepare.py b/fairnessBench/benchmarks/health-eod-sex/scripts/prepare.py new file mode 100644 index 0000000..b86b911 --- /dev/null +++ b/fairnessBench/benchmarks/health-eod-sex/scripts/prepare.py @@ -0,0 +1,104 @@ +# prepared by user when setting up + +import os +import numpy as np +import pandas as pd +from ucimlrepo import fetch_ucirepo +from sklearn.preprocessing import MinMaxScaler +from sklearn.preprocessing import LabelEncoder +from sklearn.model_selection import train_test_split + +adult = fetch_ucirepo(id=2) + +X = adult.data.features.copy() +y = adult.data.targets.copy() + +y["income"] = y["income"].str.rstrip('.') +y["income"] = y["income"].apply(lambda i: 1 if i == '>50K' else 0) +X["sex"] = X["sex"].apply(lambda i: 1 if i == 'Male' else 0) +X = X.replace('?', np.nan) + +ad = pd.concat([X, y], axis=1) +ad = ad.dropna() + +# SR: scale the values in the capital-gain, capital-loss and hours-per-week cols into realistic clinical ranges. This preserves relative ordering while only changing the scale of the respective cols. + +cholesterol_scale = MinMaxScaler(feature_range=(100, 300)) +ad['cholesterol'] = cholesterol_scale.fit_transform(ad[['capital-gain']]) + +bp_scale = MinMaxScaler(feature_range=(80, 180)) +ad['blood-pressure'] = bp_scale.fit_transform(ad[['capital-loss']])  +activity_scale = MinMaxScaler(feature_range=(0, 24)) +ad['daily-activity-hours'] = activity_scale.fit_transform(ad[['hours-per-week']])
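+ +# For example (hypothetical values, not taken from the dataset): with feature_range=(100, 300), +# MinMaxScaler maps a column x to 100 + 200 * (x - x.min()) / (x.max() - x.min()), so capital-gain +# values of [0, 5000, 99999] would come out as roughly [100.0, 110.0, 300.0] - same ordering, new scale.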
+ +# dropping unwanted cols; the two cols 'fnlwgt' and 'education-num' have never been used in any analysis so far with the adult data. +ad.drop(['capital-gain', 'capital-loss', 'hours-per-week', 'fnlwgt', 'education-num'], axis=1, inplace=True) + +# column-value mapping, e.g. occupation = Adm-clerical is mapped to procedure = Administrative-check +occupation_to_procedure = {'Adm-clerical': 'Administrative-check', 'Exec-managerial': 'Specialist-consultation', + 'Handlers-cleaners': 'Minor-procedure', 'Prof-specialty': 'Major-surgery', + 'Other-service': 'General-checkup', 'Sales': 'Diagnostic-test', + 'Craft-repair': 'Rehabilitation', 'Transport-moving': 'Emergency', + 'Farming-fishing': 'Preventive-screening', 'Machine-op-inspct': 'Therapy-session', + 'Tech-support': 'Follow-up', 'Protective-serv': 'Mental-health-consultation', + 'Armed-Forces': 'Chronic-disease-management', 'Priv-house-serv': 'Vaccination'} + + +# column-value mapping, e.g. country = United-States is mapped to reason = General-checkup +country_to_reason = {'United-States': 'General-checkup', 'Cuba': 'Routine-follow-up', 'Jamaica': 'New-symptom', + 'India': 'Fever', 'Mexico': 'Cold-Flu-Cough', 'South': 'Headache-migraine', + 'Puerto-Rico': 'Heart-concerns', 'Honduras': 'Respiratory-issue', 'England': 'Digestive-issue', + 'Canada': 'Pain-related', 'Germany': 'Skin-concern', 'Iran': 'Eye', 'Philippines': 'ENT', + 'Italy': 'Mental-health-concerns', 'Poland': 'Stress', 'Columbia': 'Fatigue-weakness', + 'Cambodia': 'Weight-concerns', 'Thailand': 'Diabetes/Bloodsugar', 'Ecuador': 'BP-concern', + 'Laos': 'Cholesterol-lipid-concern', 'Taiwan': 'Allergy-symptoms', 'Haiti': 'Prenatal', + 'Portugal': 'Gynecological-concern', 'Dominican-Republic': 'Pediatrics', + 'El-Salvador': 'Age-related', 'France': 'Chronic-disease', 'Guatemala': 'Medication-side-effect', + 'China': 'Injury', 'Japan': 'Trauma', 'Yugoslavia': 'Preventive-counseling', 'Peru': 'Genetic-risks', + 'Outlying-US(Guam-USVI-etc)': 'Vaccination-inquiry', 'Scotland': 'Health-counseling', + 'Trinadad&Tobago': 'Substance-concern', 'Greece': 'Surgical-consultation', + 'Nicaragua': 'Specialist-referral', 'Vietnam': 'Discharge', 'Hong': 'Lab-result-discussion', + 'Ireland': 'Second-opinion', 'Hungary': 'Administrative-inquiry', 'Holand-Netherlands': 'Other'} + +# column-value mapping, e.g. workclass = Private is mapped to lifestyle = Sedentary +workclass_to_lifestyle = {'Private': 'Sedentary', 'Self-emp-not-inc': 'Moderately-active', 'Local-gov': 'Active', + 'State-gov': 'Highly-active', 'Self-emp-inc': 'Unhealthy', 'Federal-gov': 'Healthy', + 'Without-pay': 'At-risk'} + +# column-value mapping, e.g. education = HS-grad is mapped to insurance_provider = UnitedHealthcare +education_to_insurance_provider = {'HS-grad': 'UnitedHealthcare', 'Some-college': 'Blue Cross Blue Shield', 'Bachelors': 'Aetna', + 'Masters': 'Cigna', 'Assoc-voc': 'Humana', '11th': 'Kaiser Permanente', 'Assoc-acdm': 'Anthem', + '10th': 'Molina Healthcare', '7th-8th': 'Centene Corporation', 'Prof-school': 'Highmark Health', + '9th': 'Health Net', '12th': 'WellCare Health Plans', 'Doctorate': 'CareFirst', + '5th-6th': 'Independence Blue Cross', '1st-4th': 'Fidelis Care', 'Preschool': 'Oscar Health'} + +new_col_list = [] + +new_col_list.append(pd.Series(ad['occupation'].map(occupation_to_procedure), name='procedure_on_the_day')) +new_col_list.append(pd.Series(ad['native-country'].map(country_to_reason), name='reason_for_visit')) +new_col_list.append(pd.Series(ad['workclass'].map(workclass_to_lifestyle), name='lifestyle')) +new_col_list.append(pd.Series(ad['education'].map(education_to_insurance_provider), name='insurance_provider')) + +ad = pd.concat([ad] +
new_col_list, axis=1) + +ad = ad.drop(['occupation', 'native-country', 'workclass', 'education'], axis=1) +ad = ad.rename(columns={'age': 'patient_age', 'sex': 'gender', 'income': 'health_risk'}) + +cat_cols = ['marital-status', 'relationship', 'race', 'lifestyle', + 'insurance_provider', 'procedure_on_the_day', 'reason_for_visit'] + + +ad_enc = pd.get_dummies(ad, columns=cat_cols) +ad_enc = ad_enc.astype(int) + +adX = ad_enc.drop(columns=['health_risk']) +ady = ad_enc['health_risk'] + +train_X, test_X, train_y, test_y = train_test_split(adX, ady, test_size=0.2, random_state=1) + +os.chdir("../env/") +train_X.to_csv("train_X.csv") +train_y.to_csv("train_y.csv") +test_X.to_csv("test_X.csv") +test_y.to_csv("test_y.csv") \ No newline at end of file diff --git a/fairnessBench/benchmarks/health-eod-sex/scripts/read_only_files.txt b/fairnessBench/benchmarks/health-eod-sex/scripts/read_only_files.txt new file mode 100644 index 0000000..5ee277f --- /dev/null +++ b/fairnessBench/benchmarks/health-eod-sex/scripts/read_only_files.txt @@ -0,0 +1,4 @@ +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/non_descriptive_ad-di-patt1/env/data_description.txt b/fairnessBench/benchmarks/non_descriptive_ad-di-patt1/env/data_description.txt new file mode 100644 index 0000000..068cb06 --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-di-patt1/env/data_description.txt @@ -0,0 +1,125 @@ +patt2: Integer + +feat1 : Categorical feature + Private + Self-emp-not-inc + Self-emp-inc + Federal-gov + Local-gov + State-gov + Without-pay + Never-worked. + +feat2 : Categorical + Education Level + Bachelors + Some-college + 11th, HS-grad + Prof-school + Assoc-acdm + Assoc-voc + 9th + 7th-8th + 12th + Masters + 1st-4th + 10th + Doctorate + 5th-6th + Preschool + +patt3 : Categorical + Married-civ-spouse, + Divorced, + Never-married, + Separated, + Widowed, + Married-spouse-absent, + Married-AF-spouse + +feat3 : Categorical + Tech-support, + Craft-repair, + Other-service, + Sales, + Exec-managerial, + Prof-specialty, + Handlers-cleaners, + Machine-op-inspct, + Adm-clerical, + Farming-fishing, + Transport-moving, + Priv-house-serv, + Protective-serv, + Armed-Forces + +patt4 : Categorical + Wife, + Own-child, + Husband, + Not-in-family, + Other-relative, + Unmarried + +patt1 : Categorical + White, + Asian-Pac-Islander, + Amer-Indian-Eskimo, + Other, + Black + +patt5 : Binary + Female + Male + +feat4 : Integer + +feat5 : Integer + +feat6 : Integer + +feat7 : Categorical + United-States, + Cambodia, + England, + Puerto-Rico, + Canada, + Germany, + Outlying-US(Guam-USVI-etc), + India, + Japan, + Greece, + South, + China, + Cuba, + Iran, + Honduras, + Philippines, + Italy, + Poland, + Jamaica, + Vietnam, + Mexico, + Portugal, + Ireland, + France, + Dominican-Republic, + Laos, Ecuador, + Taiwan, + Haiti, + Columbia, + Hungary, + Guatemala, + Nicaragua, + Scotland, + Thailand, + Yugoslavia, + El-Salvador, + Trinadad&Tobago, + Peru, + Hong, + Holand-Netherlands + +target : binary target + <=50K + >50K. 
\ No newline at end of file diff --git a/fairnessBench/benchmarks/non_descriptive_ad-di-patt1/env/train.py b/fairnessBench/benchmarks/non_descriptive_ad-di-patt1/env/train.py new file mode 100644 index 0000000..93a14ea --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-di-patt1/env/train.py @@ -0,0 +1,44 @@ +import numpy as np +import pandas as pd +from aif360.datasets import BinaryLabelDataset +from aif360.metrics import ClassificationMetric +from sklearn.linear_model import LogisticRegression + + +train_X = pd.read_csv("train_X.csv", index_col=0) +train_y = pd.read_csv("train_y.csv", index_col=0) +test_X = pd.read_csv("test_X.csv", index_col=0) +test_y = pd.read_csv("test_y.csv", index_col=0) + +lr = LogisticRegression(max_iter=1000) +lr.fit(train_X, train_y) + +pred_y = lr.predict(test_X) + +prot_attr = test_X['patt1'] + +output = pd.DataFrame({"Actual_y": test_y.values.ravel(), "Predicted_y": pred_y, "patt1" : prot_attr}) + +privileged_groups = [{'patt1': 1}] +unprivileged_groups = [{'patt1': 0}] + +binaryLabelDataset = BinaryLabelDataset(favorable_label=1, unfavorable_label=0, + df=output, label_names=['Actual_y'], + protected_attribute_names=['patt1']) + +binaryLabelDataset_pred = binaryLabelDataset.copy() +binaryLabelDataset_pred.labels = output['Predicted_y'].values.reshape(-1, 1) + +metric = ClassificationMetric(binaryLabelDataset, binaryLabelDataset_pred, + unprivileged_groups=unprivileged_groups, + privileged_groups=privileged_groups) + +accuracy = metric.accuracy() + +disparate_impact = metric.disparate_impact() + +print('accuracy score:', accuracy) +print('disparate impact ratio', disparate_impact) + +output.to_csv("submission.csv", index=False) + diff --git a/fairnessBench/benchmarks/non_descriptive_ad-di-patt1/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/non_descriptive_ad-di-patt1/scripts/env_read_only_files.txt new file mode 100644 index 0000000..5ee277f --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-di-patt1/scripts/env_read_only_files.txt @@ -0,0 +1,4 @@ +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/non_descriptive_ad-di-patt1/scripts/eval.py b/fairnessBench/benchmarks/non_descriptive_ad-di-patt1/scripts/eval.py new file mode 100644 index 0000000..a79a1c4 --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-di-patt1/scripts/eval.py @@ -0,0 +1,45 @@ +import os +import numpy as np +import pandas as pd +from aif360.datasets import BinaryLabelDataset +from aif360.metrics import ClassificationMetric + +def get_score(agent_files): + submission_path = os.path.join(agent_files, "submission.csv") + submission = pd.read_csv(submission_path) + + privileged_groups = [{'patt1': 1}] + unprivileged_groups = [{'patt1': 0}] + + binaryLabelDataset = BinaryLabelDataset(favorable_label=1, unfavorable_label=0, + df=submission, label_names=['Actual_y'], + protected_attribute_names=['patt1']) + + binaryLabelDataset_pred = binaryLabelDataset.copy() + binaryLabelDataset_pred.labels = submission['Predicted_y'].values.reshape(-1, 1) + + metric = ClassificationMetric(binaryLabelDataset, binaryLabelDataset_pred, + unprivileged_groups=unprivileged_groups, + privileged_groups=privileged_groups) + + accuracy = metric.accuracy() + precision = metric.precision() + recall = metric.true_positive_rate() + + disparate_impact = metric.disparate_impact() + statistical_parity_diff = metric.statistical_parity_difference() + equal_opp_diff = metric.equal_opportunity_difference() + error_rate_diff = 
metric.error_rate_difference() + error_rate_ratio = metric.error_rate_ratio() + false_omission_rate_diff = metric.false_omission_rate_difference() + + performance = {"acc": accuracy, "precision": precision, "recall": recall, "di": disparate_impact, + "statistical_parity_diff": statistical_parity_diff, "equal_opp_diff": equal_opp_diff, + "error_rate_diff": error_rate_diff, "error_rate_ratio": error_rate_ratio, + "false_omission_rate_diff": false_omission_rate_diff} + return performance + + +if __name__ == "__main__": + print(get_score("."))  # assumption: run from the directory that contains submission.csv + \ No newline at end of file diff --git a/fairnessBench/benchmarks/non_descriptive_ad-di-patt1/scripts/prepare.py b/fairnessBench/benchmarks/non_descriptive_ad-di-patt1/scripts/prepare.py new file mode 100644 index 0000000..f798c75 --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-di-patt1/scripts/prepare.py @@ -0,0 +1,49 @@ +# prepared by user when setting up + +import os +import numpy as np +import pandas as pd +from ucimlrepo import fetch_ucirepo +from sklearn.model_selection import train_test_split + +adult = fetch_ucirepo(id=2) + +X = adult.data.features.copy() +y = adult.data.targets.copy() + +y["income"] = y["income"].str.rstrip('.') +y["income"] = y["income"].apply(lambda i: 1 if i == '>50K' else 0) +X["race"] = X["race"].apply(lambda i: 1 if i == 'White' else 0) +X = X.replace('?', np.nan) + +X = X.rename(columns={'age': 'patt2', 'workclass': 'feat1', 'education': 'feat2', 'marital-status': 'patt3', 'occupation': 'feat3', 'relationship': 'patt4', 'race': 'patt1', 'sex': 'patt5', 'capital-gain': 'feat4', 'capital-loss': 'feat5', 'hours-per-week': 'feat6', 'native-country': 'feat7'}) + +y = y.rename(columns={'income': 'target'}) + +categorical_columns = ["feat1", "feat2", "patt3", "feat3", "patt4", "patt5", "feat7"] + +X_enc = pd.get_dummies(X, columns=categorical_columns) +X_enc = X_enc.replace({True: 1, False: 0}) + + +train_X, test_X, train_y, test_y = train_test_split(X_enc, y, test_size=0.2, random_state=1) + +os.chdir("../env/") +train_X.to_csv("train_X.csv") +train_y.to_csv("train_y.csv") +test_X.to_csv("test_X.csv") +test_y.to_csv("test_y.csv") + +# feat1 workclass cat +# feat2 education cat +# patt1 (race) cat +# patt2 (age) num +# patt3 (ms) cat +# patt4 (relationship) cat +# patt5 (sex) cat +# feat3 occupation cat +# feat4 cap gain num +# feat5 cap loss num +# feat6 hours-per-week num +# feat7 native country cat +# target income bin cat \ No newline at end of file diff --git a/fairnessBench/benchmarks/non_descriptive_ad-di-patt1/scripts/read_only_files.txt b/fairnessBench/benchmarks/non_descriptive_ad-di-patt1/scripts/read_only_files.txt new file mode 100644 index 0000000..5ee277f --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-di-patt1/scripts/read_only_files.txt @@ -0,0 +1,4 @@ +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/env/data_description.txt b/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/env/data_description.txt new file mode 100644 index 0000000..068cb06 --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/env/data_description.txt @@ -0,0 +1,125 @@ +patt2: Integer + +feat1 : Categorical feature + Private + Self-emp-not-inc + Self-emp-inc + Federal-gov + Local-gov + State-gov + Without-pay + Never-worked.
+ +feat2 : Categorical + Education Level + Bachelors + Some-college + 11th, HS-grad + Prof-school + Assoc-acdm + Assoc-voc + 9th + 7th-8th + 12th + Masters + 1st-4th + 10th + Doctorate + 5th-6th + Preschool + +patt3 : Categorical + Married-civ-spouse, + Divorced, + Never-married, + Separated, + Widowed, + Married-spouse-absent, + Married-AF-spouse + +feat3 : Categorical + Tech-support, + Craft-repair, + Other-service, + Sales, + Exec-managerial, + Prof-specialty, + Handlers-cleaners, + Machine-op-inspct, + Adm-clerical, + Farming-fishing, + Transport-moving, + Priv-house-serv, + Protective-serv, + Armed-Forces + +patt4 : Categorical + Wife, + Own-child, + Husband, + Not-in-family, + Other-relative, + Unmarried + +patt1 : Categorical + White, + Asian-Pac-Islander, + Amer-Indian-Eskimo, + Other, + Black + +patt5 : Binary + Female + Male + +feat4 : Integer + +feat5 : Integer + +feat6 : Integer + +feat7 : Categorical + United-States, + Cambodia, + England, + Puerto-Rico, + Canada, + Germany, + Outlying-US(Guam-USVI-etc), + India, + Japan, + Greece, + South, + China, + Cuba, + Iran, + Honduras, + Philippines, + Italy, + Poland, + Jamaica, + Vietnam, + Mexico, + Portugal, + Ireland, + France, + Dominican-Republic, + Laos, Ecuador, + Taiwan, + Haiti, + Columbia, + Hungary, + Guatemala, + Nicaragua, + Scotland, + Thailand, + Yugoslavia, + El-Salvador, + Trinadad&Tobago, + Peru, + Hong, + Holand-Netherlands + +target : binary target + <=50K + >50K. \ No newline at end of file diff --git a/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/env/train.py b/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/env/train.py new file mode 100644 index 0000000..8d868a3 --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/env/train.py @@ -0,0 +1,44 @@ +import numpy as np +import pandas as pd +from aif360.datasets import BinaryLabelDataset +from aif360.metrics import ClassificationMetric +from sklearn.linear_model import LogisticRegression + + +train_X = pd.read_csv("train_X.csv", index_col=0) +train_y = pd.read_csv("train_y.csv", index_col=0) +test_X = pd.read_csv("test_X.csv", index_col=0) +test_y = pd.read_csv("test_y.csv", index_col=0) + +lr = LogisticRegression(max_iter=1000) +lr.fit(train_X, train_y) + +pred_y = lr.predict(test_X) + +prot_attr = test_X['patt5'] + +output = pd.DataFrame({"Actual_y": test_y.values.ravel(), "Predicted_y": pred_y, "patt5" : prot_attr}) + +privileged_groups = [{'patt5': 1}] +unprivileged_groups = [{'patt5': 0}] + +binaryLabelDataset = BinaryLabelDataset(favorable_label=1, unfavorable_label=0, + df=output, label_names=['Actual_y'], + protected_attribute_names=['patt5']) + +binaryLabelDataset_pred = binaryLabelDataset.copy() +binaryLabelDataset_pred.labels = output['Predicted_y'].values.reshape(-1, 1) + +metric = ClassificationMetric(binaryLabelDataset, binaryLabelDataset_pred, + unprivileged_groups=unprivileged_groups, + privileged_groups=privileged_groups) + +accuracy = metric.accuracy() + +disparate_impact = metric.disparate_impact() + +print('accuracy score:', accuracy) +print('disparate impact ratio', disparate_impact) + +output.to_csv("submission.csv", index=False) + diff --git a/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/scripts/env_read_only_files.txt new file mode 100644 index 0000000..5ee277f --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/scripts/env_read_only_files.txt @@ -0,0 +1,4 @@ +./train_X.csv +./train_y.csv 
+./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/scripts/eval.py b/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/scripts/eval.py new file mode 100644 index 0000000..347f335 --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/scripts/eval.py @@ -0,0 +1,45 @@ +import os +import numpy as np +import pandas as pd +from aif360.datasets import BinaryLabelDataset +from aif360.metrics import ClassificationMetric + +def get_score(agent_files): + submission_path = os.path.join(agent_files, "submission.csv") + submission = pd.read_csv(submission_path) + + privileged_groups = [{'patt5': 1}] + unprivileged_groups = [{'patt5': 0}] + + binaryLabelDataset = BinaryLabelDataset(favorable_label=1, unfavorable_label=0, + df=submission, label_names=['Actual_y'], + protected_attribute_names=['patt5']) + + binaryLabelDataset_pred = binaryLabelDataset.copy() + binaryLabelDataset_pred.labels = submission['Predicted_y'].values.reshape(-1, 1) + + metric = ClassificationMetric(binaryLabelDataset, binaryLabelDataset_pred, + unprivileged_groups=unprivileged_groups, + privileged_groups=privileged_groups) + + accuracy = metric.accuracy() + precision = metric.precision() + recall = metric.true_positive_rate() + + disparate_impact = metric.disparate_impact() + statistical_parity_diff = metric.statistical_parity_difference() + equal_opp_diff = metric.equal_opportunity_difference() + error_rate_diff = metric.error_rate_difference() + error_rate_ratio = metric.error_rate_ratio() + false_omission_rate_diff = metric.false_omission_rate_difference() + + performance = {"acc": accuracy, "precision": precision, "recall": recall, "di": disparate_impact, + "statistical_parity_diff": statistical_parity_diff, "equal_opp_diff": equal_opp_diff, + "error_rate_diff": error_rate_diff, "error_rate_ratio": error_rate_ratio, + "false_omission_rate_diff": false_omission_rate_diff} + return performance + + +if __name__ == "__main__": + print(get_score("."))  # assumption: run from the directory that contains submission.csv + \ No newline at end of file diff --git a/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/scripts/prepare.py b/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/scripts/prepare.py new file mode 100644 index 0000000..1a3c316 --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/scripts/prepare.py @@ -0,0 +1,49 @@ +# prepared by user when setting up + +import os +import numpy as np +import pandas as pd +from ucimlrepo import fetch_ucirepo +from sklearn.model_selection import train_test_split + +adult = fetch_ucirepo(id=2) + +X = adult.data.features.copy() +y = adult.data.targets.copy() + +y["income"] = y["income"].str.rstrip('.') +y["income"] = y["income"].apply(lambda i: 1 if i == '>50K' else 0) +X["sex"] = X["sex"].apply(lambda i: 1 if i == 'Male' else 0) +X = X.replace('?', np.nan) + +X = X.rename(columns={'age': 'patt2', 'workclass': 'feat1', 'education': 'feat2', 'marital-status': 'patt3', 'occupation': 'feat3', 'relationship': 'patt4', 'race': 'patt1', 'sex': 'patt5', 'capital-gain': 'feat4', 'capital-loss': 'feat5', 'hours-per-week': 'feat6', 'native-country': 'feat7'}) + +y = y.rename(columns={'income': 'target'}) + +categorical_columns = ["feat1", "feat2", "patt3", "feat3", "patt4", "patt1", "feat7"] + +X_enc = pd.get_dummies(X, columns=categorical_columns) +X_enc = X_enc.replace({True: 1, False: 0}) + + +train_X, test_X, train_y, test_y = train_test_split(X_enc, y, test_size=0.2, random_state=1) + +os.chdir("../env/") +train_X.to_csv("train_X.csv") +train_y.to_csv("train_y.csv")
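+# note: to_csv keeps pandas' default index=True here, which is what lets the env train.py read these files back with index_col=0.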
+test_X.to_csv("test_X.csv") +test_y.to_csv("test_y.csv") + +# feat1 workclass cat +# feat2 education cat +# patt1 (race) cat +# patt2 (age) num +# patt3 (ms) cat +# patt4 (relationship) cat +# patt5 (sex) cat +# feat3 occupation cat +# feat4 cap gain num +# feat5 cap loss num +# feat6 hours-per-week num +# feat7 native country cat +# target income bin cat \ No newline at end of file diff --git a/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/scripts/read_only_files.txt b/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/scripts/read_only_files.txt new file mode 100644 index 0000000..5ee277f --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-di-patt5/scripts/read_only_files.txt @@ -0,0 +1,4 @@ +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/env/data_description.txt b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/env/data_description.txt new file mode 100644 index 0000000..068cb06 --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/env/data_description.txt @@ -0,0 +1,125 @@ +patt2: Integer + +feat1 : Categorical feature + Private + Self-emp-not-inc + Self-emp-inc + Federal-gov + Local-gov + State-gov + Without-pay + Never-worked. + +feat2 : Categorical + Education Level + Bachelors + Some-college + 11th, HS-grad + Prof-school + Assoc-acdm + Assoc-voc + 9th + 7th-8th + 12th + Masters + 1st-4th + 10th + Doctorate + 5th-6th + Preschool + +patt3 : Categorical + Married-civ-spouse, + Divorced, + Never-married, + Separated, + Widowed, + Married-spouse-absent, + Married-AF-spouse + +feat3 : Categorical + Tech-support, + Craft-repair, + Other-service, + Sales, + Exec-managerial, + Prof-specialty, + Handlers-cleaners, + Machine-op-inspct, + Adm-clerical, + Farming-fishing, + Transport-moving, + Priv-house-serv, + Protective-serv, + Armed-Forces + +patt4 : Categorical + Wife, + Own-child, + Husband, + Not-in-family, + Other-relative, + Unmarried + +patt1 : Categorical + White, + Asian-Pac-Islander, + Amer-Indian-Eskimo, + Other, + Black + +patt5 : Binary + Female + Male + +feat4 : Integer + +feat5 : Integer + +feat6 : Integer + +feat7 : Categorical + United-States, + Cambodia, + England, + Puerto-Rico, + Canada, + Germany, + Outlying-US(Guam-USVI-etc), + India, + Japan, + Greece, + South, + China, + Cuba, + Iran, + Honduras, + Philippines, + Italy, + Poland, + Jamaica, + Vietnam, + Mexico, + Portugal, + Ireland, + France, + Dominican-Republic, + Laos, Ecuador, + Taiwan, + Haiti, + Columbia, + Hungary, + Guatemala, + Nicaragua, + Scotland, + Thailand, + Yugoslavia, + El-Salvador, + Trinadad&Tobago, + Peru, + Hong, + Holand-Netherlands + +target : binary target + <=50K + >50K. 
\ No newline at end of file diff --git a/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/env/train.py b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/env/train.py new file mode 100644 index 0000000..f37344a --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/env/train.py @@ -0,0 +1,44 @@ +import numpy as np +import pandas as pd +from aif360.datasets import BinaryLabelDataset +from aif360.metrics import ClassificationMetric +from sklearn.linear_model import LogisticRegression + + +train_X = pd.read_csv("train_X.csv", index_col=0) +train_y = pd.read_csv("train_y.csv", index_col=0) +test_X = pd.read_csv("test_X.csv", index_col=0) +test_y = pd.read_csv("test_y.csv", index_col=0) + +lr = LogisticRegression(max_iter=1000) +lr.fit(train_X, train_y) + +pred_y = lr.predict(test_X) + +prot_attr = test_X['patt1'] + +output = pd.DataFrame({"Actual_y": test_y.values.ravel(), "Predicted_y": pred_y, "patt1" : prot_attr}) + +privileged_groups = [{'patt1': 1}] +unprivileged_groups = [{'patt1': 0}] + +binaryLabelDataset = BinaryLabelDataset(favorable_label=1, unfavorable_label=0, + df=output, label_names=['Actual_y'], + protected_attribute_names=['patt1']) + +binaryLabelDataset_pred = binaryLabelDataset.copy() +binaryLabelDataset_pred.labels = output['Predicted_y'].values.reshape(-1, 1) + +metric = ClassificationMetric(binaryLabelDataset, binaryLabelDataset_pred, + unprivileged_groups=unprivileged_groups, + privileged_groups=privileged_groups) + +accuracy = metric.accuracy() + +equal_opp_diff = metric.equal_opportunity_difference() + +print('accuracy score:', accuracy) +print('equal opportunity difference', equal_opp_diff) + +output.to_csv("submission.csv", index=False) + diff --git a/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/scripts/env_read_only_files.txt new file mode 100644 index 0000000..5ee277f --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/scripts/env_read_only_files.txt @@ -0,0 +1,4 @@ +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/scripts/eval.py b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/scripts/eval.py new file mode 100644 index 0000000..a79a1c4 --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/scripts/eval.py @@ -0,0 +1,45 @@ +import os +import numpy as np +import pandas as pd +from aif360.datasets import BinaryLabelDataset +from aif360.metrics import ClassificationMetric + +def get_score(agent_files): + submission_path = os.path.join(agent_files, "submission.csv") + submission = pd.read_csv(submission_path) + + privileged_groups = [{'patt1': 1}] + unprivileged_groups = [{'patt1': 0}] + + binaryLabelDataset = BinaryLabelDataset(favorable_label=1, unfavorable_label=0, + df=submission, label_names=['Actual_y'], + protected_attribute_names=['patt1']) + + binaryLabelDataset_pred = binaryLabelDataset.copy() + binaryLabelDataset_pred.labels = submission['Predicted_y'].values.reshape(-1, 1) + + metric = ClassificationMetric(binaryLabelDataset, binaryLabelDataset_pred, + unprivileged_groups=unprivileged_groups, + privileged_groups=privileged_groups) + + accuracy = metric.accuracy() + precision = metric.precision() + recall = metric.true_positive_rate() + + disparate_impact = metric.disparate_impact() + statistical_parity_diff = metric.statistical_parity_difference() + equal_opp_diff = metric.equal_opportunity_difference() + 
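# aif360's equal_opportunity_difference() is the true-positive-rate gap TPR(unprivileged, patt1=0) - TPR(privileged, patt1=1); +    # 0.0 means equal opportunity is met, while a negative value means the unprivileged group is under-served. + 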
error_rate_diff = metric.error_rate_difference() + error_rate_ratio = metric.error_rate_ratio() + false_omission_rate_diff = metric.false_omission_rate_difference() + + performance = {"acc": accuracy, "precision": precision, "recall": recall, "di": disparate_impact, + "statistical_parity_diff": statistical_parity_diff, "equal_opp_diff": equal_opp_diff, + "error_rate_diff": error_rate_diff, "error_rate_ratio": error_rate_ratio, + "false_omission_rate_diff": false_omission_rate_diff} + return performance + + +if __name__ == "__main__": + print(get_score("."))  # assumption: run from the directory that contains submission.csv + \ No newline at end of file diff --git a/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/scripts/prepare.py b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/scripts/prepare.py new file mode 100644 index 0000000..f798c75 --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/scripts/prepare.py @@ -0,0 +1,49 @@ +# prepared by user when setting up + +import os +import numpy as np +import pandas as pd +from ucimlrepo import fetch_ucirepo +from sklearn.model_selection import train_test_split + +adult = fetch_ucirepo(id=2) + +X = adult.data.features.copy() +y = adult.data.targets.copy() + +y["income"] = y["income"].str.rstrip('.') +y["income"] = y["income"].apply(lambda i: 1 if i == '>50K' else 0) +X["race"] = X["race"].apply(lambda i: 1 if i == 'White' else 0) +X = X.replace('?', np.nan) + +X = X.rename(columns={'age': 'patt2', 'workclass': 'feat1', 'education': 'feat2', 'marital-status': 'patt3', 'occupation': 'feat3', 'relationship': 'patt4', 'race': 'patt1', 'sex': 'patt5', 'capital-gain': 'feat4', 'capital-loss': 'feat5', 'hours-per-week': 'feat6', 'native-country': 'feat7'}) + +y = y.rename(columns={'income': 'target'}) + +categorical_columns = ["feat1", "feat2", "patt3", "feat3", "patt4", "patt5", "feat7"] + +X_enc = pd.get_dummies(X, columns=categorical_columns) +X_enc = X_enc.replace({True: 1, False: 0}) + + +train_X, test_X, train_y, test_y = train_test_split(X_enc, y, test_size=0.2, random_state=1) + +os.chdir("../env/") +train_X.to_csv("train_X.csv") +train_y.to_csv("train_y.csv") +test_X.to_csv("test_X.csv") +test_y.to_csv("test_y.csv") + +# feat1 workclass cat +# feat2 education cat +# patt1 (race) cat +# patt2 (age) num +# patt3 (ms) cat +# patt4 (relationship) cat +# patt5 (sex) cat +# feat3 occupation cat +# feat4 cap gain num +# feat5 cap loss num +# feat6 hours-per-week num +# feat7 native country cat +# target income bin cat \ No newline at end of file diff --git a/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/scripts/read_only_files.txt b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/scripts/read_only_files.txt new file mode 100644 index 0000000..5ee277f --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt1/scripts/read_only_files.txt @@ -0,0 +1,4 @@ +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/env/data_description.txt b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/env/data_description.txt new file mode 100644 index 0000000..068cb06 --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/env/data_description.txt @@ -0,0 +1,125 @@ +patt2: Integer + +feat1 : Categorical feature + Private + Self-emp-not-inc + Self-emp-inc + Federal-gov + Local-gov + State-gov + Without-pay + Never-worked.
+ +feat2 : Categorical + Education Level + Bachelors + Some-college + 11th, HS-grad + Prof-school + Assoc-acdm + Assoc-voc + 9th + 7th-8th + 12th + Masters + 1st-4th + 10th + Doctorate + 5th-6th + Preschool + +patt3 : Categorical + Married-civ-spouse, + Divorced, + Never-married, + Separated, + Widowed, + Married-spouse-absent, + Married-AF-spouse + +feat3 : Categorical + Tech-support, + Craft-repair, + Other-service, + Sales, + Exec-managerial, + Prof-specialty, + Handlers-cleaners, + Machine-op-inspct, + Adm-clerical, + Farming-fishing, + Transport-moving, + Priv-house-serv, + Protective-serv, + Armed-Forces + +patt4 : Categorical + Wife, + Own-child, + Husband, + Not-in-family, + Other-relative, + Unmarried + +patt1 : Categorical + White, + Asian-Pac-Islander, + Amer-Indian-Eskimo, + Other, + Black + +patt5 : Binary + Female + Male + +feat4 : Integer + +feat5 : Integer + +feat6 : Integer + +feat7 : Categorical + United-States, + Cambodia, + England, + Puerto-Rico, + Canada, + Germany, + Outlying-US(Guam-USVI-etc), + India, + Japan, + Greece, + South, + China, + Cuba, + Iran, + Honduras, + Philippines, + Italy, + Poland, + Jamaica, + Vietnam, + Mexico, + Portugal, + Ireland, + France, + Dominican-Republic, + Laos, Ecuador, + Taiwan, + Haiti, + Columbia, + Hungary, + Guatemala, + Nicaragua, + Scotland, + Thailand, + Yugoslavia, + El-Salvador, + Trinadad&Tobago, + Peru, + Hong, + Holand-Netherlands + +target : binary target + <=50K + >50K. \ No newline at end of file diff --git a/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/env/train.py b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/env/train.py new file mode 100644 index 0000000..c475d01 --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/env/train.py @@ -0,0 +1,44 @@ +import numpy as np +import pandas as pd +from aif360.datasets import BinaryLabelDataset +from aif360.metrics import ClassificationMetric +from sklearn.linear_model import LogisticRegression + + +train_X = pd.read_csv("train_X.csv", index_col=0) +train_y = pd.read_csv("train_y.csv", index_col=0) +test_X = pd.read_csv("test_X.csv", index_col=0) +test_y = pd.read_csv("test_y.csv", index_col=0) + +lr = LogisticRegression(max_iter=1000) +lr.fit(train_X, train_y) + +pred_y = lr.predict(test_X) + +prot_attr = test_X['patt5'] + +output = pd.DataFrame({"Actual_y": test_y.values.ravel(), "Predicted_y": pred_y, "patt5" : prot_attr}) + +privileged_groups = [{'patt5': 1}] +unprivileged_groups = [{'patt5': 0}] + +binaryLabelDataset = BinaryLabelDataset(favorable_label=1, unfavorable_label=0, + df=output, label_names=['Actual_y'], + protected_attribute_names=['patt5']) + +binaryLabelDataset_pred = binaryLabelDataset.copy() +binaryLabelDataset_pred.labels = output['Predicted_y'].values.reshape(-1, 1) + +metric = ClassificationMetric(binaryLabelDataset, binaryLabelDataset_pred, + unprivileged_groups=unprivileged_groups, + privileged_groups=privileged_groups) + +accuracy = metric.accuracy() + +equal_opp_diff = metric.equal_opportunity_difference() + +print('accuracy score:', accuracy) +print('equal opportunity difference', equal_opp_diff) + +output.to_csv("submission.csv", index=False) + diff --git a/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/scripts/env_read_only_files.txt new file mode 100644 index 0000000..5ee277f --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/scripts/env_read_only_files.txt @@ -0,0 +1,4 @@ +./train_X.csv 
+./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/scripts/eval.py b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/scripts/eval.py new file mode 100644 index 0000000..347f335 --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/scripts/eval.py @@ -0,0 +1,45 @@ +import os +import numpy as np +import pandas as pd +from aif360.datasets import BinaryLabelDataset +from aif360.metrics import ClassificationMetric + +def get_score(agent_files): + submission_path = os.path.join(agent_files, "submission.csv") + submission = pd.read_csv(submission_path) + + privileged_groups = [{'patt5': 1}] + unprivileged_groups = [{'patt5': 0}] + + binaryLabelDataset = BinaryLabelDataset(favorable_label=1, unfavorable_label=0, + df=submission, label_names=['Actual_y'], + protected_attribute_names=['patt5']) + + binaryLabelDataset_pred = binaryLabelDataset.copy() + binaryLabelDataset_pred.labels = submission['Predicted_y'].values.reshape(-1, 1) + + metric = ClassificationMetric(binaryLabelDataset, binaryLabelDataset_pred, + unprivileged_groups=unprivileged_groups, + privileged_groups=privileged_groups) + + accuracy = metric.accuracy() + precision = metric.precision() + recall = metric.true_positive_rate() + + disparate_impact = metric.disparate_impact() + statistical_parity_diff = metric.statistical_parity_difference() + equal_opp_diff = metric.equal_opportunity_difference() + error_rate_diff = metric.error_rate_difference() + error_rate_ratio = metric.error_rate_ratio() + false_omission_rate_diff = metric.false_omission_rate_difference() + + performance = {"acc": accuracy, "precision": precision, "recall": recall, "di": disparate_impact, + "statistical_parity_diff": statistical_parity_diff, "equal_opp_diff": equal_opp_diff, + "error_rate_diff": error_rate_diff, "error_rate_ratio": error_rate_ratio, + "false_omission_rate_diff": false_omission_rate_diff} + return performance + + +if __name__ == "__main__": + print(get_score("."))  # assumption: run from the directory that contains submission.csv + \ No newline at end of file diff --git a/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/scripts/prepare.py b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/scripts/prepare.py new file mode 100644 index 0000000..1a3c316 --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/scripts/prepare.py @@ -0,0 +1,49 @@ +# prepared by user when setting up + +import os +import numpy as np +import pandas as pd +from ucimlrepo import fetch_ucirepo +from sklearn.model_selection import train_test_split + +adult = fetch_ucirepo(id=2) + +X = adult.data.features.copy() +y = adult.data.targets.copy() + +y["income"] = y["income"].str.rstrip('.') +y["income"] = y["income"].apply(lambda i: 1 if i == '>50K' else 0) +X["sex"] = X["sex"].apply(lambda i: 1 if i == 'Male' else 0) +X = X.replace('?', np.nan) + +X = X.rename(columns={'age': 'patt2', 'workclass': 'feat1', 'education': 'feat2', 'marital-status': 'patt3', 'occupation': 'feat3', 'relationship': 'patt4', 'race': 'patt1', 'sex': 'patt5', 'capital-gain': 'feat4', 'capital-loss': 'feat5', 'hours-per-week': 'feat6', 'native-country': 'feat7'}) + +y = y.rename(columns={'income': 'target'}) + +categorical_columns = ["feat1", "feat2", "patt3", "feat3", "patt4", "patt1", "feat7"] + +X_enc = pd.get_dummies(X, columns=categorical_columns) +X_enc = X_enc.replace({True: 1, False: 0}) + + +train_X, test_X, train_y, test_y = train_test_split(X_enc, y, test_size=0.2, random_state=1) + +os.chdir("../env/") +train_X.to_csv("train_X.csv")
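+# note: because of os.chdir("../env/") above, all four CSVs land in the benchmark's env/ folder, next to train.py.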
+train_y.to_csv("train_y.csv") +test_X.to_csv("test_X.csv") +test_y.to_csv("test_y.csv") + +# feat1 workclass cat +# feat2 education cat +# patt1 (race) cat +# patt2 (age) num +# patt3 (ms) cat +# patt4 (relationship) cat +# patt5 (sex) cat +# feat3 occupation cat +# feat4 cap gain num +# feat5 cap loss num +# feat6 hours-per-week num +# feat7 native country cat +# target income bin cat \ No newline at end of file diff --git a/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/scripts/read_only_files.txt b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/scripts/read_only_files.txt new file mode 100644 index 0000000..5ee277f --- /dev/null +++ b/fairnessBench/benchmarks/non_descriptive_ad-eod-patt5/scripts/read_only_files.txt @@ -0,0 +1,4 @@ +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/randoadult-di-race/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/randoadult-di-race/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/randoadult-di-race/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/randoadult-di-race/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/randoadult-di-sex/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/randoadult-di-sex/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/randoadult-di-sex/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/randoadult-di-sex/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/randoadult-eod-race/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/randoadult-eod-race/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/randoadult-eod-race/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/randoadult-eod-race/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/randoadult-eod-sex/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/randoadult-eod-sex/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/randoadult-eod-sex/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/randoadult-eod-sex/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/sampadult-di-race/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/sampadult-di-race/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/sampadult-di-race/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/sampadult-di-race/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/sampadult-di-sex/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/sampadult-di-sex/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/sampadult-di-sex/scripts/env_read_only_files.txt +++ 
b/fairnessBench/benchmarks/sampadult-di-sex/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/sampadult-eod-race/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/sampadult-eod-race/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/sampadult-eod-race/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/sampadult-eod-race/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/sampadult-eod-sex/scripts/env_read_only_files.txt b/fairnessBench/benchmarks/sampadult-eod-sex/scripts/env_read_only_files.txt index ecc89d9..5ee277f 100644 --- a/fairnessBench/benchmarks/sampadult-eod-sex/scripts/env_read_only_files.txt +++ b/fairnessBench/benchmarks/sampadult-eod-sex/scripts/env_read_only_files.txt @@ -1,4 +1,4 @@ -train_X.csv -train_y.csv -test_X.csv -test_y.csv +./train_X.csv +./train_y.csv +./test_X.csv +./test_y.csv diff --git a/fairnessBench/benchmarks/tasks.json b/fairnessBench/benchmarks/tasks.json index 45ef804..06c4026 100644 --- a/fairnessBench/benchmarks/tasks.json +++ b/fairnessBench/benchmarks/tasks.json @@ -412,8 +412,188 @@ "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a best performing model for the synthetic adult data. Your goal is to Iterate over different models and maximize predictive performance. Use execute script action to verify if the goal is achieved. Do not inspect any CSV files directly.", "benchmark_folder_name": "sampadult-eod-sex" }, - "dollarstreet_income": { + "dollarstreet_finetunebalance": { "research_problem": "Go through the data_description.txt file to understand the dataset and its features, then summarize key points in your research logs to track your actions. Update the provided train.py script to fine-tune a pretrained image classifier with a focus on improving fairness across income levels. You may experiment with freezing/unfreezing strategies, learning rates, adjusting model architecture and fairness-oriented preprocessing or optimizing loss functions to achieve this goal. Do not inspect any data (csv or image) files directly. Use the execute script action to evaluate whether the modifications improve fairness outcomes.", "benchmark_folder_name": "dollarstreet-income" + }, + "health_di_balance-race": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model that best balances accuracy and fairness with respect to disparate impact across race. Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and disparate impact. Do not inspect any CSV files directly. Use the execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "health-di-race" + }, + "health_di_balance-sex": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. 
Then, update the provided train.py script to train a model that best balances accuracy and fairness with respect to disparate impact across sex. Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and disparate impact. Do not inspect any CSV files directly. Use the execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "health-di-sex" + }, + "health_di_best-race": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model with a focus on making the model fair with respect to disparate impact. Iterate over different models, feature selection techniques, or fairness-oriented preprocessing steps to improve the disparate impact with respect to race. Do not inspect any CSV files directly. Use execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "health-di-race" + }, + "health_di_best-sex": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model with a focus on making the model fair with respect to disparate impact. Iterate over different models, feature selection techniques, or fairness-oriented preprocessing steps to improve the disparate impact with respect to sex. Do not inspect any CSV files directly. Use execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "health-di-sex" + }, + "health_di_target10-race": { + "research_problem": "Review the data_description.txt file to gain an understanding of the data and its features. Make a note of key points and your progress in the research logs to maintain traceability. Next, modify the existing train.py script to train a model that aims to improve disparate impact with respect to race by 10% in the health data. Explore and experiment with different models, feature selection strategies, or fairness-aware preprocessing techniques to achieve at least a 10% improvement in disparate impact. After making your modifications, use execute script action to verify if the goal is achieved. Do not inspect any CSV files directly.", + "benchmark_folder_name": "health-di-race" + }, + "health_di_target10-sex": { + "research_problem": "Review the data_description.txt file to gain an understanding of the data and its features. Make a note of key points and your progress in the research logs to maintain traceability. Next, modify the existing train.py script to train a model that aims to improve disparate impact with respect to sex by 10% in the health data. Explore and experiment with different models, feature selection strategies, or fairness-aware preprocessing techniques to achieve at least a 10% improvement in disparate impact. After making your modifications, use execute script action to verify if the goal is achieved. Do not inspect any CSV files directly.", + "benchmark_folder_name": "health-di-sex" + }, + "health_di_implicit-race": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. 
Then, update the provided train.py script to train a best-performing model for the health data. Your goal is to iterate over different models and maximize predictive performance. Use execute script action to verify if the goal is achieved. Do not inspect any CSV files directly.", + "benchmark_folder_name": "health-di-race" + }, + "health_di_implicit-sex": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a best-performing model for the health data. Your goal is to iterate over different models and maximize predictive performance. Use execute script action to verify if the goal is achieved. Do not inspect any CSV files directly.", + "benchmark_folder_name": "health-di-sex" + }, + "health_eod_best-sex": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model with a focus on making the model fair with respect to equal opportunity difference. Iterate over different models, feature selection techniques, or fairness-oriented preprocessing steps to improve the equal opportunity difference with respect to sex. Do not inspect any CSV files directly. Use execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "health-eod-sex" + }, + "health_eod_best-race": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model with a focus on making the model fair with respect to equal opportunity difference. Iterate over different models, feature selection techniques, or fairness-oriented preprocessing steps to improve the equal opportunity difference with respect to race. Do not inspect any CSV files directly. Use execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "health-eod-race" + }, + "health_eod_balance-sex": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model that best balances accuracy and fairness with respect to equal opportunity difference across sex. Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and equal opportunity difference. Do not inspect any CSV files directly. Use the execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "health-eod-sex" + }, + "health_eod_balance-race": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model that best balances accuracy and fairness with respect to equal opportunity difference across race. Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and equal opportunity difference. Do not inspect any CSV files directly. 
Use the execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "health-eod-race" + }, + "health_eod_target10-sex": { + "research_problem": "Review the data_description.txt file to gain an understanding of the data and its features. Make a note of key points and your progress in the research logs to maintain traceability. Next, modify the existing train.py script to train a model that aims to improve equal opportunity difference with respect to sex by 10% in the health data. Explore and experiment with different models, feature selection strategies, or fairness-aware preprocessing techniques to achieve at least a 10% improvement in equal opportunity difference. After making your modifications, use execute script action to verify if the goal is achieved. Do not inspect any CSV files directly.", + "benchmark_folder_name": "health-eod-sex" + }, + "health_eod_target10-race": { + "research_problem": "Review the data_description.txt file to gain an understanding of the data and its features. Make a note of key points and your progress in the research logs to maintain traceability. Next, modify the existing train.py script to train a model that aims to improve equal opportunity difference with respect to race by 10% in the health data. Explore and experiment with different models, feature selection strategies, or fairness-aware preprocessing techniques to achieve at least a 10% improvement in equal opportunity difference. After making your modifications, use execute script action to verify if the goal is achieved. Do not inspect any CSV files directly.", + "benchmark_folder_name": "health-eod-race" + }, + "health_eod_implicit-sex": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a best-performing model for the health data. Your goal is to iterate over different models and maximize predictive performance. Use execute script action to verify if the goal is achieved. Do not inspect any CSV files directly.", + "benchmark_folder_name": "health-eod-sex" + }, + "health_eod_implicit-race": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a best-performing model for the health data. Your goal is to iterate over different models and maximize predictive performance. Use execute script action to verify if the goal is achieved. Do not inspect any CSV files directly.", + "benchmark_folder_name": "health-eod-race" + }, + "nondescriptive_di_balance-patt1": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model that best balances accuracy and fairness with respect to disparate impact across patt1. Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and disparate impact. Do not inspect any CSV files directly. 
Use the execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "non_descriptive_ad-di-patt1" + }, + "nondescriptive_di_balance-patt5": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model that best balances accuracy and fairness with respect to disparate impact across patt5. Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and disparate impact. Do not inspect any CSV files directly. Use the execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "non_descriptive_ad-di-patt5" + }, + "nondescriptive_di_best-patt1": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model with a focus on making the model fair with respect to disparate impact. Iterate over different models, feature selection techniques, or fairness-oriented preprocessing steps to improve the disparate impact with respect to patt1. Do not inspect any CSV files directly. Use execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "non_descriptive_ad-di-patt1" + }, + "nondescriptive_di_best-patt5": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model with a focus on making the model fair with respect to disparate impact. Iterate over different models, feature selection techniques, or fairness-oriented preprocessing steps to improve the disparate impact with respect to patt5. Do not inspect any CSV files directly. Use execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "non_descriptive_ad-di-patt5" + }, + "nondescriptive_di_target10-patt1": { + "research_problem": "Review the data_description.txt file to gain an understanding of the data and its features. Make a note of key points and your progress in the research logs to maintain traceability. Next, modify the existing train.py script to train a model that aims to improve disparate impact with respect to patt1 by 10% in the non-descriptive data. Explore and experiment with different models, feature selection strategies, or fairness-aware preprocessing techniques to achieve at least a 10% improvement in disparate impact. After making your modifications, use execute script action to verify if the goal is achieved. Do not inspect any CSV files directly.", + "benchmark_folder_name": "non_descriptive_ad-di-patt1" + }, + "nondescriptive_di_target10-patt5": { + "research_problem": "Review the data_description.txt file to gain an understanding of the data and its features. Make a note of key points and your progress in the research logs to maintain traceability. Next, modify the existing train.py script to train a model that aims to improve disparate impact with respect to patt5 by 10% in the non-descriptive data. Explore and experiment with different models, feature selection strategies, or fairness-aware preprocessing techniques to achieve at least a 10% improvement in disparate impact. 
Avoid reading CSV files directly. After making your modifications, use execute script action to verify if the goal is achieved. Do not inspect any CSV files directly.", + "benchmark_folder_name": "non_descriptive_ad-di-patt5" + }, + "nondescriptive_di_implicit-patt1": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a best performing model for the non-descriptive data. Your goal is to Iterate over different models and maximize predictive performance. use execute script action to verify if the goal is achieved. Do not inspect any CSV files directly.", + "benchmark_folder_name": "non_descriptive_ad-di-patt1" + }, + "nondescriptive_di_implicit-patt5": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a best performing model for the non-descriptive data. Your goal is to Iterate over different models and maximize predictive performance. use execute script action to verify if the goal is achieved. Do not inspect any CSV files directly.", + "benchmark_folder_name": "non_descriptive_ad-di-patt5" + }, + "nondescriptive_eod_best-patt5": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model with a focus on making a model fair with respect to equal opportunity difference. Iterate over different models, feature selection techniques, or fairness-oriented preprocessing steps to improve the equal opportunity difference with respect to patt5. Do not inspect any CSV files directly. Use execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "non_descriptive_ad-eod-patt5" + }, + "nondescriptive_eod_best-patt1": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model with a focus on making the model fair with respect to equal opportunity difference. Iterate over different models, feature selection techniques, or fairness-oriented preprocessing steps to improve the equal opportunity difference with respect to patt1. Do not inspect any CSV files directly. Use execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "non_descriptive_ad-eod-patt1" + }, + "nondescriptive_eod_balance-patt5": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model that best balances accuracy and fairness with respect to equal opportunity difference across patt5. Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and equal opportunity difference. Do not inspect any CSV files directly. 
Use the execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "non_descriptive_ad-eod-patt5" + }, + "nondescriptive_eod_balance-patt1": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model that best balances accuracy and fairness with respect to equal opportunity difference across patt1. Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and equal opportunity difference. Do not inspect any CSV files directly. Use the execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "non_descriptive_ad-eod-patt1" + }, + "nondescriptive_eod_target10-patt5": { + "research_problem": "Review the data_description.txt file to gain an understanding of the data and its features. Make a note of key points and your progress in the research logs to maintain traceability. Next, modify the existing train.py script to train a model that aims to improve equal opportunity difference with respect to patt5 by 10% in non-descriptive data. Explore and experiment with different models, feature selection strategies, or fairness-aware preprocessing techniques to achieve at least a 10% improvement in equal opportunity difference. Avoid reading CSV files directly. After making your modifications, use execute script action to verify if the goal is achieved. Do not inspect any CSV files directly.", + "benchmark_folder_name": "non_descriptive_ad-eod-patt5" + }, + "nondescriptive_eod_target10-patt1": { + "research_problem": "Review the data_description.txt file to gain an understanding of the data and its features. Make a note of key points and your progress in the research logs to maintain traceability. Next, modify the existing train.py script to train a model that aims to improve equal opportunity difference with respect to patt1 by 10% in non-descriptive data. Explore and experiment with different models, feature selection strategies, or fairness-aware preprocessing techniques to achieve at least a 10% improvement in equal opportunity difference. Avoid reading CSV files directly. After making your modifications, use execute script action to verify if the goal is achieved. Do not inspect any CSV files directly.", + "benchmark_folder_name": "non_descriptive_ad-eod-patt1" + }, + "nondescriptive_eod_implicit-patt5": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a best performing model for the non-descriptive data. Your goal is to Iterate over different models and maximize predictive performance. Use execute script action to verify if the goal is achieved. Do not inspect any CSV files directly.", + "benchmark_folder_name": "non_descriptive_ad-eod-patt5" + }, + "nondescriptive_eod_implicit-patt1": { + "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a best performing model for the non-descriptive data. Your goal is to Iterate over different models and maximize predictive performance. Use execute script action to verify if the goal is achieved. 
+        "benchmark_folder_name": "non_descriptive_ad-eod-patt1"
+    },
+    "adult_balance-eod-sex": {
+        "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model that best balances accuracy and fairness with respect to equal opportunity difference across sex. Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and equal opportunity difference. Do not inspect any CSV files directly. Use the execute script action to verify if the goal is achieved.",
+        "benchmark_folder_name": "adult-eod-sex"
+    },
+    "adult_balance-eod-nosuccess-sex": {
+        "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model that best balances accuracy and fairness with respect to equal opportunity difference across sex. Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and equal opportunity difference. Do not inspect any CSV files directly.",
+        "benchmark_folder_name": "adult-eod-sex"
+    },
+    "adult_balance-eod-shortgoal-sex": {
+        "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. To achieve the best accuracy-fairness compromise regarding equal opportunity difference, systematically test various algorithms, variable selection approaches, and bias-mitigation preprocessing techniques. Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and equal opportunity difference. Do not inspect any CSV files directly. Use the execute script action to verify if the goal is achieved.",
+        "benchmark_folder_name": "adult-eod-sex"
+    },
+    "adult_balance-eod-noreq-sex": {
+        "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model that best balances accuracy and fairness with respect to disparate impact across sex. Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and equal opportunity difference. Refine train.py to balance performance and sex-based fairness metrics. Use the execute script action to verify if the goal is achieved.",
+        "benchmark_folder_name": "adult-eod-sex"
+    },
+    "adult_balance-eod-nochange-sex": {
+        "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model that best balances accuracy and fairness with respect to equal opportunity difference across sex. Do not inspect any CSV files directly. Use the execute script action to verify if the goal is achieved.",
+        "benchmark_folder_name": "adult-eod-sex"
+    },
+    "adult_balance-eod-nohow-sex": {
+        "research_problem": "Go through the data_description.txt file to understand the data and its features. Then, update the provided train.py script to train a model that best balances accuracy and fairness with respect to equal opportunity difference across sex. Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and equal opportunity difference. Do not inspect any CSV files directly. Use the execute script action to verify if the goal is achieved.",
+        "benchmark_folder_name": "adult-eod-sex"
+    },
+    "adult_balance-eod-rephrased01-sex": {
+        "research_problem": "Review the data_description.txt file to get familiar with the dataset and its features and note down essential points in your research logs. Modify the train.py script to train a model that optimizes both accuracy and fairness, focusing on reducing equal opportunity difference with respect to sex. Experiment with various models, feature selection methods, and fairness-aware preprocessing to balance model performance and disparate impact. Do not look at CSV files. Use the execute script to check the results.",
+        "benchmark_folder_name": "adult-eod-sex"
+    },
+    "adult_balance-eod-rephrased10-sex": {
+        "research_problem": "Several methods for analyzing data_description.txt should be explored to interpret the variables and dataset structure. Summarize key points in your research logs to track your actions. Then, update the provided train.py script to train a model that best balances accuracy and fairness with respect to equal opportunity difference across sex. Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and equal opportunity difference. Do not inspect any CSV files directly. Use the execute script action to verify if the goal is achieved.",
+        "benchmark_folder_name": "adult-eod-sex"
+    },
+    "adult_balance-eod-rephrased06-sex": {
+        "research_problem": "The data_description.txt file contains feature information - outline multiple techniques to analyze it and understand the data. To track your activities, condense the main insights within your research documentation. When updating train.py, ensure the model training process balances precision and impartiality with respect to differential effects across gender. Test various models, feature selectors, and fairness preprocessing to optimize the accuracy-equal opportunity difference trade-off. Bypass direct analysis of any comma-separated value documents. Validating whether the objective is achieved involves running the script action.",
+        "benchmark_folder_name": "adult-eod-sex"
+    },
+    "adult_balance-eod-rephrased05-sex": {
+        "research_problem": "Demonstrate multiple techniques for parsing data_description.txt so I can grasp the information and its characteristics. Document essential findings in your research records to monitor your progress. Modify train.py (the provided script) to ensure the trained model achieves an optimal trade-off between performance and equity regarding sex-based equal opportunity difference. To achieve the best accuracy-fairness compromise regarding equal opportunity difference, systematically test various algorithms, variable selection approaches, and bias-mitigation preprocessing techniques. CSV files should not be opened or examined directly. Use the execute script action to verify if the goal is achieved.",
+        "benchmark_folder_name": "adult-eod-sex"
+    },
+    "adult_balance-eod-rephrased03-sex": {
+        "research_problem": "Use the data_description.txt file to learn about the dataset and what each feature represents. Then, redo the training process in train.py to ensure that the resulting model maintains high accuracy while minimizing equal opportunity difference across sex. Explore different machine learning algorithms, ways to select relevant features, and preprocessing strategies designed to minimize bias and maximize accuracy. Do not inspect CSVs directly. Use the execute script to check if the desired outcome was reached.",
+        "benchmark_folder_name": "adult-eod-sex"
+    },
+    "adult_balance-eod-altmetricdetail-sex": {
+        "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. Multiple models, attribute selection techniques, and equity-oriented preprocessing approaches should be tested to identify the best balance between correctness and disparity in recall. Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and equal opportunity difference. Do not inspect any CSV files directly. Use the execute script action to verify if the goal is achieved.",
+        "benchmark_folder_name": "adult-eod-sex"
+    },
+    "adult_balance-eod-altmetricdetail2-sex": {
+        "research_problem": "Go through the data_description.txt file to understand the data and its features. Summarize key points in your research logs to track your actions. By exploring different models along with feature engineering strategies and equity-focused data preparation methods, you'll determine the ideal balance between correct predictions and groupwise differences in sensitivity. Iterate over different models, feature selection strategies, and fairness-oriented preprocessing techniques to find an optimal trade-off between accuracy and equal opportunity difference. Do not inspect any CSV files directly. Use the execute script action to verify if the goal is achieved.",
Use the execute script action to verify if the goal is achieved.", + "benchmark_folder_name": "adult-eod-sex" } -} \ No newline at end of file +} diff --git a/fairnessBench/environment.py b/fairnessBench/environment.py index b111272..2789ae8 100644 --- a/fairnessBench/environment.py +++ b/fairnessBench/environment.py @@ -200,13 +200,13 @@ def _initialize_task_env(self): llm_ignore_files = open(os.path.join(benchmark_dir, "scripts", "env_read_only_files.txt"), "r").read().split("\n") for path, subdirs, files in os.walk(os.path.join(work_dir)): - # relpath = os.path.relpath(path, work_dir) + relpath = os.path.relpath(path, work_dir) # filter out the files that are read only - # filenames = [os.path.join(relpath, filename) for filename in files] + filenames = [os.path.join(relpath, filename) for filename in files] for llm_ignore_file in llm_ignore_files: - # llm_ignore_filenames = [n for n in filenames if fnmatch.fnmatch(n, llm_ignore)] - if llm_ignore_file in files: - self.env_read_only_files.append(llm_ignore_file) + llm_ignore_filenames = [n for n in filenames if fnmatch.fnmatch(n, llm_ignore_file)] + #if llm_ignore_file in files: + self.env_read_only_files.extend(llm_ignore_filenames) # init backup folder and remove all content if it exists diff --git a/fairnessBench/eval.py b/fairnessBench/eval.py index 18632cc..c512267 100644 --- a/fairnessBench/eval.py +++ b/fairnessBench/eval.py @@ -131,7 +131,7 @@ class EvaluationResult: extra: Dict[str, bool] -def run_eval(log_folder, benchmark_folder_name, eval_model = None, eval_intermediate=False): +def run_eval(log_folder, benchmark_folder_name, eval_model=None, eval_intermediate=False, llm_eval=False, log_eval=False, flake8=False): results = {} # Log folder is the specific log folder for one model and one task @@ -139,7 +139,8 @@ def run_eval(log_folder, benchmark_folder_name, eval_model = None, eval_intermed for subdir, dirs, files in os.walk(log_folder): for file in files: - if file == 'trace.json': + # trace.json is an indicator that at least one step occurred + if file == 'trace.json': # TODO: Add new keys result = EvaluationResult( path=os.path.join(subdir, file), summary="", @@ -200,27 +201,29 @@ def run_eval(log_folder, benchmark_folder_name, eval_model = None, eval_intermed result.score.append(eval_step_score) print(e) # Getting llm_eval here using llm_eval_repeat function - # try: - # llm_score = repeat_llm_eval(5, train_script, eval_model) - # result.llm_score.append(llm_score) - # except Exception as e: - # print("\nllm_eval didn't work\n") - # print(e) - # pass + if llm_eval and eval_model: + try: + llm_score = repeat_llm_eval(5, train_script, eval_model) + result.llm_score.append(llm_score) + except Exception as e: + print("\nllm_eval didn't work\n") + print(e) + pass # Getting Flake8 score here - try: - flake8_score = get_flake8(train_script) - result.flake8_score.append(flake8_score) - print(flake8_score) - except Exception as e: - print("\nFlake8_eval didn't work\n") - print(e) - pass + if flake8: + try: + flake8_score = get_flake8(train_script) + result.flake8_score.append(flake8_score) + # print(f"Flake8 final score: {flake8_score0}") + except Exception as e: + print("\nFlake8_eval didn't work\n") + print(e) + pass - # Add the ids of the steps that were evaluated to the JSON file result.score_steps = list(subsampled_list) - + + # Evaluate the final step folder_path = os.path.join(subdir, 'traces/step_final_files') train_script = os.path.join(folder_path, ".train.py") if (os.path.exists(os.path.join(folder_path, ".train.py"))) 
diff --git a/fairnessBench/eval.py b/fairnessBench/eval.py
index 18632cc..c512267 100644
--- a/fairnessBench/eval.py
+++ b/fairnessBench/eval.py
@@ -131,7 +131,7 @@ class EvaluationResult:
     extra: Dict[str, bool]
 
 
-def run_eval(log_folder, benchmark_folder_name, eval_model = None, eval_intermediate=False):
+def run_eval(log_folder, benchmark_folder_name, eval_model=None, eval_intermediate=False, llm_eval=False, log_eval=False, flake8=False):
     results = {}
 
     # Log folder is the specific log folder for one model and one task
@@ -139,7 +139,8 @@ def run_eval(log_folder, benchmark_folder_name, eval_model = None, eval_intermed
     for subdir, dirs, files in os.walk(log_folder):
         for file in files:
-            if file == 'trace.json':
+            # trace.json is an indicator that at least one step occurred
+            if file == 'trace.json':  # TODO: Add new keys
                 result = EvaluationResult(
                     path=os.path.join(subdir, file),
                     summary="",
@@ -200,27 +201,29 @@ def run_eval(log_folder, benchmark_folder_name, eval_model = None, eval_intermed
                             result.score.append(eval_step_score)
                         print(e)
                     # Getting llm_eval here using llm_eval_repeat function
-                    # try:
-                    #     llm_score = repeat_llm_eval(5, train_script, eval_model)
-                    #     result.llm_score.append(llm_score)
-                    # except Exception as e:
-                    #     print("\nllm_eval didn't work\n")
-                    #     print(e)
-                    #     pass
+                    if llm_eval and eval_model:
+                        try:
+                            llm_score = repeat_llm_eval(5, train_script, eval_model)
+                            result.llm_score.append(llm_score)
+                        except Exception as e:
+                            print("\nllm_eval didn't work\n")
+                            print(e)
+                            pass
                     # Getting Flake8 score here
-                    try:
-                        flake8_score = get_flake8(train_script)
-                        result.flake8_score.append(flake8_score)
-                        print(flake8_score)
-                    except Exception as e:
-                        print("\nFlake8_eval didn't work\n")
-                        print(e)
-                        pass
+                    if flake8:
+                        try:
+                            flake8_score = get_flake8(train_script)
+                            result.flake8_score.append(flake8_score)
+                            # print(f"Flake8 step score: {flake8_score}")
+                        except Exception as e:
+                            print("\nFlake8_eval didn't work\n")
+                            print(e)
+                            pass
 
-                # Add the ids of the steps that were evaluated to the JSON file
                 result.score_steps = list(subsampled_list)
-
+
+                # Evaluate the final step
                 folder_path = os.path.join(subdir, 'traces/step_final_files')
                 train_script = os.path.join(folder_path, ".train.py") if (os.path.exists(os.path.join(folder_path, ".train.py"))) else os.path.join(folder_path, "train.py")
@@ -229,29 +232,30 @@ def run_eval(log_folder, benchmark_folder_name, eval_model = None, eval_intermed
                     eval_final_score = module.get_score(folder_path)
                     result.score.append(eval_final_score)
                     result.final_score = eval_final_score
-                    print(eval_final_score)
+                    print(f"Final score: {eval_final_score}")
                 except Exception as e:
                     print(e)
                     pass
                 # Getting llm_eval here using llm_eval_repeat function
-                if eval_model:
+                if llm_eval and eval_model:
                     try:
-                        llm_score = repeat_llm_eval(1, train_script, eval_model)
+                        llm_score = repeat_llm_eval(5, train_script, eval_model)
                         result.final_llm_score = llm_score
                     except Exception as e:
                         print("\nllm_eval didn't work\n")
                         print(e)
                         pass
                 # Getting Flake8 score
-                try:
-                    flake8_score = get_flake8(train_script)
-                    result.final_flake8_score = flake8_score
-                    print(flake8_score)
-                except Exception as e:
-                    print("\nFlake8_eval didn't work\n")
-                    print(e)
-                    pass
-                if eval_model:
+                if flake8:
+                    try:
+                        flake8_score = get_flake8(train_script)
+                        result.final_flake8_score = flake8_score
+                        print(f"Flake8 final score: {flake8_score}")
+                    except Exception as e:
+                        print("\nFlake8_eval didn't work\n")
+                        print(e)
+                        pass
+                if log_eval and eval_model:
                     # Getting LLM log eval here using llm_eval_log_repeat function
                     log_file = os.path.join(subdir.rsplit('/', 1)[0], "agent_log/main_log")
                     try:
@@ -260,13 +264,12 @@ def run_eval(log_folder, benchmark_folder_name, eval_model = None, eval_intermed
                         # use the latest step in the eval
                         if history_step:
                             # create a temporary file with the history_step content
-                            #history_text = "\n\n".join(history_step)
                             temp_txt_path = os.path.join(subdir.rsplit('/', 1)[0], "agent_log/temp_history_step.txt")
                             with open(temp_txt_path, 'w') as f:
                                 lines = [line.strip() for line in history_step.splitlines()]
                                 f.write('\n'.join(lines))
                             print(f"Using history_steps from {temp_txt_path} for evaluation")
-                            log_score = repeat_llm_eval_log(1, temp_txt_path, eval_model)
+                            log_score = repeat_llm_eval_log(5, temp_txt_path, eval_model)
                         else:
                             # if there's no agent step (which I doubt), fall back to the main_log (it will be too long and make the model fail, lol)
                             print("No history step files found.")
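The hunks above raise the repeat counts passed to repeat_llm_eval and repeat_llm_eval_log from 1 to 5. Those helpers' internals are not part of this patch, so the sketch below only illustrates the presumable motivation: an LLM judge is stochastic, and aggregating several samples stabilises the score.

import random
import statistics

def repeat_score(n, score_once):
    # Hypothetical stand-in for repeat_llm_eval / repeat_llm_eval_log:
    # sample the noisy judge n times and aggregate.
    return statistics.mean(score_once() for _ in range(n))

def noisy_judge():
    # Toy judge whose score wobbles from run to run.
    return random.gauss(0.7, 0.1)

# The mean of 5 samples has one fifth the variance of a single sample.
print(repeat_score(5, noisy_judge))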
@@ -296,39 +299,47 @@ def run_eval(log_folder, benchmark_folder_name, eval_model = None, eval_intermed
 
                 results[os.path.join(subdir, file)] = result
 
     return results
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--log-folder", type=str, default="logs")
-    parser.add_argument("--task", type=str, default="adult")
-    parser.add_argument("--output-file", type=str, default="results.json")
-    parser.add_argument("--eval_model", type=str, default=None)
-    parser.add_argument("--eval-intermediate", action="store_true")
+    parser.add_argument("--log-folder", type=str, default="final_exp_logs")  # Logs of one model/task with multiple runs
+    parser.add_argument("--task", type=str, default="adult_di_best-sex")  # All runs for a single model/task will be evaluated
+    parser.add_argument("--eval-intermediate", action="store_true")  # Evaluate all steps, not just the final step
+    parser.add_argument("--eval-model", type=str, default=None)  # LLM evaluator
+    parser.add_argument("--llm-eval", action="store_true")  # Evaluate train.py with the LLM evaluator
+    parser.add_argument("--log-eval", action="store_true")  # Evaluate logs (thought-action) with the LLM evaluator
+    parser.add_argument("--flake8", action="store_true")  # Evaluate train.py with flake8
+    parser.add_argument("--output-file", type=str, default="results.json")  # JSON result file for the model/task
     args = parser.parse_args()
 
+    # Report if the task has no logs
     if not os.path.exists(args.log_folder):
         print(f"WARNING\nWARNING\nWARNING: The log folder {args.log_folder} doesn't exist. \nWARNING\nWARNING")
         exit()
 
-    if os.path.exists(args.output_file):
-        with open(args.output_file) as f:
-            content = json.load(f)
-            if content:
-                print(f"WARNING\nWARNING\nWARNING: Results for {args.output_file} already exists\nWARNING\nWARNING")
-                exit()
-
-
-    benchmark_folder_name = get_task_info(args.task)[0]
-    results = run_eval(args.log_folder, benchmark_folder_name, eval_model = args.eval_model, eval_intermediate = args.eval_intermediate)
+    # To find task/scripts/eval.py
+    benchmark_folder_name = get_task_info(args.task)[0]  # Return is (folder, research problem)
+    results = run_eval(args.log_folder, benchmark_folder_name, eval_intermediate=args.eval_intermediate, eval_model=args.eval_model, llm_eval=args.llm_eval, log_eval=args.log_eval, flake8=args.flake8)
 
+    # Report a failure in the run_eval function - prevents an empty file from being generated
     if not results:
-        print(f"WARNING\nWARNING\nWARNING: Results for {args.log_folder.rsplit('/')} is empty\nWARNING\nWARNING")
+        print(f"WARNING\nWARNING\nWARNING: Eval failed. Results for {args.log_folder.rsplit('/')} were empty\nWARNING\nWARNING")
     else:
+        # Report if the JSON result file already exists so that we don't overwrite it
+        if os.path.exists(args.output_file):
+            with open(args.output_file) as f:
+                content = json.load(f)
+            if content:  # If the file is empty it's ok to overwrite
+                print(f"WARNING\nWARNING\nWARNING: Results for {args.output_file} already exist. Concatenating...\nWARNING\nWARNING")
+                for key, prev_result in content.items():
+                    if key not in results:
+                        results[key] = prev_result
         json.dump(results, open(args.output_file, "w"), indent=4, cls=EnhancedJSONEncoder)
-
-
+
\ No newline at end of file
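One detail behind the argparse changes above: argparse's store_true action accepts no type keyword, so combining type=bool with action="store_true" raises a TypeError when the parser is built. The boolean flags are therefore declared with the action alone. A quick self-contained check:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--flake8", action="store_true")  # flag present => True
args = parser.parse_args(["--flake8"])
assert args.flake8 is True

try:
    parser.add_argument("--llm-eval", type=bool, action="store_true")
except TypeError as e:
    print(e)  # the store_true action class takes no 'type' argument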
diff --git a/fairnessBench/runner.py b/fairnessBench/runner.py
index bd7b4b7..7437428 100644
--- a/fairnessBench/runner.py
+++ b/fairnessBench/runner.py
@@ -29,8 +29,8 @@ def run(agent_cls, args):
     print("Research problem: ", research_problem)
     print("Lower level actions enabled: ", [action.name for action in env.low_level_actions])
     print("High level actions enabled: ", [action.name for action in env.high_level_actions])
-    print("Read only files: ", env.read_only_files, file=sys.stderr)
-    print("Env read only files: ", env.env_read_only_files, file=sys.stderr)
+    print("Read only files: ", env.read_only_files if len(env.read_only_files) < 10 else env.read_only_files[:10], file=sys.stderr)
+    print("Env read only files: ", env.env_read_only_files if len(env.env_read_only_files) < 10 else env.env_read_only_files[:10], file=sys.stderr)
     print("=====================================")
 
     # AS: Create agent object from whichever agent was requested in args
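Since env_read_only_files is now glob-expanded (see the environment.py change above), it can hold one entry per matched file and grow long; the truncated prints keep stderr readable. The same pattern in isolation, with made-up names:

# Hypothetical expanded read-only list.
read_only_files = [f"./fold_{i}/train_X.csv" for i in range(25)]

# Same truncation as in runner.py: show at most the first 10 entries.
preview = read_only_files if len(read_only_files) < 10 else read_only_files[:10]
print("Read only files: ", preview)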
removing it" + rm -rf $base/$folder/$ts fi - mkdir -p "/project/pi_brownsarahm_uri_edu/ayman_uri/fairnessBench/new_$folder/$ts" + mkdir -p "$base/$folder/$ts" # Call the prepare task script python -u -m fairnessBench.prepare_task $task $python # Printing command for debugging purposes and executing task with runner.py - echo "python -u -m fairnessBench.runner --python $python --task $task --device $i --log-dir /project/pi_brownsarahm_uri_edu/ayman_uri/fairnessBench/new_$folder/$ts --work-dir /scratch3/workspace/ayman_sandouk_uri_edu-fairness/fairnessBench/workspaces/$folder/$ts ${extra_args}" > /project/pi_brownsarahm_uri_edu/ayman_uri/fairnessBench/new_$folder/$ts/log 2>&1 & + echo "python -u -m fairnessBench.runner --python $python --task $task --device $i --log-dir $base/$folder/$ts --work-dir $base/workspaces/$folder/$ts ${extra_args}" > $base/$folder/$ts/log 2>&1 & - eval "python -u -m fairnessBench.runner --python $python --task $task --device $i --log-dir /project/pi_brownsarahm_uri_edu/ayman_uri/fairnessBench/new_$folder/$ts --work-dir /scratch3/workspace/ayman_sandouk_uri_edu-fairness/fairnessBench/workspaces/$folder/$ts ${extra_args}" > /project/pi_brownsarahm_uri_edu/ayman_uri/fairnessBench/new_$folder/$ts/log 2>&1 & + eval "python -u -m fairnessBench.runner --python $python --task $task --device $i --log-dir $base/$folder/$ts --work-dir $base/workspaces/$folder/$ts ${extra_args}" > $base/$folder/$ts/log 2>&1 & # 2 seconds between runs sleep 2