diff --git a/.gitignore b/.gitignore index c557836..2d47b79 100644 --- a/.gitignore +++ b/.gitignore @@ -13,10 +13,11 @@ Run_scripts/* run_experiments_*.sh test*.txt eval*.sh +fairnessbench_analysis/*/*.png +fairnessbench_analysis/*/*.csv - - - +# path +path.py # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/fairnessbench_analysis/adult_di_code_llmeval.py b/fairnessbench_analysis/adult_di_code_llmeval.py new file mode 100644 index 0000000..69714c3 --- /dev/null +++ b/fairnessbench_analysis/adult_di_code_llmeval.py @@ -0,0 +1,59 @@ +import os +import pandas as pd +import numpy as np +from datetime import datetime +import matplotlib.pyplot as plt +import seaborn as sns +from path import CSV_FILES,GRAPHS +# Loading useful dataframes +file = CSV_FILES/'Result_Final_code_clean2025-09-18T00:48:40.584077.csv' +code_eval = pd.read_csv(file) + + +# Removing missing rows fairnessBench +code_eval= code_eval.dropna(how="any") + +task_data_metric = code_eval['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +task_data_dem = task_data_metric['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'}) +wider_code = pd.concat([code_eval, task_data_metric,task_data_dem],axis=1) +wider_cols=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id','total_llm_score',"1. Data Collection and Processing","2. Bias Detection and Mitigation","3. Fairness Metric Selection","4. Model Selection and Training","5. Evaluation and Testing"] +wider_code = wider_code[wider_cols] + +# filtering the adult dataset and the di task_metric +adult= wider_code[wider_code['task_dataset']=='adult'] +adult_di=adult[adult['task_metric']=='di'] + +long_df = adult_di.melt( + id_vars=['model','task_dataset','task_metric','resrch_prob','dem'], + value_vars=[ + '1. Data Collection and Processing', + '2. Bias Detection and Mitigation', + '3. Fairness Metric Selection', + '4. Model Selection and Training', + '5. 
Evaluation and Testing'
+    ],
+    var_name='rubric_section',
+    value_name='score'
+)
+
+sns.set_context(context='poster',font_scale=1.0)
+m=sns.catplot(
+    data=long_df,
+    x="rubric_section",
+    y="score",
+    hue="model",
+    col="resrch_prob",
+    row='dem',
+    kind="bar",
+    aspect=2
+)
+m.set_titles(template='{col_var}: {col_name}\n{row_var}: {row_name}')
+for ax in m.axes.flatten():
+    plt.setp(ax.get_xticklabels(), rotation=30)
+    ax.axhline(y=4.0, color='black', linestyle='-.', alpha=0.3)
+
+output = os.path.join(GRAPHS, 'adult_di_code_llm_eval.png')
+plt.savefig(output, dpi=400, bbox_inches='tight')
\ No newline at end of file
diff --git a/fairnessbench_analysis/adult_fairness.py b/fairnessbench_analysis/adult_fairness.py
new file mode 100644
index 0000000..c104bd1
--- /dev/null
+++ b/fairnessbench_analysis/adult_fairness.py
@@ -0,0 +1,64 @@
+import os
+import pandas as pd
+import numpy as np
+from datetime import datetime
+import matplotlib.pyplot as plt
+import seaborn as sns
+from path import CSV_FILES,GRAPHS
+
+# Loading useful dataframes
+file = CSV_FILES/'Final_step_perfomance2025-09-18T00:48:26.025263.csv'
+perf_alt = pd.read_csv(file)
+
+# Removing rows where all performance metrics are missing
+perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff',
+       'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio',
+       'false_omission_rate_diff']
+perf_alt= perf_alt.dropna(subset=perf, how='all')
+perf_alt = perf_alt.replace([np.inf, -np.inf], np.nan).fillna(0)
+
+# expanding the task column into sub-columns
+task_data_metric = perf_alt['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'})
+wider_code = pd.concat([perf_alt, task_data_metric],axis=1)
+wider_cols=['model','task','task_dataset','task_metric','task-dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','score_count']
+wider_code = wider_code[wider_cols]
+
+# Filtering only the adult dataset from the dataframe
+wider_adult = wider_code[wider_code['task_dataset']=='adult']
+fairness_metrics= ['di','error_rate_ratio','statistical_parity_diff','equal_opp_diff','error_rate_diff','false_omission_rate_diff']
+wider_ADULT = (
+    wider_adult.groupby(['model','task-dem','task_metric'])[fairness_metrics].mean()
+).reset_index()
+ad_df= wider_ADULT['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'})
+wider_ADULT=pd.concat([wider_ADULT,ad_df],axis=1)
+wider_ADULT=wider_ADULT.replace('claude-3-7-sonnet-20250219','claude-3-7-sonnet')
+
+metric_map = {
+    'acc': 'acc',
+    'di': 'di',
+    'spd': 'statistical_parity_diff',
+    'eod': 'equal_opp_diff',
+    'err' : 'error_rate_ratio',
+    'erd' : 'error_rate_diff',
+    'ford': 'false_omission_rate_diff',
+}
+wider_ADULT.loc[:, 'task_metric_value'] = wider_ADULT.apply(lambda row: row[metric_map[row['task_metric']]], axis=1)
+
+sns.set_context(context='poster',font_scale=1.0)
+g=sns.catplot(data=wider_ADULT,x='resrch_prob',y='task_metric_value',hue='dem',row='task_metric',col='model',kind='bar'
+              ,aspect=1)
+
+g.set_titles(template='{col_var}: {col_name}\n{row_var}: {row_name}')
+# adding reference lines to each subplot: 1.0 for ratio-metric rows, 0.0 for difference-metric rows
+ax=g.axes
+for i in range(ax.shape[1]):
+    ax[0,i].axhline(y=1.0, color='black', linestyle='-.', alpha=0.3)
+    ax[1,i].axhline(y=0.0, color='black', linestyle='-.', alpha=0.6)
+    ax[2,i].axhline(y=0.0, color='black', linestyle='-.', alpha=0.6)
+    ax[3,i].axhline(y=1.0, color='black', linestyle='-.', alpha=0.3)
+    ax[4,i].axhline(y=0.0, color='black', linestyle='-.', alpha=0.6)
+    ax[5,i].axhline(y=0.0, color='black', linestyle='-.', alpha=0.6)
+
+output = os.path.join(GRAPHS, 'adult_fairness.png')
+plt.savefig(output,dpi=300, bbox_inches='tight')
diff --git a/fairnessbench_analysis/balancing_fairness.py b/fairnessbench_analysis/balancing_fairness.py
new file mode 100644
index 0000000..b0bcd3a
--- /dev/null
+++ b/fairnessbench_analysis/balancing_fairness.py
@@ -0,0 +1,63 @@
+import os
+import pandas as pd
+import numpy as np
+from datetime import datetime
+import matplotlib.pyplot as plt
+import seaborn as sns
+from path import CSV_FILES,GRAPHS
+
+# Loading useful dataframes
+file = CSV_FILES/'Final_step_perfomance2025-09-18T00:48:26.025263.csv'
+perf_alt = pd.read_csv(file)
+
+# Removing rows where all performance metrics are missing
+perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff',
+       'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio',
+       'false_omission_rate_diff']
+perf_alt= perf_alt.dropna(subset=perf, how='all')
+perf_alt = perf_alt.replace([np.inf, -np.inf], np.nan).fillna(0)
+
+# expanding the task column into sub-columns
+task_data_metric = perf_alt['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'})
+wider_code = pd.concat([perf_alt, task_data_metric],axis=1)
+wider_cols=['model','task','task_dataset','task_metric','task-dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','score_count']
+wider_code = wider_code[wider_cols]
+
+task_task_dem = wider_code['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'rsch_prob',1:'dem'})
+wider = pd.concat([wider_code, task_task_dem],axis=1)
+cols= ['model','task','task_dataset','task_metric','task-dem','rsch_prob','dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','score_count']
+wider=wider[cols]
+
+# Filtering only the balance task from the dataframe
+wider_balance = wider[wider['rsch_prob']=='balance']
+wider_balance=wider_balance.replace('claude-3-7-sonnet-20250219','claude-3-7-sonnet')
+
+wider_balance= wider_balance.copy()
+metric_map = {
+    'acc': 'acc',
+    'di': 'di',
+    'spd': 'statistical_parity_diff',
+    'eod': 'equal_opp_diff',
+    'err' : 'error_rate_ratio',
+    'erd' : 'error_rate_diff',
+    'ford': 'false_omission_rate_diff',
+}
+wider_balance.loc[:, 'task_metric_value'] = wider_balance.apply(lambda row: row[metric_map[row['task_metric']]], axis=1)
+
+sns.set_context(context='poster',font_scale=1.0)
+g=sns.relplot(data=wider_balance,x='acc',y='task_metric_value',hue='task_dataset',style='dem',row='task_metric',col='model',kind='scatter',
+              aspect=1)
+g.set_titles(template='{col_var}: {col_name}\n{row_var}: {row_name}')
+
+# add reference lines: 1.0 for ratio-metric rows, 0.0 for difference-metric rows, and a vertical line at acc=1
+ax=g.axes
+for i in range(len(ax)):
+    for j in range(len(ax[0])):
+        if i in [0,3]:  # ratio metrics (di, err): ideal value is 1
+            ax[i,j].axhline(y=1.0, color='black', linestyle='-.', alpha=0.3)
+        elif i in [1, 2, 4, 5]:  # difference metrics: ideal value is 0
+            ax[i,j].axhline(y=0.0, color='black', linestyle='-.', alpha=0.3)
+        ax[i,j].axvline(x=1.0, color='black', linestyle='-.', alpha=0.2)
+
+output = os.path.join(GRAPHS,'balancing_fairness.png')
+plt.savefig(output,dpi=400,bbox_inches='tight')
\ No newline at end of file
diff --git a/fairnessbench_analysis/code_log_llm_eval.py 
b/fairnessbench_analysis/code_log_llm_eval.py new file mode 100644 index 0000000..906635f --- /dev/null +++ b/fairnessbench_analysis/code_log_llm_eval.py @@ -0,0 +1,54 @@ +import os +import pandas as pd +import numpy as np +from datetime import datetime +import matplotlib.pyplot as plt +import seaborn as sns +from path import CSV_FILES,GRAPHS + + +# Loading useful dataframes +code_eval = pd.read_csv(CSV_FILES/'Result_Final_code_clean2025-09-18T00:48:40.584077.csv') +log_eval = pd.read_csv(CSV_FILES/'Results_Final_log_clean2025-09-18T00:48:52.486398.csv') +perf_df= pd.read_csv(CSV_FILES/'Final_step_perfomance2025-09-18T00:48:26.025263.csv') + + +# Removing missing rows +code_eval= code_eval.dropna(how="any") +code_eval = code_eval.fillna(0) +log_eval= log_eval.dropna(how='any') +perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', + 'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio', + 'false_omission_rate_diff'] +perf_df= perf_df.dropna(subset=perf, how='all') +perf_df = perf_df.fillna(0) + +task_data_metric = code_eval['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +task_data_dem = task_data_metric['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'}) +wider_code = pd.concat([code_eval, task_data_metric,task_data_dem],axis=1) +wider_cols=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id','final_flake8_score',"1. Data Collection and Processing","2. Bias Detection and Mitigation","3. Fairness Metric Selection","4. Model Selection and Training","5. Evaluation and Testing"] +wider_code = wider_code[wider_cols] + +score_cols = ["1. Data Collection and Processing","2. Bias Detection and Mitigation","3. Fairness Metric Selection","4. Model Selection and Training","5. Evaluation and Testing"] +code_tall = wider_code.melt(id_vars=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id'], + value_vars=score_cols,var_name='score') +output= os.path.join(GRAPHS,'codeval') +sns.catplot(code_tall,col='model',row='resrch_prob',x= 'task_dataset',y='value',hue='score',kind='bar').savefig(output) + + +# log eval +task_data_metric = log_eval['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +task_data_dem = task_data_metric['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'}) +wider_log = pd.concat([log_eval, task_data_metric,task_data_dem],axis=1) +wider_cols=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id',"1. Model Overview", "2. Stakeholder Identification and Fairness Definition","3. Data Collection and Processing","4. Bias Detection and Mitigation","5. Fairness Metric Selection","6. Model Selection and Training","7. Evaluation and Testing"] +wider_log = wider_log[wider_cols] +wider_log.head() + +score_cols = ["1. Model Overview", "2. Stakeholder Identification and Fairness Definition","3. Data Collection and Processing","4. Bias Detection and Mitigation","5. Fairness Metric Selection","6. Model Selection and Training","7. 
Evaluation and Testing"] +log_tall = wider_log.melt(id_vars=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id'], + value_vars=score_cols,var_name='score') + + +output=os.path.join(GRAPHS,'logval') +sns.catplot(log_tall,col='model',x='resrch_prob',row= 'task_dataset',y='value',hue='score',kind='bar').savefig(output) + diff --git a/fairnessbench_analysis/correlation_flake8_code.py b/fairnessbench_analysis/correlation_flake8_code.py new file mode 100644 index 0000000..2b4c237 --- /dev/null +++ b/fairnessbench_analysis/correlation_flake8_code.py @@ -0,0 +1,57 @@ +import os +import pandas as pd +import numpy as np +from datetime import datetime +import matplotlib.pyplot as plt +import seaborn as sns +from path import CSV_FILES,GRAPHS + +# Loading useful dataframes +file= CSV_FILES/'Result_Final_code_clean2025-09-18T00:48:40.584077.csv' +code_eval = pd.read_csv(file) + +# Removing missing rows +code_eval= code_eval.dropna(how="any") + +task_data_metric = code_eval['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +task_data_dem = task_data_metric['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'}) +wider_code = pd.concat([code_eval, task_data_metric,task_data_dem],axis=1) +wider_cols=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id','total_llm_score',"1. Data Collection and Processing","2. Bias Detection and Mitigation","3. Fairness Metric Selection","4. Model Selection and Training","5. Evaluation and Testing",'final_flake8_score'] +wider_code = wider_code[wider_cols] + +# Correlation between flake8 and code llm eval on claude_adult_di_erd task +code_cols=['1. Data Collection and Processing','2. Bias Detection and Mitigation','3. Fairness Metric Selection','4. Model Selection and Training', '5. 
Evaluation and Testing'] +group_cols = ["model", "task_dataset", "resrch_prob", "task_metric"] # Add 'task_dem' if needed + +def flake8_corr_matrix(group): + # Compute correlation between flake8_score and each rubric section + corrs = [group["final_flake8_score"].corr(group[rubric]) for rubric in code_cols] + return pd.Series(corrs, index=code_cols) + +corrs = ( + wider_code.groupby(group_cols) + .apply(flake8_corr_matrix) + .reset_index() +) +corrs=corrs.fillna(0) + +group_filter = ( + (corrs['model'] == 'claude-3-7-sonnet-20250219') & + (corrs['task_dataset'] == 'adult') & + (corrs['resrch_prob'] == 'balance') & + (corrs['task_metric'] == 'erd') +) +corr_row = corrs.loc[group_filter, code_cols] + +plt.figure(figsize=(8, 2)) +sns.heatmap( + corr_row.values.reshape(1, -1), + annot=True, + cmap='coolwarm', + xticklabels=code_cols, + yticklabels=['Flake8 score'] +) +plt.title("Flake8 vs Rubric Correlation (claude-3, adult, balance, erd)") + +output= os.path.join(GRAPHS,'flake8_vs_code_correlation.png') +plt.savefig(output,bbox_inches='tight') \ No newline at end of file diff --git a/fairnessbench_analysis/cv_scores_evalmodels.py b/fairnessbench_analysis/cv_scores_evalmodels.py new file mode 100644 index 0000000..e7d9cd7 --- /dev/null +++ b/fairnessbench_analysis/cv_scores_evalmodels.py @@ -0,0 +1,20 @@ +import os +import pandas as pd +from path import FILES + +gemma_df = pd.read_csv(FILES/'Gemma_cv.csv') +deepseek_df= pd.read_csv(FILES/'Deepseek_cv.csv') +granite_df=pd.read_csv(FILES/'Granite_cv.csv') + +gemma_df['eval'] = 'gemma' +deepseek_df['eval'] = 'deepseek' +granite_df['eval'] = 'granite' + +cols = ['eval', 'model', 'task'] + [c for c in gemma_df.columns if c not in ['eval', 'model', 'task']] +gemma_df = gemma_df[cols] +deepseek_df = deepseek_df[cols] +granite_df = granite_df[cols] + +all_eval_cv = pd.concat([gemma_df, deepseek_df, granite_df], axis=0, ignore_index=True) + +all_eval_cv.to_csv('cv_scores_evalmodel.csv',index=False) \ No newline at end of file diff --git a/fairnessbench_analysis/di_across_datasets.py b/fairnessbench_analysis/di_across_datasets.py new file mode 100644 index 0000000..adcaeab --- /dev/null +++ b/fairnessbench_analysis/di_across_datasets.py @@ -0,0 +1,48 @@ +import os +import pandas as pd +import numpy as np +from datetime import datetime +import matplotlib.pyplot as plt +import seaborn as sns +from path import CSV_FILES,GRAPHS + +# Loading useful dataframes +file= CSV_FILES/'Final_step_perfomance2025-09-18T00:48:26.025263.csv' +perf_alt = pd.read_csv(file) + +# Removing missing rows +perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', + 'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio', + 'false_omission_rate_diff'] +perf_alt= perf_alt.dropna(subset=perf, how='all') +perf_alt = perf_alt.replace([np.inf, -np.inf], np.nan).fillna(0) + +# expanding the task to sub columns +task_data_metric = perf_alt['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +wider_code = pd.concat([perf_alt, task_data_metric],axis=1) +wider_cols=['model','task','task_dataset','task_metric','task-dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','score_count'] +wider_code = wider_code[wider_cols] + +# Filtering only DI from the dataframe +wider_di = wider_code[wider_code['task_metric']=='di'] +wider_DI = ( + wider_di.groupby(['task_dataset','task-dem'])[['di','acc']].mean().reset_index() +) 
+dem_df= wider_di['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'}) +wider_DI=pd.concat([wider_di,dem_df],axis=1) +wider_DI=wider_DI.replace('claude-3-7-sonnet-20250219','claude-3-7-sonnet') + +# ploting the scatter plot for di vs acc +sns.set_context(context='poster',font_scale=0.8) +g=sns.relplot(data=wider_DI,x='acc',y='di',hue='task_dataset',row='resrch_prob',style='dem',col='model',kind='scatter',aspect=1) +g.set_titles(template='{col_var}: {col_name}\n{row_var}: {row_name}') + +# adding horizontal lines at di=1 to each cell plot +for i, ax in enumerate(g.axes.flat): + ax.axhline(y=1.0, color='black', linestyle='-.', alpha=0.2) + ax.axvline(x=1.0, color='black', linestyle='-.', alpha=0.2) + +# saving the plot +output= os.path.join(GRAPHS,'di_vs_acc_scatter.png') + +plt.savefig(output, dpi=300, bbox_inches='tight') diff --git a/fairnessbench_analysis/explode_results.py b/fairnessbench_analysis/explode_results.py new file mode 100644 index 0000000..a8a9285 --- /dev/null +++ b/fairnessbench_analysis/explode_results.py @@ -0,0 +1,158 @@ +import os +import pandas as pd +import numpy as np +from datetime import datetime +import json +from path import PROJECT_ROOT,CSV_FILES + +# loading the performance results +perf_path = PROJECT_ROOT +result_files = [ + os.path.join(perf_path, fname) + for fname in os.listdir(perf_path) + if os.path.isfile(os.path.join(perf_path, fname)) +] +result_list = [] +for rf in result_files: + try: + if os.path.getsize(rf) == 0: + print(f"Skipping empty file: {rf}") + continue + df = pd.read_json(rf).T + result_list.append(df) + except Exception as e: + print(f"Skipping file {rf} due to error: {e}") +performance_df = pd.concat(result_list) + +end_series = lambda s: pd.Series(s[-5:]) +model_run = performance_df['path'].str.split('/').apply(end_series).rename(columns = + {i:c for i,c in enumerate(['model','task','run_ts'])}) + +model_run['run_id']= model_run.groupby(['model','task']).cumcount() +mr_keep = ['model','task','run_ts','run_id'] + +# extracting the performance scores for the results to save in a csv file +exploded_score = performance_df['final_score'].apply(pd.Series).reset_index().drop(columns=[0]) +exploded_score['score_count'] = exploded_score.groupby('index').cumcount() +sp = exploded_score['index'].str.split('/').apply(end_series) +sp = sp.rename(columns={i: c for i, c in enumerate([ 'model', 'task', 'run_ts'])}) + +exploded_score = exploded_score.join(sp[['model', 'task', 'run_ts']]) + +exploded_score['run_id'] = exploded_score.groupby(['model', 'task']).cumcount() + +cols = ['model', 'task', 'run_ts','run_id'] + [col for col in exploded_score.columns if col not in ['model', 'task', 'run_ts', 'run_id']] +exploded_score = exploded_score[cols] +exploded_score = exploded_score.drop(exploded_score.columns[4],axis=1) + +output_file=os.path.join(CSV_FILES, 'Final_step_perfomance' + datetime.isoformat(datetime.now()) +'.csv') +exploded_score.to_csv(output_file,index=False) +# loading llm eval results +result_path = PROJECT_ROOT +result_files = [ + os.path.join(result_path, fname) + for fname in os.listdir(result_path) + if os.path.isfile(os.path.join(result_path, fname)) +] +result_list = [] +for rf in result_files: + try: + if os.path.getsize(rf) == 0: + print(f"Skipping empty file: {rf}") + continue + df = pd.read_json(rf).T + result_list.append(df) + except Exception as e: + print(f"Skipping file {rf} due to error: {e}") +result_df = pd.concat(result_list) + +end_series = lambda s: pd.Series(s[-5:]) +model_run = 
result_df['path'].str.split('/').apply(end_series).rename(columns = + {i:c for i,c in enumerate(['model','task','run_ts'])}) + +model_run['run_id']= model_run.groupby(['model','task']).cumcount() +mr_keep = ['model','task','run_ts','run_id'] +# extracting llm code evaluation +raw_df= result_df[["final_llm_score"]].explode('final_llm_score',)['final_llm_score'].apply(pd.Series).reset_index().drop(columns=[0]) +exp_code= raw_df["raw_scores"].apply(pd.Series).drop(columns=[0]) +exp_code = raw_df.join(raw_df["raw_scores"].apply(pd.Series)).drop(columns= ['raw_scores', 'justifications', 'subtotals',0]) +splits = exp_code['index'].str.split('/').apply(end_series) +splits = splits.rename(columns={i: c for i, c in enumerate([ 'model', 'task', 'run_ts'])}) + +exp_code = exp_code.join(splits[['model', 'task', 'run_ts']]) + +exp_code['run_id'] = exp_code.groupby(['model', 'task']).cumcount() + +cols = ['model', 'task', 'run_ts','run_id'] + [col for col in exp_code.columns if col not in ['model', 'task', 'run_ts', 'run_id']] +exp_code = exp_code[cols] +exp_code = exp_code.drop(exp_code.columns[4],axis=1) + +# adding flake8 results to the code llm eval df +flake8_df = result_df[['path', 'final_flake8_score']].copy() +sps = flake8_df['path'].str.split('/').apply(end_series) +sps = sps.rename(columns={i: c for i, c in enumerate([ 'model', 'task', 'run_ts'])}) +flake8_df = flake8_df.join(sps[['model', 'task', 'run_ts']]) + +# merging both dfs +exp_code = exp_code.merge( + flake8_df[['model', 'task', 'run_ts', 'final_flake8_score']], + on=['model', 'task', 'run_ts'], + how='left' +) +output_file=os.path.join(CSV_FILES, 'Result_Final_code_clean' + datetime.isoformat(datetime.now()) +'.csv') +exp_code.to_csv(output_file,index=False) +# extracting log llm eval results +raw_log= result_df[["final_log_score"]].explode('final_log_score',)['final_log_score'].apply(pd.Series).reset_index().drop(columns = [0]) +exp_log= raw_log["raw_scores"].apply(pd.Series).drop(columns = [0]) +exp_log = raw_log.join(raw_log["raw_scores"].apply(pd.Series)).drop(columns= ['raw_scores', 'justifications', 'subtotals',0]) +exp_log = exp_log.rename(columns={"total_llm_score":"total_log_score"}) +split = exp_log['index'].str.split('/').apply(end_series) +split = split.rename(columns={i: c for i, c in enumerate([ 'model', 'task', 'run_ts'])}) + +exp_log = exp_log.join(split[['model', 'task', 'run_ts']]) + +exp_log['run_id'] = exp_log.groupby(['model', 'task']).cumcount() + +cols = ['model', 'task', 'run_ts','run_id'] + [col for col in exp_log.columns if col not in ['model', 'task', 'run_ts', 'run_id']] +exp_log = exp_log[cols] +exp_log = exp_log.drop(exp_log.columns[4],axis=1) + +output_file=os.path.join(CSV_FILES, 'Results_Final_log_clean' + datetime.isoformat(datetime.now()) +'.csv') +exp_log.to_csv(output_file,index=False) +# loading baseline results +result_path = '/project/pi_brownsarahm_uri_edu/ayman_uri/fairnessBench/sanity_results' +result_files = [ + os.path.join(result_path, resjson) + for resjson in os.listdir(result_path) + if os.path.isfile(os.path.join(result_path, resjson)) +] + +result_list = [pd.read_json(rf).T for rf in result_files] +result_df = pd.concat(result_list) + +end_series = lambda s: pd.Series(s[-4:]) +model_run = result_df['path'].str.split('/').apply(end_series).rename(columns = + {i:c for i,c in enumerate(['task','run_ts'])}) + +model_run['run_id']= model_run.groupby(['task','run_ts']).cumcount() +mr_keep = ['task','run_ts','run_id'] + +exploded_score = 
result_df[['score']].explode('score',)['score'].apply(pd.Series).reset_index().drop(columns = [0]) +exploded_score['score_count'] = exploded_score.groupby('index').cumcount() +sp = exploded_score['index'].str.split('/').apply(end_series) +sp = sp.rename(columns={i: c for i, c in enumerate([ 'task', 'run_ts'])}) + +exploded_score = exploded_score.join(sp[['task', 'run_ts']]) + +exploded_score['run_id'] = exploded_score.groupby(['task','run_ts']).cumcount() + +cols = [ 'task', 'run_ts','run_id'] + [col for col in exploded_score.columns if col not in [ 'task', 'run_ts', 'run_id']] +exploded_score = exploded_score[cols] +exploded_score = exploded_score.drop(exploded_score.columns[3],axis=1) +cols_to_prefix = [col for col in exploded_score.columns if col not in ['task', 'run_ts', 'run_id']] +exploded_score = exploded_score.rename( + columns={col: f'baseline_{col}' for col in cols_to_prefix} +) + +output_file=os.path.join(CSV_FILES, 'Baseline_cleaned_perfomance' + datetime.isoformat(datetime.now()) +'.csv') +exploded_score.to_csv(output_file,index=False) diff --git a/fairnessbench_analysis/performance_flake8_code.py b/fairnessbench_analysis/performance_flake8_code.py new file mode 100644 index 0000000..ec823c3 --- /dev/null +++ b/fairnessbench_analysis/performance_flake8_code.py @@ -0,0 +1,45 @@ +import os +import pandas as pd +import numpy as np +from datetime import datetime +import matplotlib.pyplot as plt +import seaborn as sns +from path import CSV_FILES,GRAPHS + +# Loading useful dataframes +file = CSV_FILES/'Result_Final_code_clean2025-09-18T00:48:40.584077.csv' +code_eval = pd.read_csv(file) + + +# Removing missing rows +code_eval= code_eval.dropna(how="any") + +task_data_metric = code_eval['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +task_data_dem = task_data_metric['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'}) +wider_code = pd.concat([code_eval, task_data_metric,task_data_dem],axis=1) +wider_cols=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id','total_llm_score',"1. Data Collection and Processing","2. Bias Detection and Mitigation","3. Fairness Metric Selection","4. Model Selection and Training","5. Evaluation and Testing",'final_flake8_score'] +wider_code = wider_code[wider_cols] + +sns.set_context(context='poster',font_scale=0.8) +g = sns.relplot(data=wider_code, + x='final_flake8_score', + y='total_llm_score', + col='model', + row='task_dataset', + hue='resrch_prob', + kind='scatter', + alpha=0.7, + height=4, + aspect=1) + +g.set_axis_labels('Flake8 Score', 'LLM Code Score') +g.set_titles(template='{col_var}: {col_name}\n{row_var}: {row_name}') +# add horizontal lines at di=1 to each cell plot +for i, ax in enumerate(g.axes.flat): + ax.axhline(y=85.0, color='black', linestyle='-.', alpha=0.2) + ax.axvline(x=75.0, color='black', linestyle='-.', alpha=0.2) + +output = os.path.join(GRAPHS,'performance_flake8_code.png') +plt.savefig(output,dpi=300) + + diff --git a/fairnessbench_analysis/readme.md b/fairnessbench_analysis/readme.md new file mode 100644 index 0000000..2ca8829 --- /dev/null +++ b/fairnessbench_analysis/readme.md @@ -0,0 +1,50 @@ +# fairnessbench analysis + +This folder contains all the code and data for analyzing the fairnessbench results. +The main analysis script is explode_results.py, which loads the raw results data and creates clean CSV files ready for analysis. + +# A. Setup: + +**Local path configuration** +1. 
Create `path.py` at the repo root.
+2. Create the csv_files and graphs directories, then set CSV_FILES and GRAPHS in path.py to the absolute paths on your machine.
+3. `path.py` is in `.gitignore`.
+### Required variables in `path.py`
+- **PROJECT_ROOT** — Directory that contains all *raw results*.
+- **CSV_FILES** — Directory that contains the *clean CSV files* produced by `explode_results.py`.
+- **GRAPHS** — Directory where analysis scripts will save generated *figures/plots*.
+- **FILES** — Directory that stores *CSV files from different evaluation models* (used by `cv_scores_evalmodels.py`).
+
+A minimal example `path.py` is sketched at the end of this README.
+
+# B. Run Analysis:
+**Run main file**
+```bash
+python explode_results.py
+```
+This will create the following files in the csv_files/ directory:
+- Result_Final_code_clean*.csv: File contains raw scores and final scores from the LLM evaluation of the training scripts (code).
+- Results_Final_log_clean*.csv: File contains raw scores and final scores from the LLM evaluation of the agent's reasoning process (log).
+- Final_step_perfomance*.csv: File contains performance metric scores (e.g., accuracy, disparate impact) of the models on each task.
+
+These files are then used for further analysis.
+
+**Analysis**
+In FairnessBench we run several analyses on our results. Each `.py` file performs a different analysis and generates plots that are stored in the `graphs/` directory.
+To run an analysis, change the input CSV filename in the script to the file required for that analysis.
+**Example:** To analyze different types of fairness for the Adult dataset, run `adult_fairness.py`. Before running it, update the script's input CSV to the new file generated in the `csv_files/` directory.
+
+```bash
+python <analysis_script>.py
+```
+**Key files:**
+- adult_di_code_llmeval.py: Analyzes disparate impact (DI) for the Adult dataset using the LLM code evaluation.
+- adult_fairness.py: Analyzes the fairness metrics used in the benchmark for the Adult dataset.
+- balancing_fairness.py: Analyzes the fairness metrics used in the benchmark for the Balance research problem.
+- code_log_llm_eval.py: Generates plots showing how rubric-section scores vary across tasks and datasets for both code and log evaluations.
+- correlation_flake8_code.py: Analyzes the correlation between LLM code-evaluation scores and Flake8 (linter-based) scores.
+- cv_scores_evalmodels.py: Analyzes the consistency of different evaluation models based on cross-validation (CV) scores.
+- di_across_datasets.py: Analyzes disparate impact (DI) and accuracy across datasets and research problems.
+- performance_flake8_code.py: Generates a scatter plot of Flake8 scores vs. LLM code-evaluation scores across models, datasets, and research problems.
+- target_selection.py: Analyzes model performance and fairness metrics for the target-selection tasks.
+- target10_sucess_rate.py: Generates a plot showing how agent performance differs from the baseline on the Target10 research problem, by dataset and model.
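+
+### Example `path.py` (illustrative)
+A minimal sketch of the local `path.py` that the analysis scripts import (`from path import CSV_FILES,GRAPHS`), using `pathlib.Path` so the `CSV_FILES/'file.csv'` pattern in the scripts works. The directory locations below are placeholders, not real paths; point them at the matching folders on your machine.
+```python
+# path.py -- local-only configuration (git-ignored); all paths below are placeholders
+from pathlib import Path
+
+PROJECT_ROOT = Path('/absolute/path/to/raw_results')   # raw result JSON files read by explode_results.py
+CSV_FILES = Path('/absolute/path/to/csv_files')        # cleaned CSVs written by explode_results.py
+GRAPHS = Path('/absolute/path/to/graphs')              # figures saved by the analysis scripts
+FILES = Path('/absolute/path/to/files')                # per-evaluation-model CV CSVs for cv_scores_evalmodels.py
+```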
+ + diff --git a/fairnessbench_analysis/target10_sucess_rate.py b/fairnessbench_analysis/target10_sucess_rate.py new file mode 100644 index 0000000..6bd9347 --- /dev/null +++ b/fairnessbench_analysis/target10_sucess_rate.py @@ -0,0 +1,106 @@ +import os +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import seaborn as sns +from path import CSV_FILES,GRAPHS + +# Loading useful dataframes +perf_df = pd.read_csv(CSV_FILES/'Final_step_perfomance2025-09-18T00:48:26.025263.csv') + +# Removing missing rows +perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', + 'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio', + 'false_omission_rate_diff'] +perf_df= perf_df.dropna(subset=perf, how='all') +perf_df = perf_df.replace([np.inf, -np.inf], np.nan).fillna(0) + +# loading baseline + +baseline_df = pd.read_csv(CSV_FILES/'Baseline_cleaned_perfomance2025-09-18T00:48:53.537033.csv') + +base= ['baseline_acc','baseline_precision', 'baseline_recall', 'baseline_di', 'baseline_statistical_parity_diff', + 'baseline_equal_opp_diff', 'baseline_error_rate_diff', 'baseline_error_rate_ratio', + 'baseline_false_omission_rate_diff'] +baseline_df= baseline_df.dropna(subset=base, how='all') +baseline_df= baseline_df.fillna(0) +baseline_df= baseline_df.drop(columns=['run_ts','run_id','baseline_score_count']) + +# merging both dfs +merged_results= perf_df.merge(baseline_df, how='left',on=['task']) +merged_results= merged_results.dropna(how='any') +# rearranging cols +task_data_metric = merged_results['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +task_10 = task_data_metric['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resch_prob',1:'dem'}) +wider = pd.concat([task_10, task_data_metric],axis=1) +col= ['task_dataset','task_metric','resch_prob','dem'] +wider=wider[col] +clean_df = pd.concat([merged_results, wider], axis=1) + +columns=['model','task','task_dataset','task_metric','resch_prob','dem','run_ts','run_id', +'acc','baseline_acc','precision','baseline_precision','recall', +'baseline_recall','di','baseline_di','statistical_parity_diff','baseline_statistical_parity_diff','equal_opp_diff', +'baseline_equal_opp_diff','error_rate_diff','baseline_error_rate_diff','error_rate_ratio','baseline_error_rate_ratio', +'false_omission_rate_diff','baseline_false_omission_rate_diff','score_count'] +clean_df=clean_df[columns] + +# filtering target10 task +df= clean_df[clean_df['resch_prob'] == 'target10'] + +metric_map = { + 'acc': 'acc', + 'di': 'di', + 'spd': 'statistical_parity_diff', + 'eod': 'equal_opp_diff', + 'err' : 'error_rate_ratio', + 'erd' : 'error_rate_diff', + 'ford': 'false_omission_rate_diff', +} + +metric_best = { + 'acc': 1, + 'di': 1, + 'spd': 0, + 'eod': 0, + 'err' :1, + 'erd' : 0, + 'ford': 0, +} + +# subtract diff directions so that + is improvement and - is worse in result +metric_best_fx = { + 'acc': lambda r: r['task_metric_value'] - r['task_metric_value_baseline'], + 'di': lambda r: abs(1- r['task_metric_value_baseline'])- abs(1- r['task_metric_value']), + 'spd': lambda r: r['task_metric_value_baseline'] - r['task_metric_value'], + 'eod': lambda r:r['task_metric_value_baseline'] - r['task_metric_value'], + 'err' : lambda r: abs(1- r['task_metric_value_baseline'])- abs(1- r['task_metric_value']), + 'erd' : lambda r:r['task_metric_value_baseline'] - r['task_metric_value'], + 'ford': lambda r:r['task_metric_value_baseline'] - r['task_metric_value'], +} + +imp_text = {True:'improvement', 
False:'no improvement'}
+# label whether the agent improved on the baseline; named impact_label so the improvement() agg helper below does not shadow it
+def impact_label(r):
+    return imp_text[r['agent-improvement']>0]
+
+df.loc[:,'task_metric_value'] = df.apply(lambda r: r[metric_map[r['task_metric']]],axis=1)
+df.loc[:,'task_metric_value_baseline'] = df.apply(lambda r: r['baseline_'+metric_map[r['task_metric']]],axis=1)
+df.loc[:,'agent-baseline'] = df.loc[:,'task_metric_value'] - df.loc[:,'task_metric_value_baseline']
+df.loc[:,'agent-improvement'] = df.apply(lambda r: metric_best_fx[r['task_metric']](r),axis=1)
+df.loc[:,'agent-impact'] = df.apply(impact_label,axis=1)
+
+def success(s):
+    return sum(s>.1)
+
+def total(s):
+    return len(s)
+
+def improvement(s):
+    return sum(s>0)
+
+df_improvement_stats = df.groupby(['model','task_dataset'])['agent-improvement'].agg(['mean',success,total,improvement]).reset_index()
+df_improvement_stats_tall = df_improvement_stats.melt(id_vars=['model','task_dataset'],
+                                                      value_vars=['total','success','improvement'],var_name='count_type',value_name='count')
+sns.set_context(context='poster',font_scale = .5)
+output= os.path.join(GRAPHS,'target10_success.png')
+sns.catplot(df_improvement_stats_tall, col = 'model',x='task_dataset', y='count',hue='count_type',kind='bar').savefig(output)
+
diff --git a/fairnessbench_analysis/target_selection.py b/fairnessbench_analysis/target_selection.py
new file mode 100644
index 0000000..45e7d15
--- /dev/null
+++ b/fairnessbench_analysis/target_selection.py
@@ -0,0 +1,61 @@
+import os
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+from path import CSV_FILES,GRAPHS
+
+# Loading useful dataframes
+file = CSV_FILES/'Final_step_perfomance2025-09-18T00:48:26.025263.csv'
+res = pd.read_csv(file)
+
+task_data_metric = res['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'})
+wider_code = pd.concat([res, task_data_metric],axis=1)
+wider_cols=['model','task','task_dataset','task_metric','task-dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','score_count']
+wider_code = wider_code[wider_cols]
+
+wider_adrecon = wider_code[wider_code['task_dataset']=='adrecon'].reset_index(drop=True)
+
+# % of runs (per model) that actually produce a final result with all metrics
+allmetrics = ['acc', 'precision', 'recall', 'di', 'statistical_parity_diff', 'equal_opp_diff',
+              'error_rate_diff', 'error_rate_ratio', 'false_omission_rate_diff']
+
+wider_adrecon['all_metric_vals'] = wider_adrecon[allmetrics].notna().all(axis=1)
+
+res = wider_adrecon.groupby('model')['all_metric_vals'].mean() * 100
+print(res.round(2))  # completion rate per model
+wider_adrecon = wider_adrecon.dropna()
+
+# regular performance of final models
+wider_adrecon = wider_adrecon.rename(columns={'statistical_parity_diff': 'spd', 'equal_opp_diff': 'eod',
+                                              'error_rate_diff': 'erd', 'error_rate_ratio': 'err',
+                                              'false_omission_rate_diff': 'ford', 'precision': 'p',
+                                              'recall': 'r'})
+
+allmet = ['acc', 'p', 'r', 'di', 'spd', 'eod', 'erd', 'err', 'ford']
+adrec_res = (wider_adrecon.groupby(['model','task-dem'])[allmet].mean()).reset_index()
+adrec_task_dem= adrec_res['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'})
+adrec_res=pd.concat([adrec_res,adrec_task_dem],axis=1)
+adrec_res=adrec_res.replace('claude-3-7-sonnet-20250219','claude-3-7-sonnet')
+
+adrec_long = pd.melt(adrec_res, id_vars=['model', 'task-dem', 'resrch_prob', 'dem'],
value_vars= allmet, var_name='task_metric', value_name='task_metric_value') +adrec_long['model-dem'] = adrec_long['model'] + '-' + adrec_long['dem'] +adrec_long = adrec_long.rename(columns={'task_metric': 'task_metrics'}) + +high_good = ['acc', 'p', 'r', 'di','err'] # +low_good = ['spd', 'eod', 'erd', 'ford'] + +metric_rename = {} + +for m in high_good: + metric_rename[m] = f"{m} ↑" + +for m in low_good: + metric_rename[m] = f"{m} ↓" + +adrec_long['task_metric'] = adrec_long['task_metrics'].map(metric_rename) +g = sns.catplot(data=adrec_long,kind='bar',x='task_metric',y='task_metric_value',hue='model',col='dem',height=4,aspect=1.5) + +output= os.path.join(GRAPHS,"adrec_allmetric.png") +plt.savefig(output,dpi=400,bbox_inches='tight') \ No newline at end of file