From 94c11b04fabaefe2cdc51f17e00256d28af2cf8a Mon Sep 17 00:00:00 2001 From: Ritta Neg mfa Date: Wed, 6 Aug 2025 08:06:02 +0000 Subject: [PATCH 1/5] eval anylysis files --- fairnessbench_analysis/Deepseek_cv.csv | 3 + fairnessbench_analysis/Gemma_cv.csv | 8 + fairnessbench_analysis/Granite_cv.csv | 3 + .../adult_di_code_llmeval.py | 53 ++++++ fairnessbench_analysis/adult_fairness.py | 58 +++++++ fairnessbench_analysis/balancing_fairness.py | 57 +++++++ fairnessbench_analysis/code_log_llm_eval.py | 46 ++++++ .../correlation_flake8_code.py | 51 ++++++ .../cv_scores_evalmodels.py | 17 ++ fairnessbench_analysis/di_across_datasets.py | 41 +++++ fairnessbench_analysis/explode_results.py | 154 ++++++++++++++++++ .../performance_flake8_code.py | 38 +++++ .../target10_sucess_rate.py | 101 ++++++++++++ fairnessbench_analysis/target_selection.py | 54 ++++++ 14 files changed, 684 insertions(+) create mode 100644 fairnessbench_analysis/Deepseek_cv.csv create mode 100644 fairnessbench_analysis/Gemma_cv.csv create mode 100644 fairnessbench_analysis/Granite_cv.csv create mode 100644 fairnessbench_analysis/adult_di_code_llmeval.py create mode 100644 fairnessbench_analysis/adult_fairness.py create mode 100644 fairnessbench_analysis/balancing_fairness.py create mode 100644 fairnessbench_analysis/code_log_llm_eval.py create mode 100644 fairnessbench_analysis/correlation_flake8_code.py create mode 100644 fairnessbench_analysis/cv_scores_evalmodels.py create mode 100644 fairnessbench_analysis/di_across_datasets.py create mode 100644 fairnessbench_analysis/explode_results.py create mode 100644 fairnessbench_analysis/performance_flake8_code.py create mode 100644 fairnessbench_analysis/target10_sucess_rate.py create mode 100644 fairnessbench_analysis/target_selection.py diff --git a/fairnessbench_analysis/Deepseek_cv.csv b/fairnessbench_analysis/Deepseek_cv.csv new file mode 100644 index 0000000..22fefe5 --- /dev/null +++ b/fairnessbench_analysis/Deepseek_cv.csv @@ -0,0 +1,3 @@ +model,task,1. Data Collection and Processing,2. Bias Detection and Mitigation,3. Fairness Metric Selection,4. Model Selection and Training,5. Evaluation and Testing +claude-3-7-sonnet-20250219,adult_err_balance-race,1.1035347137172349,0.2793492449300636,0.1818753236896995,0.15465752264217267,0.23111601141098256 +qwen,adult_err_balance-race,0.0,0.29632197530102916,0.3853887243714261,0.4880266796833574,0.34656317045517515 diff --git a/fairnessbench_analysis/Gemma_cv.csv b/fairnessbench_analysis/Gemma_cv.csv new file mode 100644 index 0000000..f9f85a4 --- /dev/null +++ b/fairnessbench_analysis/Gemma_cv.csv @@ -0,0 +1,8 @@ +model,task,1. Data Collection and Processing,2. Bias Detection and Mitigation,3. Fairness Metric Selection,4. Model Selection and Training,5. 
Evaluation and Testing +claude-3-7-sonnet-20250219,adult_err_balance-race,0.15946509441945983,0.11649810905340587,0.0,0.15946509441945983,0.0 +claude-3-7-sonnet-20250219,adult_err_balance-sex,0.17494570236436235,0.0,0.0,0.1957400731715678,0.0 +gpt-4o,adult_err_balance-race,0.0,0.0,0.0,0.0,0.0 +gpt-4o,adult_err_balance-sex,0.0,0.0,0.0,0.0,0.0 +llama,adult_err_balance-race,0.20573779994945587,0.19716158838352976,0.0,0.12297509238026914,0.0 +llama,adult_err_balance-sex,0.3043212760213842,0.12426253043692712,0.0,0.16886551261045996,0.0 +qwen,adult_err_balance-sex,0.2138089935299395,0.19716158838352973,0.0,0.19716158838352973,0.0 diff --git a/fairnessbench_analysis/Granite_cv.csv b/fairnessbench_analysis/Granite_cv.csv new file mode 100644 index 0000000..e0023d9 --- /dev/null +++ b/fairnessbench_analysis/Granite_cv.csv @@ -0,0 +1,3 @@ +model,task,1. Data Collection and Processing,2. Bias Detection and Mitigation,3. Fairness Metric Selection,4. Model Selection and Training,5. Evaluation and Testing +claude-3-7-sonnet-20250219,adult_err_balance-race,0.04932502891543654,0.09584211726899525,0.14691056734678462,0.0,0.04844009143018392 +qwen,adult_err_balance-race,0.18672359914948844,1.282654434033444,0.2803402154503214,0.062112999374994156,0.28247912462432095 diff --git a/fairnessbench_analysis/adult_di_code_llmeval.py b/fairnessbench_analysis/adult_di_code_llmeval.py new file mode 100644 index 0000000..96ffbce --- /dev/null +++ b/fairnessbench_analysis/adult_di_code_llmeval.py @@ -0,0 +1,53 @@ +import pandas as pd +import numpy as np +from datetime import datetime +import matplotlib.pyplot as plt +import seaborn as sns + +# Loading useful dataframes +code_eval = pd.read_csv('../fairnessBench/fairnessbench_analysis/Result_Final_code_clean2025-08-06T04:22:08.635847.csv') +# Removing missing rows fairnessBench +code_eval= code_eval.dropna(how="any") + +task_data_metric = code_eval['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +task_data_dem = task_data_metric['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'}) +wider_code = pd.concat([code_eval, task_data_metric,task_data_dem],axis=1) +wider_cols=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id','total_llm_score',"1. Data Collection and Processing","2. Bias Detection and Mitigation","3. Fairness Metric Selection","4. Model Selection and Training","5. Evaluation and Testing"] +wider_code = wider_code[wider_cols] + +# filtering the adult dataset and the di task_metric +adult= wider_code[wider_code['task_dataset']=='adult'] +adult_di=adult[adult['task_metric']=='di'] + +long_df = adult_di.melt( + id_vars=['model','task_dataset','task_metric','resrch_prob','dem'], + value_vars=[ + '1. Data Collection and Processing', + '2. Bias Detection and Mitigation', + '3. Fairness Metric Selection', + '4. Model Selection and Training', + '5. 
Evaluation and Testing' + ], + var_name='rubric_section', + value_name='score' +) + +sns.set_context(context='poster',font_scale=1.0) +plt.figsize=(16,12) +m=sns.catplot( + data=long_df, + x="rubric_section", + y="score", + hue="model", + col="resrch_prob", + row='dem', + kind="bar", + aspect=2 +) +m.set_titles(template='{col_var}: {col_name}\n{row_var}: {row_name}') +ax=m.axes +ax = m.axes +for ax in m.axes.flatten(): + plt.setp(ax.get_xticklabels(), rotation=30) + ax.axhline(y=4.0, color='black', linestyle='-.', alpha=0.3) +plt.savefig('adult_di_code_llm_eval.png', dpi=400 , bbox_inches='tight') \ No newline at end of file diff --git a/fairnessbench_analysis/adult_fairness.py b/fairnessbench_analysis/adult_fairness.py new file mode 100644 index 0000000..bef5c41 --- /dev/null +++ b/fairnessbench_analysis/adult_fairness.py @@ -0,0 +1,58 @@ +import pandas as pd +import numpy as np +from datetime import datetime +import matplotlib.pyplot as plt +import seaborn as sns + +# Loading useful dataframes +perf_alt = pd.read_csv('../fairnessBench/fairnessbench_analysis/Final_step_perfomance2025-08-06T04:21:57.255454.csv') +# Removing missing rows +perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', + 'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio', + 'false_omission_rate_diff'] +perf_alt= perf_alt.dropna(subset=perf, how='all') +perf_alt = perf_alt.replace([np.inf, -np.inf], np.nan).fillna(0) + +# expanding the task to sub columns +task_data_metric = perf_alt['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +wider_code = pd.concat([perf_alt, task_data_metric],axis=1) +wider_cols=['model','task','task_dataset','task_metric','task-dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','score_count'] +wider_code = wider_code[wider_cols] + +# Filtering only adult dataset from the dataframe +wider_adult = wider_code[wider_code['task_dataset']=='adult'] +fairness_metrics= ['di','error_rate_ratio','statistical_parity_diff','equal_opp_diff','error_rate_diff','false_omission_rate_diff'] +wider_ADULT = ( + wider_adult.groupby(['model','task-dem','task_metric'])[fairness_metrics].mean() +).reset_index() +ad_df= wider_ADULT['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'}) +wider_ADULT=pd.concat([wider_ADULT,ad_df],axis=1) +wider_ADULT=wider_ADULT.replace('claude-3-7-sonnet-20250219','claude-3-7-sonnet') + +metric_map = { + 'acc': 'acc', + 'di': 'di', + 'spd': 'statistical_parity_diff', + 'eod': 'equal_opp_diff', + 'err' : 'error_rate_ratio', + 'erd' : 'error_rate_diff', + 'ford': 'false_omission_rate_diff', +} +wider_ADULT.loc[:, 'task_metric_value'] = wider_ADULT.apply(lambda row: row[metric_map[row['task_metric']]], axis=1) + +sns.set_context(context='poster',font_scale=1.0) +g=sns.catplot(data=wider_ADULT,x='resrch_prob',y='task_metric_value',hue='dem',row='task_metric',col='model',kind='bar' + ,aspect=1) + +g.set_titles(template='{col_var}: {col_name}\n{row_var}: {row_name}') +# adding horizontal lines at di=1 to each cell plot +ax=g.axes +for i in range(4): + ax[0,i].axhline(y=1.0, color='black', linestyle='-.', alpha=0.3) + ax[1,i].axhline(y=0.0, color='black', linestyle='-.', alpha=0.6) + ax[2,i].axhline(y=0.0, color='black', linestyle='-.', alpha=0.6) + ax[3,i].axhline(y=1.0, color='black', linestyle='-.', alpha=0.3) + ax[4,i].axhline(y=0.0, color='black', linestyle='-.', 
alpha=0.6) + ax[5,i].axhline(y=0.0, color='black', linestyle='-.', alpha=0.6) + +plt.savefig('adult_fairness.png',dpi=300, bbox_inches='tight') diff --git a/fairnessbench_analysis/balancing_fairness.py b/fairnessbench_analysis/balancing_fairness.py new file mode 100644 index 0000000..d77f134 --- /dev/null +++ b/fairnessbench_analysis/balancing_fairness.py @@ -0,0 +1,57 @@ +import pandas as pd +import numpy as np +from datetime import datetime +import matplotlib.pyplot as plt +import seaborn as sns + +# Loading useful dataframes +perf_alt = pd.read_csv('../fairnessBench/fairnessbench_analysis/Final_step_perfomance2025-08-06T04:21:57.255454.csv') +# Removing missing rows +perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', + 'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio', + 'false_omission_rate_diff'] +perf_alt= perf_alt.dropna(subset=perf, how='all') +perf_alt = perf_alt.replace([np.inf, -np.inf], np.nan).fillna(0) + +# expanding the task to sub columns +task_data_metric = perf_alt['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +wider_code = pd.concat([perf_alt, task_data_metric],axis=1) +wider_cols=['model','task','task_dataset','task_metric','task-dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','score_count'] +wider_code = wider_code[wider_cols] + +task_task_dem = wider_code['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'rsch_prob',1:'dem'}) +wider = pd.concat([wider_code, task_task_dem],axis=1) +cols= ['model','task','task_dataset','task_metric','task-dem','rsch_prob','dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','score_count'] +wider=wider[cols] + +# Filtering only balance task from the dataframe +wider_balance = wider[wider['rsch_prob']=='balance'] +wider_balance=wider_balance.replace('claude-3-7-sonnet-20250219','claude-3-7-sonnet') + +wider_balance= wider_balance.copy() +metric_map = { + 'acc': 'acc', + 'di': 'di', + 'spd': 'statistical_parity_diff', + 'eod': 'equal_opp_diff', + 'err' : 'error_rate_ratio', + 'erd' : 'error_rate_diff', + 'ford': 'false_omission_rate_diff', +} +wider_balance.loc[:, 'task_metric_value'] = wider_balance.apply(lambda row: row[metric_map[row['task_metric']]], axis=1) + +sns.set_context(context='poster',font_scale=1.0) +g=sns.relplot(data=wider_balance,x='acc',y='task_metric_value',hue='task_dataset',style='dem',row='task_metric',col='model',kind='scatter', + aspect=1) +g.set_titles(template='{col_var}: {col_name}\n{row_var}: {row_name}') + +# add horizontal lines at di=1 for the first row and a vertical line for acc=1 +ax=g.axes +for i in range(len(ax)): + for j in range(len(ax[0])): + if i in [0,3]: # first row (di) + ax[i,j].axhline(y=1.0, color='black', linestyle='-.', alpha=0.3) + elif i in [1, 2, 4, 5]: # other fairness metrics + ax[i,j].axhline(y=0.0, color='black', linestyle='-.', alpha=0.3) + ax[i,j].axvline(x=1.0, color='black', linestyle='-.', alpha=0.2) +plt.savefig('balancing_fairness.png',dpi=400,bbox_inches='tight') \ No newline at end of file diff --git a/fairnessbench_analysis/code_log_llm_eval.py b/fairnessbench_analysis/code_log_llm_eval.py new file mode 100644 index 0000000..64618f6 --- /dev/null +++ b/fairnessbench_analysis/code_log_llm_eval.py @@ -0,0 +1,46 @@ +import pandas as pd +import numpy as np +from 
datetime import datetime
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+# Loading useful dataframes
+code_eval = pd.read_csv('../fairnessBench/fairnessbench_analysis/Result_Final_code_clean2025-08-06T04:22:08.635847.csv')
+log_eval = pd.read_csv('../fairnessBench/fairnessbench_analysis/Results_Final_log_clean2025-08-06T04:22:17.377479.csv')
+perf_df= pd.read_csv('../fairnessBench/fairnessbench_analysis/Final_step_perfomance2025-08-06T04:21:57.255454.csv')
+
+# Removing missing rows
+code_eval= code_eval.dropna(how="any")
+code_eval = code_eval.fillna(0)
+log_eval= log_eval.dropna(how='any')
+perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff',
+       'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio',
+       'false_omission_rate_diff']
+perf_df= perf_df.dropna(subset=perf, how='all')
+perf_df = perf_df.fillna(0)
+
+task_data_metric = code_eval['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'})
+task_data_dem = task_data_metric['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'})
+wider_code = pd.concat([code_eval, task_data_metric,task_data_dem],axis=1)
+wider_cols=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id','final_flake8_score',"1. Data Collection and Processing","2. Bias Detection and Mitigation","3. Fairness Metric Selection","4. Model Selection and Training","5. Evaluation and Testing"]
+wider_code = wider_code[wider_cols]
+
+score_cols = ["1. Data Collection and Processing","2. Bias Detection and Mitigation","3. Fairness Metric Selection","4. Model Selection and Training","5. Evaluation and Testing"]
+code_tall = wider_code.melt(id_vars=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id'],
+                            value_vars=score_cols,var_name='score')
+sns.catplot(code_tall,col='model',row='resrch_prob',x= 'task_dataset',y='value',hue='score',kind='bar').savefig('codeval')
+
+
+# log eval
+task_data_metric = log_eval['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'})
+task_data_dem = task_data_metric['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'})
+wider_log = pd.concat([log_eval, task_data_metric,task_data_dem],axis=1)
+wider_cols=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id',"1. Model Overview", "2. Stakeholder Identification and Fairness Definition","3. Data Collection and Processing","4. Bias Detection and Mitigation","5. Fairness Metric Selection","6. Model Selection and Training","7. Evaluation and Testing"]
+wider_log = wider_log[wider_cols]
+wider_log.head()
+
+score_cols = ["1. Model Overview", "2. Stakeholder Identification and Fairness Definition","3. Data Collection and Processing","4. Bias Detection and Mitigation","5. Fairness Metric Selection","6. Model Selection and Training","7. Evaluation and Testing"]
+log_tall = wider_log.melt(id_vars=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id'],
+                          value_vars=score_cols,var_name='score')
+sns.catplot(log_tall,col='model',x='resrch_prob',row= 'task_dataset',y='value',hue='score',kind='bar').savefig('logval')
+
diff --git a/fairnessbench_analysis/correlation_flake8_code.py b/fairnessbench_analysis/correlation_flake8_code.py
new file mode 100644
index 0000000..c8c15fa
--- /dev/null
+++ b/fairnessbench_analysis/correlation_flake8_code.py
@@ -0,0 +1,51 @@
+import pandas as pd
+import numpy as np
+from datetime import datetime
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+# Loading useful dataframes
+code_eval = pd.read_csv('../fairnessBench/fairnessbench_analysis/Result_Final_code_clean2025-08-06T04:22:08.635847.csv')
+# Removing missing rows
+code_eval= code_eval.dropna(how="any")
+
+task_data_metric = code_eval['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'})
+task_data_dem = task_data_metric['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'})
+wider_code = pd.concat([code_eval, task_data_metric,task_data_dem],axis=1)
+wider_cols=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id','total_llm_score',"1. Data Collection and Processing","2. Bias Detection and Mitigation","3. Fairness Metric Selection","4. Model Selection and Training","5. Evaluation and Testing",'final_flake8_score']
+wider_code = wider_code[wider_cols]
+
+# Correlation between flake8 and code llm eval on claude_adult_di_erd task
+code_cols=['1. Data Collection and Processing','2. Bias Detection and Mitigation','3. Fairness Metric Selection','4. Model Selection and Training', '5. 
Evaluation and Testing'] +group_cols = ["model", "task_dataset", "resrch_prob", "task_metric"] # Add 'task_dem' if needed + +def flake8_corr_matrix(group): + # Compute correlation between flake8_score and each rubric section + corrs = [group["final_flake8_score"].corr(group[rubric]) for rubric in code_cols] + return pd.Series(corrs, index=code_cols) + +corrs = ( + wider_code.groupby(group_cols) + .apply(flake8_corr_matrix) + .reset_index() +) +corrs=corrs.fillna(0) + +group_filter = ( + (corrs['model'] == 'claude-3-7-sonnet-20250219') & + (corrs['task_dataset'] == 'adult') & + (corrs['resrch_prob'] == 'balance') & + (corrs['task_metric'] == 'erd') +) +corr_row = corrs.loc[group_filter, code_cols] + +plt.figure(figsize=(8, 2)) +sns.heatmap( + corr_row.values.reshape(1, -1), + annot=True, + cmap='coolwarm', + xticklabels=code_cols, + yticklabels=['Flake8 score'] +) +plt.title("Flake8 vs Rubric Correlation (claude-3, adult, balance, erd)") +plt.savefig('flake8_vs_code_correlation.png',bbox_inches='tight') \ No newline at end of file diff --git a/fairnessbench_analysis/cv_scores_evalmodels.py b/fairnessbench_analysis/cv_scores_evalmodels.py new file mode 100644 index 0000000..7d2979b --- /dev/null +++ b/fairnessbench_analysis/cv_scores_evalmodels.py @@ -0,0 +1,17 @@ +import pandas as pd + +gemma_df = pd.read_csv('../fairnessBench/fairnessbench_analysis/Gemma_cv.csv') +deepseek_df= pd.read_csv('../fairnessBench/fairnessbench_analysis/Deepseek_cv.csv') +granite_df=pd.read_csv('../fairnessBench/fairnessbench_analysis/Granite_cv.csv') + +gemma_df['eval'] = 'gemma' +deepseek_df['eval'] = 'deepseek' +granite_df['eval'] = 'granite' + +cols = ['eval', 'model', 'task'] + [c for c in gemma_df.columns if c not in ['eval', 'model', 'task']] +gemma_df = gemma_df[cols] +deepseek_df = deepseek_df[cols] +granite_df = granite_df[cols] + +all_eval_cv = pd.concat([gemma_df, deepseek_df, granite_df], axis=0, ignore_index=True) +all_eval_cv.to_csv('cv_scores_evalmodel.csv',index=False) \ No newline at end of file diff --git a/fairnessbench_analysis/di_across_datasets.py b/fairnessbench_analysis/di_across_datasets.py new file mode 100644 index 0000000..5226b39 --- /dev/null +++ b/fairnessbench_analysis/di_across_datasets.py @@ -0,0 +1,41 @@ +import pandas as pd +import numpy as np +from datetime import datetime +import matplotlib.pyplot as plt +import seaborn as sns + +# Loading useful dataframes +perf_alt = pd.read_csv('../fairnessBench/fairnessbench_analysis/Final_step_perfomance2025-08-06T04:21:57.255454.csv') +# Removing missing rows +perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', + 'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio', + 'false_omission_rate_diff'] +perf_alt= perf_alt.dropna(subset=perf, how='all') +perf_alt = perf_alt.replace([np.inf, -np.inf], np.nan).fillna(0) + +# expanding the task to sub columns +task_data_metric = perf_alt['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +wider_code = pd.concat([perf_alt, task_data_metric],axis=1) +wider_cols=['model','task','task_dataset','task_metric','task-dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','score_count'] +wider_code = wider_code[wider_cols] + +# Filtering only DI from the dataframe +wider_di = wider_code[wider_code['task_metric']=='di'] +wider_DI = ( + wider_di.groupby(['task_dataset','task-dem'])[['di','acc']].mean().reset_index() +) +dem_df= 
wider_di['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'}) +wider_DI=pd.concat([wider_di,dem_df],axis=1) +wider_DI=wider_DI.replace('claude-3-7-sonnet-20250219','claude-3-7-sonnet') + +# ploting the scatter plot for di vs acc +sns.set_context(context='poster',font_scale=0.8) +g=sns.relplot(data=wider_DI,x='acc',y='di',hue='task_dataset',row='resrch_prob',style='dem',col='model',kind='scatter',aspect=1) +g.set_titles(template='{col_var}: {col_name}\n{row_var}: {row_name}') + +# adding horizontal lines at di=1 to each cell plot +for i, ax in enumerate(g.axes.flat): + ax.axhline(y=1.0, color='black', linestyle='-.', alpha=0.2) + ax.axvline(x=1.0, color='black', linestyle='-.', alpha=0.2) +# saving the plot +plt.savefig('di_vs_acc_scatter.png', dpi=300, bbox_inches='tight') \ No newline at end of file diff --git a/fairnessbench_analysis/explode_results.py b/fairnessbench_analysis/explode_results.py new file mode 100644 index 0000000..ee328d5 --- /dev/null +++ b/fairnessbench_analysis/explode_results.py @@ -0,0 +1,154 @@ +import os +import pandas as pd +import numpy as np +from datetime import datetime +import json + +# loading the performance results +perf_path = '../results_manually_combined' +result_files = [ + os.path.join(perf_path, fname) + for fname in os.listdir(perf_path) + if os.path.isfile(os.path.join(perf_path, fname)) +] +result_list = [] +for rf in result_files: + try: + if os.path.getsize(rf) == 0: + print(f"Skipping empty file: {rf}") + continue + df = pd.read_json(rf).T + result_list.append(df) + except Exception as e: + print(f"Skipping file {rf} due to error: {e}") +performance_df = pd.concat(result_list) + +end_series = lambda s: pd.Series(s[-5:]) +model_run = performance_df['path'].str.split('/').apply(end_series).rename(columns = + {i:c for i,c in enumerate(['model','task','run_ts'])}) + +model_run['run_id']= model_run.groupby(['model','task']).cumcount() +mr_keep = ['model','task','run_ts','run_id'] + +# extracting the performance scores for the results to save in a csv file +exploded_score = performance_df['final_score'].apply(pd.Series).reset_index().drop(columns=[0]) +exploded_score['score_count'] = exploded_score.groupby('index').cumcount() +sp = exploded_score['index'].str.split('/').apply(end_series) +sp = sp.rename(columns={i: c for i, c in enumerate([ 'model', 'task', 'run_ts'])}) + +exploded_score = exploded_score.join(sp[['model', 'task', 'run_ts']]) + +exploded_score['run_id'] = exploded_score.groupby(['model', 'task']).cumcount() + +cols = ['model', 'task', 'run_ts','run_id'] + [col for col in exploded_score.columns if col not in ['model', 'task', 'run_ts', 'run_id']] +exploded_score = exploded_score[cols] +exploded_score = exploded_score.drop(exploded_score.columns[4],axis=1) +exploded_score.to_csv('Final_step_perfomance' + datetime.isoformat(datetime.now()) +'.csv',index=False) + + +# loading llm eval results +result_path = '../results_final_total' +result_files = [ + os.path.join(result_path, fname) + for fname in os.listdir(result_path) + if os.path.isfile(os.path.join(result_path, fname)) +] +result_list = [] +for rf in result_files: + try: + if os.path.getsize(rf) == 0: + print(f"Skipping empty file: {rf}") + continue + df = pd.read_json(rf).T + result_list.append(df) + except Exception as e: + print(f"Skipping file {rf} due to error: {e}") +result_df = pd.concat(result_list) + +end_series = lambda s: pd.Series(s[-5:]) +model_run = result_df['path'].str.split('/').apply(end_series).rename(columns = + {i:c for i,c in 
enumerate(['model','task','run_ts'])}) + +model_run['run_id']= model_run.groupby(['model','task']).cumcount() +mr_keep = ['model','task','run_ts','run_id'] +# extracting llm code evaluation +raw_df= result_df[["final_llm_score"]].explode('final_llm_score',)['final_llm_score'].apply(pd.Series).reset_index().drop(columns=[0]) +exp_code= raw_df["raw_scores"].apply(pd.Series).drop(columns=[0]) +exp_code = raw_df.join(raw_df["raw_scores"].apply(pd.Series)).drop(columns= ['raw_scores', 'justifications', 'subtotals',0]) +splits = exp_code['index'].str.split('/').apply(end_series) +splits = splits.rename(columns={i: c for i, c in enumerate([ 'model', 'task', 'run_ts'])}) + +exp_code = exp_code.join(splits[['model', 'task', 'run_ts']]) + +exp_code['run_id'] = exp_code.groupby(['model', 'task']).cumcount() + +cols = ['model', 'task', 'run_ts','run_id'] + [col for col in exp_code.columns if col not in ['model', 'task', 'run_ts', 'run_id']] +exp_code = exp_code[cols] +exp_code = exp_code.drop(exp_code.columns[4],axis=1) + +# adding flake8 results to the code llm eval df +flake8_df = result_df[['path', 'final_flake8_score']].copy() +sps = flake8_df['path'].str.split('/').apply(end_series) +sps = sps.rename(columns={i: c for i, c in enumerate([ 'model', 'task', 'run_ts'])}) +flake8_df = flake8_df.join(sps[['model', 'task', 'run_ts']]) + +# merging both dfs +exp_code = exp_code.merge( + flake8_df[['model', 'task', 'run_ts', 'final_flake8_score']], + on=['model', 'task', 'run_ts'], + how='left' +) +exp_code.to_csv('Result_Final_code_clean' + datetime.isoformat(datetime.now()) +'.csv',index=False) + +# extracting log llm eval results +raw_log= result_df[["final_log_score"]].explode('final_log_score',)['final_log_score'].apply(pd.Series).reset_index().drop(columns = [0]) +exp_log= raw_log["raw_scores"].apply(pd.Series).drop(columns = [0]) +exp_log = raw_log.join(raw_log["raw_scores"].apply(pd.Series)).drop(columns= ['raw_scores', 'justifications', 'subtotals',0]) +exp_log = exp_log.rename(columns={"total_llm_score":"total_log_score"}) +split = exp_log['index'].str.split('/').apply(end_series) +split = split.rename(columns={i: c for i, c in enumerate([ 'model', 'task', 'run_ts'])}) + +exp_log = exp_log.join(split[['model', 'task', 'run_ts']]) + +exp_log['run_id'] = exp_log.groupby(['model', 'task']).cumcount() + +cols = ['model', 'task', 'run_ts','run_id'] + [col for col in exp_log.columns if col not in ['model', 'task', 'run_ts', 'run_id']] +exp_log = exp_log[cols] +exp_log = exp_log.drop(exp_log.columns[4],axis=1) +#exp_log.to_csv('Results_Final_log_clean' + datetime.isoformat(datetime.now()) +'.csv',index=False) + +# loading baseline results +result_path = '../sanity_results' +result_files = [ + os.path.join(result_path, resjson) + for resjson in os.listdir(result_path) + if os.path.isfile(os.path.join(result_path, resjson)) +] + +result_list = [pd.read_json(rf).T for rf in result_files] +result_df = pd.concat(result_list) + +end_series = lambda s: pd.Series(s[-4:]) +model_run = result_df['path'].str.split('/').apply(end_series).rename(columns = + {i:c for i,c in enumerate(['task','run_ts'])}) + +model_run['run_id']= model_run.groupby(['task','run_ts']).cumcount() +mr_keep = ['task','run_ts','run_id'] + +exploded_score = result_df[['score']].explode('score',)['score'].apply(pd.Series).reset_index().drop(columns = [0]) +exploded_score['score_count'] = exploded_score.groupby('index').cumcount() +sp = exploded_score['index'].str.split('/').apply(end_series) +sp = sp.rename(columns={i: c for i, c in 
enumerate([ 'task', 'run_ts'])}) + +exploded_score = exploded_score.join(sp[['task', 'run_ts']]) + +exploded_score['run_id'] = exploded_score.groupby(['task','run_ts']).cumcount() + +cols = [ 'task', 'run_ts','run_id'] + [col for col in exploded_score.columns if col not in [ 'task', 'run_ts', 'run_id']] +exploded_score = exploded_score[cols] +exploded_score = exploded_score.drop(exploded_score.columns[3],axis=1) +cols_to_prefix = [col for col in exploded_score.columns if col not in ['task', 'run_ts', 'run_id']] +exploded_score = exploded_score.rename( + columns={col: f'baseline_{col}' for col in cols_to_prefix} +) +exploded_score.to_csv('Baseline_cleaned_perfomance' + datetime.isoformat(datetime.now()) +'.csv',index=False) diff --git a/fairnessbench_analysis/performance_flake8_code.py b/fairnessbench_analysis/performance_flake8_code.py new file mode 100644 index 0000000..c142544 --- /dev/null +++ b/fairnessbench_analysis/performance_flake8_code.py @@ -0,0 +1,38 @@ +import pandas as pd +import numpy as np +from datetime import datetime +import matplotlib.pyplot as plt +import seaborn as sns + +# Loading useful dataframes +code_eval = pd.read_csv('../fairnessBench/fairnessbench_analysis/Result_Final_code_clean2025-08-06T04:22:08.635847.csv') +# Removing missing rows +code_eval= code_eval.dropna(how="any") + +task_data_metric = code_eval['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +task_data_dem = task_data_metric['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'}) +wider_code = pd.concat([code_eval, task_data_metric,task_data_dem],axis=1) +wider_cols=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id','total_llm_score',"1. Data Collection and Processing","2. Bias Detection and Mitigation","3. Fairness Metric Selection","4. Model Selection and Training","5. 
Evaluation and Testing",'final_flake8_score'] +wider_code = wider_code[wider_cols] + +sns.set_context(context='poster',font_scale=0.8) +g = sns.relplot(data=wider_code, + x='final_flake8_score', + y='total_llm_score', + col='model', + row='task_dataset', + hue='resrch_prob', + kind='scatter', + alpha=0.7, + height=4, + aspect=1) + +g.set_axis_labels('Flake8 Score', 'LLM Code Score') +g.set_titles(template='{col_var}: {col_name}\n{row_var}: {row_name}') +# add horizontal lines at di=1 to each cell plot +for i, ax in enumerate(g.axes.flat): + ax.axhline(y=85.0, color='black', linestyle='-.', alpha=0.2) + ax.axvline(x=75.0, color='black', linestyle='-.', alpha=0.2) +plt.savefig('performance_flake8_code.png',dpi=300) + + diff --git a/fairnessbench_analysis/target10_sucess_rate.py b/fairnessbench_analysis/target10_sucess_rate.py new file mode 100644 index 0000000..c0117a0 --- /dev/null +++ b/fairnessbench_analysis/target10_sucess_rate.py @@ -0,0 +1,101 @@ +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import seaborn as sns + +perf_df= pd.read_csv('../fairnessBench/fairnessbench_analysis/Final_step_perfomance2025-08-06T04:21:57.255454.csv') + +# Removing missing rows +perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', + 'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio', + 'false_omission_rate_diff'] +perf_df= perf_df.dropna(subset=perf, how='all') +perf_df = perf_df.replace([np.inf, -np.inf], np.nan).fillna(0) + +# loading baseline +baseline_df= pd.read_csv('../fairnessBench/fairnessbench_analysis/Baseline_cleaned_perfomance2025-08-06T06:09:53.582383.csv') + +base= ['baseline_acc','baseline_precision', 'baseline_recall', 'baseline_di', 'baseline_statistical_parity_diff', + 'baseline_equal_opp_diff', 'baseline_error_rate_diff', 'baseline_error_rate_ratio', + 'baseline_false_omission_rate_diff'] +baseline_df= baseline_df.dropna(subset=base, how='all') +baseline_df= baseline_df.fillna(0) +baseline_df= baseline_df.drop(columns=['run_ts','run_id','baseline_score_count']) + +# merging both dfs +merged_results= perf_df.merge(baseline_df, how='left',on=['task']) +merged_results= merged_results.dropna(how='any') +# rearranging cols +task_data_metric = merged_results['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +task_10 = task_data_metric['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resch_prob',1:'dem'}) +wider = pd.concat([task_10, task_data_metric],axis=1) +col= ['task_dataset','task_metric','resch_prob','dem'] +wider=wider[col] +clean_df = pd.concat([merged_results, wider], axis=1) + +columns=['model','task','task_dataset','task_metric','resch_prob','dem','run_ts','run_id', +'acc','baseline_acc','precision','baseline_precision','recall', +'baseline_recall','di','baseline_di','statistical_parity_diff','baseline_statistical_parity_diff','equal_opp_diff', +'baseline_equal_opp_diff','error_rate_diff','baseline_error_rate_diff','error_rate_ratio','baseline_error_rate_ratio', +'false_omission_rate_diff','baseline_false_omission_rate_diff','score_count'] +clean_df=clean_df[columns] + +# filtering target10 task +df= clean_df[clean_df['resch_prob'] == 'target10'] + +metric_map = { + 'acc': 'acc', + 'di': 'di', + 'spd': 'statistical_parity_diff', + 'eod': 'equal_opp_diff', + 'err' : 'error_rate_ratio', + 'erd' : 'error_rate_diff', + 'ford': 'false_omission_rate_diff', +} + +metric_best = { + 'acc': 1, + 'di': 1, + 'spd': 0, + 'eod': 0, + 'err' :1, + 'erd' : 0, + 'ford': 0, +} + +# 
subtract diff directions so that + is improvement and - is worse in result +metric_best_fx = { + 'acc': lambda r: r['task_metric_value'] - r['task_metric_value_baseline'], + 'di': lambda r: abs(1- r['task_metric_value_baseline'])- abs(1- r['task_metric_value']), + 'spd': lambda r: r['task_metric_value_baseline'] - r['task_metric_value'], + 'eod': lambda r:r['task_metric_value_baseline'] - r['task_metric_value'], + 'err' : lambda r: abs(1- r['task_metric_value_baseline'])- abs(1- r['task_metric_value']), + 'erd' : lambda r:r['task_metric_value_baseline'] - r['task_metric_value'], + 'ford': lambda r:r['task_metric_value_baseline'] - r['task_metric_value'], +} + +imp_text = {True:'improvement', False:'no improvement'} +def improvement(r): + return imp_text[r['agent-improvement']>0] + +df.loc[:,'task_metric_value'] = df.apply(lambda r: r[metric_map[r['task_metric']]],axis=1) +df.loc[:,'task_metric_value_baseline'] = df.apply(lambda r: r['baseline_'+metric_map[r['task_metric']]],axis=1) +df.loc[:,'agent-baseline'] = df.loc[:,'task_metric_value'] - df.loc[:,'task_metric_value_baseline'] +df.loc[:,'agent-improvement'] = df.apply(lambda r: metric_best_fx[r['task_metric']](r),axis=1) +df.loc[:,'agent-impact'] = df.apply(improvement,axis=1) + +def success(s): + return sum(s>.1) + +def total(s): + return len(s) + +def improvement(s): + return sum(s>0) + +df_improvement_stats = df.groupby(['model','task_dataset',])['agent-improvement'].agg(['mean',success,total,improvement]).reset_index() +df_improvement_stats_tall = df_improvement_stats.melt(id_vars=['model','task_dataset'], + value_vars=['total','success','improvement'],var_name='count_type',value_name='count') +sns.set_context(context='poster',font_scale = .5) +sns.catplot(df_improvement_stats_tall, col = 'model',x='task_dataset', y='count',hue='count_type',kind='bar').savefig('target10_success.png') + diff --git a/fairnessbench_analysis/target_selection.py b/fairnessbench_analysis/target_selection.py new file mode 100644 index 0000000..16234b5 --- /dev/null +++ b/fairnessbench_analysis/target_selection.py @@ -0,0 +1,54 @@ +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import seaborn as sns + +res = pd.read_csv('../fairnessBench/fairnessbench_analysis/Final_step_perfomance2025-08-06T04:21:57.255454.csv') +task_data_metric = res['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +wider_code = pd.concat([res, task_data_metric],axis=1) +wider_cols=['model','task','task_dataset','task_metric','task-dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','score_count'] +wider_code = wider_code[wider_cols] + +wider_adrecon = wider_code[wider_code['task_dataset']=='adrecon'].reset_index(drop=True) + +# % of times (per model/etc) that actually gets a final result +allmetrics = ['acc', 'precision', 'recall', 'di', 'statistical_parity_diff', 'equal_opp_diff', + 'error_rate_diff', 'error_rate_ratio', 'false_omission_rate_diff'] + +wider_adrecon['all_metric_vals'] = wider_adrecon[allmetrics].notna().all(axis=1) + +res = wider_adrecon.groupby('model')['all_metric_vals'].mean() * 100 +res.round(2) +wider_adrecon = wider_adrecon.dropna() + +# regular performance of final models +wider_adrecon = wider_adrecon.rename(columns={'statistical_parity_diff': 'spd', 'equal_opp_diff': 'eod', + 'error_rate_diff': 'erd', 'error_rate_ratio': 'err', + 'false_omission_rate_diff': 'ford', 
'precision': 'p', + 'recall': 'r'}) + +allmet = ['acc', 'p', 'r', 'di', 'spd', 'eod', 'erd', 'err', 'ford'] +adrec_res = (wider_adrecon.groupby(['model','task-dem'])[allmet].mean()).reset_index() +adrec_task_dem= adrec_res['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'}) +adrec_res=pd.concat([adrec_res,adrec_task_dem],axis=1) +adrec_res=adrec_res.replace('claude-3-7-sonnet-20250219','claude-3-7-sonnet') +allmet = ['acc', 'p', 'r', 'di', 'spd', 'eod', 'erd', 'err', 'ford'] + +adrec_long = pd.melt(adrec_res, id_vars=['model', 'task-dem', 'resrch_prob', 'dem'], value_vars= allmet, var_name='task_metric', value_name='task_metric_value') +adrec_long['model-dem'] = adrec_long['model'] + '-' + adrec_long['dem'] +adrec_long = adrec_long.rename(columns={'task_metric': 'task_metrics'}) + +high_good = ['acc', 'p', 'r', 'di','err'] # +low_good = ['spd', 'eod', 'erd', 'ford'] + +metric_rename = {} + +for m in high_good: + metric_rename[m] = f"{m} ↑" + +for m in low_good: + metric_rename[m] = f"{m} ↓" + +adrec_long['task_metric'] = adrec_long['task_metrics'].map(metric_rename) +g = sns.catplot(data=adrec_long,kind='bar',x='task_metric',y='task_metric_value',hue='model',col='dem',height=4,aspect=1.5) +plt.savefig("adrec_allmetric.png",dpi=400,bbox_inches='tight') \ No newline at end of file From a6694288be49aad6593ba57d5b68eff178a8fd86 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Wed, 13 Aug 2025 14:22:43 +0000 Subject: [PATCH 2/5] Organized how analysis outout charts and where it takes CSV files from --- .gitignore | 3 ++- .../adult_di_code_llmeval.py | 8 +++++++- fairnessbench_analysis/adult_fairness.py | 9 +++++++-- fairnessbench_analysis/balancing_fairness.py | 8 +++++++- fairnessbench_analysis/code_log_llm_eval.py | 12 ++++++++--- .../correlation_flake8_code.py | 8 +++++++- .../{ => csv_files}/Deepseek_cv.csv | 0 .../{ => csv_files}/Gemma_cv.csv | 0 .../{ => csv_files}/Granite_cv.csv | 0 .../cv_scores_evalmodels.py | 9 ++++++--- fairnessbench_analysis/di_across_datasets.py | 8 +++++++- fairnessbench_analysis/explode_results.py | 20 +++++++++++++++---- .../performance_flake8_code.py | 9 ++++++++- .../target10_sucess_rate.py | 10 ++++++++-- fairnessbench_analysis/target_selection.py | 9 ++++++++- 15 files changed, 92 insertions(+), 21 deletions(-) rename fairnessbench_analysis/{ => csv_files}/Deepseek_cv.csv (100%) rename fairnessbench_analysis/{ => csv_files}/Gemma_cv.csv (100%) rename fairnessbench_analysis/{ => csv_files}/Granite_cv.csv (100%) diff --git a/.gitignore b/.gitignore index 22b1bca..8945c09 100644 --- a/.gitignore +++ b/.gitignore @@ -8,7 +8,8 @@ clean.sh *.json .vscode/ fairnessBench/eval/test/ - +fairnessbench_analysis/*/*.png +fairnessbench_analysis/*/*.csv # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/fairnessbench_analysis/adult_di_code_llmeval.py b/fairnessbench_analysis/adult_di_code_llmeval.py index 96ffbce..f28d0ba 100644 --- a/fairnessbench_analysis/adult_di_code_llmeval.py +++ b/fairnessbench_analysis/adult_di_code_llmeval.py @@ -1,3 +1,4 @@ +import os import pandas as pd import numpy as np from datetime import datetime @@ -5,7 +6,10 @@ import seaborn as sns # Loading useful dataframes -code_eval = pd.read_csv('../fairnessBench/fairnessbench_analysis/Result_Final_code_clean2025-08-06T04:22:08.635847.csv') +os.chdir('csv_files') +code_eval = pd.read_csv('Result_Final_code_clean2025-08-13T08:50:51.905807.csv') +os.chdir('..') + # Removing missing rows fairnessBench code_eval= code_eval.dropna(how="any") @@ -50,4 +54,6 @@ 
for ax in m.axes.flatten(): plt.setp(ax.get_xticklabels(), rotation=30) ax.axhline(y=4.0, color='black', linestyle='-.', alpha=0.3) + +os.chdir('graphs/') plt.savefig('adult_di_code_llm_eval.png', dpi=400 , bbox_inches='tight') \ No newline at end of file diff --git a/fairnessbench_analysis/adult_fairness.py b/fairnessbench_analysis/adult_fairness.py index bef5c41..e9d43db 100644 --- a/fairnessbench_analysis/adult_fairness.py +++ b/fairnessbench_analysis/adult_fairness.py @@ -1,3 +1,4 @@ +import os import pandas as pd import numpy as np from datetime import datetime @@ -5,7 +6,10 @@ import seaborn as sns # Loading useful dataframes -perf_alt = pd.read_csv('../fairnessBench/fairnessbench_analysis/Final_step_perfomance2025-08-06T04:21:57.255454.csv') +os.chdir('csv_files') +perf_alt = pd.read_csv('Final_step_perfomance2025-08-13T08:50:41.399910.csv') +os.chdir('..') + # Removing missing rows perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', 'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio', @@ -54,5 +58,6 @@ ax[3,i].axhline(y=1.0, color='black', linestyle='-.', alpha=0.3) ax[4,i].axhline(y=0.0, color='black', linestyle='-.', alpha=0.6) ax[5,i].axhline(y=0.0, color='black', linestyle='-.', alpha=0.6) - + +os.chdir('graphs/') plt.savefig('adult_fairness.png',dpi=300, bbox_inches='tight') diff --git a/fairnessbench_analysis/balancing_fairness.py b/fairnessbench_analysis/balancing_fairness.py index d77f134..82923ce 100644 --- a/fairnessbench_analysis/balancing_fairness.py +++ b/fairnessbench_analysis/balancing_fairness.py @@ -1,3 +1,4 @@ +import os import pandas as pd import numpy as np from datetime import datetime @@ -5,7 +6,10 @@ import seaborn as sns # Loading useful dataframes -perf_alt = pd.read_csv('../fairnessBench/fairnessbench_analysis/Final_step_perfomance2025-08-06T04:21:57.255454.csv') +os.chdir('csv_files') +perf_alt = pd.read_csv('Final_step_perfomance2025-08-13T08:50:41.399910.csv') +os.chdir('..') + # Removing missing rows perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', 'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio', @@ -54,4 +58,6 @@ elif i in [1, 2, 4, 5]: # other fairness metrics ax[i,j].axhline(y=0.0, color='black', linestyle='-.', alpha=0.3) ax[i,j].axvline(x=1.0, color='black', linestyle='-.', alpha=0.2) + +os.chdir('graphs/') plt.savefig('balancing_fairness.png',dpi=400,bbox_inches='tight') \ No newline at end of file diff --git a/fairnessbench_analysis/code_log_llm_eval.py b/fairnessbench_analysis/code_log_llm_eval.py index 64618f6..05fd588 100644 --- a/fairnessbench_analysis/code_log_llm_eval.py +++ b/fairnessbench_analysis/code_log_llm_eval.py @@ -1,3 +1,4 @@ +import os import pandas as pd import numpy as np from datetime import datetime @@ -5,9 +6,11 @@ import seaborn as sns # Loading useful dataframes -code_eval = pd.read_csv('../fairnessBench/fairnessbench_analysis/Result_Final_code_clean2025-08-06T04:22:08.635847.csv') -log_eval = pd.read_csv('../fairnessBench/fairnessbench_analysis/Results_Final_log_clean2025-08-06T04:22:17.377479.csv') -perf_df= pd.read_csv('../fairnessBench/fairnessbench_analysis/Final_step_perfomance2025-08-06T04:21:57.255454.csv') +os.chdir('csv_files') +code_eval = pd.read_csv('Result_Final_code_clean2025-08-13T10:44:12.639136.csv') +log_eval = pd.read_csv('Results_Final_log_clean2025-08-13T10:44:21.146989.csv') +perf_df= pd.read_csv('Final_step_perfomance2025-08-13T10:44:02.216469.csv') +os.chdir('..') # Removing missing rows code_eval= code_eval.dropna(how="any") @@ -42,5 +45,8 @@ 
score_cols = ["1. Model Overview", "2. Stakeholder Identification and Fairness Definition","3. Data Collection and Processing","4. Bias Detection and Mitigation","5. Fairness Metric Selection","6. Model Selection and Training","7. Evaluation and Testing"] log_tall = wider_log.melt(id_vars=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id'], value_vars=score_cols,var_name='score') + + +os.chdir('graphs/') sns.catplot(log_tall,col='model',x='resrch_prob',row= 'task_dataset',y='value',hue='score',kind='bar').savefig('logval') diff --git a/fairnessbench_analysis/correlation_flake8_code.py b/fairnessbench_analysis/correlation_flake8_code.py index c8c15fa..683f0f9 100644 --- a/fairnessbench_analysis/correlation_flake8_code.py +++ b/fairnessbench_analysis/correlation_flake8_code.py @@ -1,3 +1,4 @@ +import os import pandas as pd import numpy as np from datetime import datetime @@ -5,7 +6,10 @@ import seaborn as sns # Loading useful dataframes -code_eval = pd.read_csv('../fairnessBench/fairnessbench_analysis/Result_Final_code_clean2025-08-06T04:22:08.635847.csv') +os.chdir('csv_files') +code_eval = pd.read_csv('Result_Final_code_clean2025-08-13T10:44:12.639136.csv') +os.chdir('..') + # Removing missing rows code_eval= code_eval.dropna(how="any") @@ -48,4 +52,6 @@ def flake8_corr_matrix(group): yticklabels=['Flake8 score'] ) plt.title("Flake8 vs Rubric Correlation (claude-3, adult, balance, erd)") + +os.chdir('graphs/') plt.savefig('flake8_vs_code_correlation.png',bbox_inches='tight') \ No newline at end of file diff --git a/fairnessbench_analysis/Deepseek_cv.csv b/fairnessbench_analysis/csv_files/Deepseek_cv.csv similarity index 100% rename from fairnessbench_analysis/Deepseek_cv.csv rename to fairnessbench_analysis/csv_files/Deepseek_cv.csv diff --git a/fairnessbench_analysis/Gemma_cv.csv b/fairnessbench_analysis/csv_files/Gemma_cv.csv similarity index 100% rename from fairnessbench_analysis/Gemma_cv.csv rename to fairnessbench_analysis/csv_files/Gemma_cv.csv diff --git a/fairnessbench_analysis/Granite_cv.csv b/fairnessbench_analysis/csv_files/Granite_cv.csv similarity index 100% rename from fairnessbench_analysis/Granite_cv.csv rename to fairnessbench_analysis/csv_files/Granite_cv.csv diff --git a/fairnessbench_analysis/cv_scores_evalmodels.py b/fairnessbench_analysis/cv_scores_evalmodels.py index 7d2979b..f759e69 100644 --- a/fairnessbench_analysis/cv_scores_evalmodels.py +++ b/fairnessbench_analysis/cv_scores_evalmodels.py @@ -1,8 +1,10 @@ +import os import pandas as pd -gemma_df = pd.read_csv('../fairnessBench/fairnessbench_analysis/Gemma_cv.csv') -deepseek_df= pd.read_csv('../fairnessBench/fairnessbench_analysis/Deepseek_cv.csv') -granite_df=pd.read_csv('../fairnessBench/fairnessbench_analysis/Granite_cv.csv') +os.chdir('csv_files/') +gemma_df = pd.read_csv('Gemma_cv.csv') +deepseek_df= pd.read_csv('Deepseek_cv.csv') +granite_df=pd.read_csv('Granite_cv.csv') gemma_df['eval'] = 'gemma' deepseek_df['eval'] = 'deepseek' @@ -14,4 +16,5 @@ granite_df = granite_df[cols] all_eval_cv = pd.concat([gemma_df, deepseek_df, granite_df], axis=0, ignore_index=True) + all_eval_cv.to_csv('cv_scores_evalmodel.csv',index=False) \ No newline at end of file diff --git a/fairnessbench_analysis/di_across_datasets.py b/fairnessbench_analysis/di_across_datasets.py index 5226b39..4b1be7f 100644 --- a/fairnessbench_analysis/di_across_datasets.py +++ b/fairnessbench_analysis/di_across_datasets.py @@ -1,3 +1,4 @@ +import os import pandas as pd import numpy as np from datetime import 
datetime @@ -5,7 +6,10 @@ import seaborn as sns # Loading useful dataframes -perf_alt = pd.read_csv('../fairnessBench/fairnessbench_analysis/Final_step_perfomance2025-08-06T04:21:57.255454.csv') +os.chdir('csv_files') +perf_alt = pd.read_csv('Final_step_perfomance2025-08-13T10:44:02.216469.csv') +os.chdir('..') + # Removing missing rows perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', 'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio', @@ -37,5 +41,7 @@ for i, ax in enumerate(g.axes.flat): ax.axhline(y=1.0, color='black', linestyle='-.', alpha=0.2) ax.axvline(x=1.0, color='black', linestyle='-.', alpha=0.2) + # saving the plot +os.chdir('graphs/') plt.savefig('di_vs_acc_scatter.png', dpi=300, bbox_inches='tight') \ No newline at end of file diff --git a/fairnessbench_analysis/explode_results.py b/fairnessbench_analysis/explode_results.py index ee328d5..37a8afa 100644 --- a/fairnessbench_analysis/explode_results.py +++ b/fairnessbench_analysis/explode_results.py @@ -5,7 +5,7 @@ import json # loading the performance results -perf_path = '../results_manually_combined' +perf_path = '/project/pi_brownsarahm_uri_edu/ayman_uri/fairnessBench/results_final_total' result_files = [ os.path.join(perf_path, fname) for fname in os.listdir(perf_path) @@ -43,11 +43,14 @@ cols = ['model', 'task', 'run_ts','run_id'] + [col for col in exploded_score.columns if col not in ['model', 'task', 'run_ts', 'run_id']] exploded_score = exploded_score[cols] exploded_score = exploded_score.drop(exploded_score.columns[4],axis=1) + +os.chdir('csv_files') exploded_score.to_csv('Final_step_perfomance' + datetime.isoformat(datetime.now()) +'.csv',index=False) +os.chdir('..') # loading llm eval results -result_path = '../results_final_total' +result_path = '/project/pi_brownsarahm_uri_edu/ayman_uri/fairnessBench/results_final_total' result_files = [ os.path.join(result_path, fname) for fname in os.listdir(result_path) @@ -98,7 +101,10 @@ on=['model', 'task', 'run_ts'], how='left' ) + +os.chdir('csv_files') exp_code.to_csv('Result_Final_code_clean' + datetime.isoformat(datetime.now()) +'.csv',index=False) +os.chdir('..') # extracting log llm eval results raw_log= result_df[["final_log_score"]].explode('final_log_score',)['final_log_score'].apply(pd.Series).reset_index().drop(columns = [0]) @@ -115,10 +121,13 @@ cols = ['model', 'task', 'run_ts','run_id'] + [col for col in exp_log.columns if col not in ['model', 'task', 'run_ts', 'run_id']] exp_log = exp_log[cols] exp_log = exp_log.drop(exp_log.columns[4],axis=1) -#exp_log.to_csv('Results_Final_log_clean' + datetime.isoformat(datetime.now()) +'.csv',index=False) + +os.chdir('csv_files') +exp_log.to_csv('Results_Final_log_clean' + datetime.isoformat(datetime.now()) +'.csv',index=False) +os.chdir('..') # loading baseline results -result_path = '../sanity_results' +result_path = '/project/pi_brownsarahm_uri_edu/ayman_uri/fairnessBench/sanity_results' result_files = [ os.path.join(result_path, resjson) for resjson in os.listdir(result_path) @@ -151,4 +160,7 @@ exploded_score = exploded_score.rename( columns={col: f'baseline_{col}' for col in cols_to_prefix} ) + +os.chdir('csv_files') exploded_score.to_csv('Baseline_cleaned_perfomance' + datetime.isoformat(datetime.now()) +'.csv',index=False) +os.chdir('..') diff --git a/fairnessbench_analysis/performance_flake8_code.py b/fairnessbench_analysis/performance_flake8_code.py index c142544..31b40ff 100644 --- a/fairnessbench_analysis/performance_flake8_code.py +++ 
b/fairnessbench_analysis/performance_flake8_code.py @@ -1,3 +1,4 @@ +import os import pandas as pd import numpy as np from datetime import datetime @@ -5,7 +6,11 @@ import seaborn as sns # Loading useful dataframes -code_eval = pd.read_csv('../fairnessBench/fairnessbench_analysis/Result_Final_code_clean2025-08-06T04:22:08.635847.csv') +os.chdir('csv_files') +code_eval = pd.read_csv('Result_Final_code_clean2025-08-13T10:44:12.639136.csv') +os.chdir('..') + + # Removing missing rows code_eval= code_eval.dropna(how="any") @@ -33,6 +38,8 @@ for i, ax in enumerate(g.axes.flat): ax.axhline(y=85.0, color='black', linestyle='-.', alpha=0.2) ax.axvline(x=75.0, color='black', linestyle='-.', alpha=0.2) + +os.chdir('graphs/') plt.savefig('performance_flake8_code.png',dpi=300) diff --git a/fairnessbench_analysis/target10_sucess_rate.py b/fairnessbench_analysis/target10_sucess_rate.py index c0117a0..449d0ea 100644 --- a/fairnessbench_analysis/target10_sucess_rate.py +++ b/fairnessbench_analysis/target10_sucess_rate.py @@ -1,9 +1,12 @@ +import os import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns -perf_df= pd.read_csv('../fairnessBench/fairnessbench_analysis/Final_step_perfomance2025-08-06T04:21:57.255454.csv') +os.chdir('csv_files') +perf_df= pd.read_csv('Final_step_perfomance2025-08-13T10:44:02.216469.csv') +os.chdir('..') # Removing missing rows perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', @@ -13,7 +16,9 @@ perf_df = perf_df.replace([np.inf, -np.inf], np.nan).fillna(0) # loading baseline -baseline_df= pd.read_csv('../fairnessBench/fairnessbench_analysis/Baseline_cleaned_perfomance2025-08-06T06:09:53.582383.csv') +os.chdir('csv_files') +baseline_df= pd.read_csv('Baseline_cleaned_perfomance2025-08-13T10:44:21.444178.csv') +os.chdir('..') base= ['baseline_acc','baseline_precision', 'baseline_recall', 'baseline_di', 'baseline_statistical_parity_diff', 'baseline_equal_opp_diff', 'baseline_error_rate_diff', 'baseline_error_rate_ratio', @@ -97,5 +102,6 @@ def improvement(s): df_improvement_stats_tall = df_improvement_stats.melt(id_vars=['model','task_dataset'], value_vars=['total','success','improvement'],var_name='count_type',value_name='count') sns.set_context(context='poster',font_scale = .5) +os.chdir('graphs/') sns.catplot(df_improvement_stats_tall, col = 'model',x='task_dataset', y='count',hue='count_type',kind='bar').savefig('target10_success.png') diff --git a/fairnessbench_analysis/target_selection.py b/fairnessbench_analysis/target_selection.py index 16234b5..2430dcc 100644 --- a/fairnessbench_analysis/target_selection.py +++ b/fairnessbench_analysis/target_selection.py @@ -1,9 +1,14 @@ +import os import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns -res = pd.read_csv('../fairnessBench/fairnessbench_analysis/Final_step_perfomance2025-08-06T04:21:57.255454.csv') + +os.chdir('csv_files') +res = pd.read_csv('Final_step_perfomance2025-08-13T10:44:02.216469.csv') +os.chdir('..') + task_data_metric = res['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) wider_code = pd.concat([res, task_data_metric],axis=1) wider_cols=['model','task','task_dataset','task_metric','task-dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','score_count'] @@ -51,4 +56,6 @@ adrec_long['task_metric'] = adrec_long['task_metrics'].map(metric_rename) g = 
sns.catplot(data=adrec_long,kind='bar',x='task_metric',y='task_metric_value',hue='model',col='dem',height=4,aspect=1.5) + +os.chdir('graphs/') plt.savefig("adrec_allmetric.png",dpi=400,bbox_inches='tight') \ No newline at end of file From e352ce6767b902fe4b444dcbb857828648c38696 Mon Sep 17 00:00:00 2001 From: Ritta Neg mfa Date: Thu, 18 Sep 2025 03:35:05 +0000 Subject: [PATCH 3/5] New changes to analysis files --- .gitignore | 3 +- .../adult_di_code_llmeval.py | 12 ++++---- fairnessbench_analysis/adult_fairness.py | 11 +++---- fairnessbench_analysis/balancing_fairness.py | 10 +++---- fairnessbench_analysis/code_log_llm_eval.py | 18 ++++++----- .../correlation_flake8_code.py | 10 +++---- .../cv_scores_evalmodels.py | 10 +++---- fairnessbench_analysis/di_across_datasets.py | 11 +++---- fairnessbench_analysis/explode_results.py | 30 +++++++------------ .../performance_flake8_code.py | 10 +++---- .../target10_sucess_rate.py | 15 +++++----- fairnessbench_analysis/target_selection.py | 12 ++++---- 12 files changed, 74 insertions(+), 78 deletions(-) diff --git a/.gitignore b/.gitignore index 06d144c..2d47b79 100644 --- a/.gitignore +++ b/.gitignore @@ -16,7 +16,8 @@ eval*.sh fairnessbench_analysis/*/*.png fairnessbench_analysis/*/*.csv - +# path +path.py # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/fairnessbench_analysis/adult_di_code_llmeval.py b/fairnessbench_analysis/adult_di_code_llmeval.py index f28d0ba..69714c3 100644 --- a/fairnessbench_analysis/adult_di_code_llmeval.py +++ b/fairnessbench_analysis/adult_di_code_llmeval.py @@ -4,11 +4,11 @@ from datetime import datetime import matplotlib.pyplot as plt import seaborn as sns - +from path import CSV_FILES,GRAPHS # Loading useful dataframes -os.chdir('csv_files') -code_eval = pd.read_csv('Result_Final_code_clean2025-08-13T08:50:51.905807.csv') -os.chdir('..') +file = CSV_FILES/'Result_Final_code_clean2025-09-18T00:48:40.584077.csv' +code_eval = pd.read_csv(file) + # Removing missing rows fairnessBench code_eval= code_eval.dropna(how="any") @@ -55,5 +55,5 @@ plt.setp(ax.get_xticklabels(), rotation=30) ax.axhline(y=4.0, color='black', linestyle='-.', alpha=0.3) -os.chdir('graphs/') -plt.savefig('adult_di_code_llm_eval.png', dpi=400 , bbox_inches='tight') \ No newline at end of file +output = os.path.join(GRAPHS, 'adult_di_code_llm_eval.png') +plt.savefig(output, dpi=400 , bbox_inches='tight') \ No newline at end of file diff --git a/fairnessbench_analysis/adult_fairness.py b/fairnessbench_analysis/adult_fairness.py index e9d43db..c104bd1 100644 --- a/fairnessbench_analysis/adult_fairness.py +++ b/fairnessbench_analysis/adult_fairness.py @@ -4,11 +4,12 @@ from datetime import datetime import matplotlib.pyplot as plt import seaborn as sns +from path import CSV_FILES,GRAPHS # Loading useful dataframes -os.chdir('csv_files') -perf_alt = pd.read_csv('Final_step_perfomance2025-08-13T08:50:41.399910.csv') -os.chdir('..') +file = CSV_FILES/'Final_step_perfomance2025-09-18T00:48:26.025263.csv' +perf_alt = pd.read_csv(file) + # Removing missing rows perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', @@ -59,5 +60,5 @@ ax[4,i].axhline(y=0.0, color='black', linestyle='-.', alpha=0.6) ax[5,i].axhline(y=0.0, color='black', linestyle='-.', alpha=0.6) -os.chdir('graphs/') -plt.savefig('adult_fairness.png',dpi=300, bbox_inches='tight') +output = os.path.join(GRAPHS, 'adult_fairness.png') +plt.savefig(output,dpi=300, bbox_inches='tight') diff --git a/fairnessbench_analysis/balancing_fairness.py 
b/fairnessbench_analysis/balancing_fairness.py index 82923ce..b0bcd3a 100644 --- a/fairnessbench_analysis/balancing_fairness.py +++ b/fairnessbench_analysis/balancing_fairness.py @@ -4,11 +4,11 @@ from datetime import datetime import matplotlib.pyplot as plt import seaborn as sns +from path import CSV_FILES,GRAPHS # Loading useful dataframes -os.chdir('csv_files') -perf_alt = pd.read_csv('Final_step_perfomance2025-08-13T08:50:41.399910.csv') -os.chdir('..') +file = CSV_FILES/'Final_step_perfomance2025-09-18T00:48:26.025263.csv' +perf_alt = pd.read_csv(file) # Removing missing rows perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', @@ -59,5 +59,5 @@ ax[i,j].axhline(y=0.0, color='black', linestyle='-.', alpha=0.3) ax[i,j].axvline(x=1.0, color='black', linestyle='-.', alpha=0.2) -os.chdir('graphs/') -plt.savefig('balancing_fairness.png',dpi=400,bbox_inches='tight') \ No newline at end of file +output = os.path.join(GRAPHS,'balancing_fairness.png') +plt.savefig(output,dpi=400,bbox_inches='tight') \ No newline at end of file diff --git a/fairnessbench_analysis/code_log_llm_eval.py b/fairnessbench_analysis/code_log_llm_eval.py index 05fd588..906635f 100644 --- a/fairnessbench_analysis/code_log_llm_eval.py +++ b/fairnessbench_analysis/code_log_llm_eval.py @@ -4,13 +4,14 @@ from datetime import datetime import matplotlib.pyplot as plt import seaborn as sns +from path import CSV_FILES,GRAPHS + # Loading useful dataframes -os.chdir('csv_files') -code_eval = pd.read_csv('Result_Final_code_clean2025-08-13T10:44:12.639136.csv') -log_eval = pd.read_csv('Results_Final_log_clean2025-08-13T10:44:21.146989.csv') -perf_df= pd.read_csv('Final_step_perfomance2025-08-13T10:44:02.216469.csv') -os.chdir('..') +code_eval = pd.read_csv(CSV_FILES/'Result_Final_code_clean2025-09-18T00:48:40.584077.csv') +log_eval = pd.read_csv(CSV_FILES/'Results_Final_log_clean2025-09-18T00:48:52.486398.csv') +perf_df= pd.read_csv(CSV_FILES/'Final_step_perfomance2025-09-18T00:48:26.025263.csv') + # Removing missing rows code_eval= code_eval.dropna(how="any") @@ -31,7 +32,8 @@ score_cols = ["1. Data Collection and Processing","2. Bias Detection and Mitigation","3. Fairness Metric Selection","4. Model Selection and Training","5. 
Evaluation and Testing"] code_tall = wider_code.melt(id_vars=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id'], value_vars=score_cols,var_name='score') -sns.catplot(code_tall,col='model',row='resrch_prob',x= 'task_dataset',y='value',hue='score',kind='bar').savefig('codeval') +output= os.path.join(GRAPHS,'codeval') +sns.catplot(code_tall,col='model',row='resrch_prob',x= 'task_dataset',y='value',hue='score',kind='bar').savefig(output) # log eval @@ -47,6 +49,6 @@ value_vars=score_cols,var_name='score') -os.chdir('graphs/') -sns.catplot(log_tall,col='model',x='resrch_prob',row= 'task_dataset',y='value',hue='score',kind='bar').savefig('logval') +output=os.path.join(GRAPHS,'logval') +sns.catplot(log_tall,col='model',x='resrch_prob',row= 'task_dataset',y='value',hue='score',kind='bar').savefig(output) diff --git a/fairnessbench_analysis/correlation_flake8_code.py b/fairnessbench_analysis/correlation_flake8_code.py index 683f0f9..2b4c237 100644 --- a/fairnessbench_analysis/correlation_flake8_code.py +++ b/fairnessbench_analysis/correlation_flake8_code.py @@ -4,11 +4,11 @@ from datetime import datetime import matplotlib.pyplot as plt import seaborn as sns +from path import CSV_FILES,GRAPHS # Loading useful dataframes -os.chdir('csv_files') -code_eval = pd.read_csv('Result_Final_code_clean2025-08-13T10:44:12.639136.csv') -os.chdir('..') +file= CSV_FILES/'Result_Final_code_clean2025-09-18T00:48:40.584077.csv' +code_eval = pd.read_csv(file) # Removing missing rows code_eval= code_eval.dropna(how="any") @@ -53,5 +53,5 @@ def flake8_corr_matrix(group): ) plt.title("Flake8 vs Rubric Correlation (claude-3, adult, balance, erd)") -os.chdir('graphs/') -plt.savefig('flake8_vs_code_correlation.png',bbox_inches='tight') \ No newline at end of file +output= os.path.join(GRAPHS,'flake8_vs_code_correlation.png') +plt.savefig(output,bbox_inches='tight') \ No newline at end of file diff --git a/fairnessbench_analysis/cv_scores_evalmodels.py b/fairnessbench_analysis/cv_scores_evalmodels.py index f759e69..e7d9cd7 100644 --- a/fairnessbench_analysis/cv_scores_evalmodels.py +++ b/fairnessbench_analysis/cv_scores_evalmodels.py @@ -1,10 +1,10 @@ import os import pandas as pd - -os.chdir('csv_files/') -gemma_df = pd.read_csv('Gemma_cv.csv') -deepseek_df= pd.read_csv('Deepseek_cv.csv') -granite_df=pd.read_csv('Granite_cv.csv') +from path import FILES + +gemma_df = pd.read_csv(FILES/'Gemma_cv.csv') +deepseek_df= pd.read_csv(FILES/'Deepseek_cv.csv') +granite_df=pd.read_csv(FILES/'Granite_cv.csv') gemma_df['eval'] = 'gemma' deepseek_df['eval'] = 'deepseek' diff --git a/fairnessbench_analysis/di_across_datasets.py b/fairnessbench_analysis/di_across_datasets.py index 4b1be7f..adcaeab 100644 --- a/fairnessbench_analysis/di_across_datasets.py +++ b/fairnessbench_analysis/di_across_datasets.py @@ -4,11 +4,11 @@ from datetime import datetime import matplotlib.pyplot as plt import seaborn as sns +from path import CSV_FILES,GRAPHS # Loading useful dataframes -os.chdir('csv_files') -perf_alt = pd.read_csv('Final_step_perfomance2025-08-13T10:44:02.216469.csv') -os.chdir('..') +file= CSV_FILES/'Final_step_perfomance2025-09-18T00:48:26.025263.csv' +perf_alt = pd.read_csv(file) # Removing missing rows perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', @@ -43,5 +43,6 @@ ax.axvline(x=1.0, color='black', linestyle='-.', alpha=0.2) # saving the plot -os.chdir('graphs/') -plt.savefig('di_vs_acc_scatter.png', dpi=300, bbox_inches='tight') \ No newline at end of file +output= 
os.path.join(GRAPHS,'di_vs_acc_scatter.png') + +plt.savefig(output, dpi=300, bbox_inches='tight') diff --git a/fairnessbench_analysis/explode_results.py b/fairnessbench_analysis/explode_results.py index 37a8afa..a8a9285 100644 --- a/fairnessbench_analysis/explode_results.py +++ b/fairnessbench_analysis/explode_results.py @@ -3,9 +3,10 @@ import numpy as np from datetime import datetime import json +from path import PROJECT_ROOT,CSV_FILES # loading the performance results -perf_path = '/project/pi_brownsarahm_uri_edu/ayman_uri/fairnessBench/results_final_total' +perf_path = PROJECT_ROOT result_files = [ os.path.join(perf_path, fname) for fname in os.listdir(perf_path) @@ -44,13 +45,10 @@ exploded_score = exploded_score[cols] exploded_score = exploded_score.drop(exploded_score.columns[4],axis=1) -os.chdir('csv_files') -exploded_score.to_csv('Final_step_perfomance' + datetime.isoformat(datetime.now()) +'.csv',index=False) -os.chdir('..') - - +output_file=os.path.join(CSV_FILES, 'Final_step_perfomance' + datetime.isoformat(datetime.now()) +'.csv') +exploded_score.to_csv(output_file,index=False) # loading llm eval results -result_path = '/project/pi_brownsarahm_uri_edu/ayman_uri/fairnessBench/results_final_total' +result_path = PROJECT_ROOT result_files = [ os.path.join(result_path, fname) for fname in os.listdir(result_path) @@ -101,11 +99,8 @@ on=['model', 'task', 'run_ts'], how='left' ) - -os.chdir('csv_files') -exp_code.to_csv('Result_Final_code_clean' + datetime.isoformat(datetime.now()) +'.csv',index=False) -os.chdir('..') - +output_file=os.path.join(CSV_FILES, 'Result_Final_code_clean' + datetime.isoformat(datetime.now()) +'.csv') +exp_code.to_csv(output_file,index=False) # extracting log llm eval results raw_log= result_df[["final_log_score"]].explode('final_log_score',)['final_log_score'].apply(pd.Series).reset_index().drop(columns = [0]) exp_log= raw_log["raw_scores"].apply(pd.Series).drop(columns = [0]) @@ -122,10 +117,8 @@ exp_log = exp_log[cols] exp_log = exp_log.drop(exp_log.columns[4],axis=1) -os.chdir('csv_files') -exp_log.to_csv('Results_Final_log_clean' + datetime.isoformat(datetime.now()) +'.csv',index=False) -os.chdir('..') - +output_file=os.path.join(CSV_FILES, 'Results_Final_log_clean' + datetime.isoformat(datetime.now()) +'.csv') +exp_log.to_csv(output_file,index=False) # loading baseline results result_path = '/project/pi_brownsarahm_uri_edu/ayman_uri/fairnessBench/sanity_results' result_files = [ @@ -161,6 +154,5 @@ columns={col: f'baseline_{col}' for col in cols_to_prefix} ) -os.chdir('csv_files') -exploded_score.to_csv('Baseline_cleaned_perfomance' + datetime.isoformat(datetime.now()) +'.csv',index=False) -os.chdir('..') +output_file=os.path.join(CSV_FILES, 'Baseline_cleaned_perfomance' + datetime.isoformat(datetime.now()) +'.csv') +exploded_score.to_csv(output_file,index=False) diff --git a/fairnessbench_analysis/performance_flake8_code.py b/fairnessbench_analysis/performance_flake8_code.py index 31b40ff..ec823c3 100644 --- a/fairnessbench_analysis/performance_flake8_code.py +++ b/fairnessbench_analysis/performance_flake8_code.py @@ -4,11 +4,11 @@ from datetime import datetime import matplotlib.pyplot as plt import seaborn as sns +from path import CSV_FILES,GRAPHS # Loading useful dataframes -os.chdir('csv_files') -code_eval = pd.read_csv('Result_Final_code_clean2025-08-13T10:44:12.639136.csv') -os.chdir('..') +file = CSV_FILES/'Result_Final_code_clean2025-09-18T00:48:40.584077.csv' +code_eval = pd.read_csv(file) # Removing missing rows @@ -39,7 +39,7 @@ 
ax.axhline(y=85.0, color='black', linestyle='-.', alpha=0.2) ax.axvline(x=75.0, color='black', linestyle='-.', alpha=0.2) -os.chdir('graphs/') -plt.savefig('performance_flake8_code.png',dpi=300) +output = os.path.join(GRAPHS,'performance_flake8_code.png') +plt.savefig(output,dpi=300) diff --git a/fairnessbench_analysis/target10_sucess_rate.py b/fairnessbench_analysis/target10_sucess_rate.py index 449d0ea..6bd9347 100644 --- a/fairnessbench_analysis/target10_sucess_rate.py +++ b/fairnessbench_analysis/target10_sucess_rate.py @@ -3,10 +3,10 @@ import numpy as np import matplotlib.pyplot as plt import seaborn as sns +from path import CSV_FILES,GRAPHS -os.chdir('csv_files') -perf_df= pd.read_csv('Final_step_perfomance2025-08-13T10:44:02.216469.csv') -os.chdir('..') +# Loading useful dataframes +perf_df = pd.read_csv(CSV_FILES/'Final_step_perfomance2025-09-18T00:48:26.025263.csv') # Removing missing rows perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', @@ -16,9 +16,8 @@ perf_df = perf_df.replace([np.inf, -np.inf], np.nan).fillna(0) # loading baseline -os.chdir('csv_files') -baseline_df= pd.read_csv('Baseline_cleaned_perfomance2025-08-13T10:44:21.444178.csv') -os.chdir('..') + +baseline_df = pd.read_csv(CSV_FILES/'Baseline_cleaned_perfomance2025-09-18T00:48:53.537033.csv') base= ['baseline_acc','baseline_precision', 'baseline_recall', 'baseline_di', 'baseline_statistical_parity_diff', 'baseline_equal_opp_diff', 'baseline_error_rate_diff', 'baseline_error_rate_ratio', @@ -102,6 +101,6 @@ def improvement(s): df_improvement_stats_tall = df_improvement_stats.melt(id_vars=['model','task_dataset'], value_vars=['total','success','improvement'],var_name='count_type',value_name='count') sns.set_context(context='poster',font_scale = .5) -os.chdir('graphs/') -sns.catplot(df_improvement_stats_tall, col = 'model',x='task_dataset', y='count',hue='count_type',kind='bar').savefig('target10_success.png') +output= os.path.join(GRAPHS,'target10_success.png' ) +sns.catplot(df_improvement_stats_tall, col = 'model',x='task_dataset', y='count',hue='count_type',kind='bar').savefig(output) diff --git a/fairnessbench_analysis/target_selection.py b/fairnessbench_analysis/target_selection.py index 2430dcc..45e7d15 100644 --- a/fairnessbench_analysis/target_selection.py +++ b/fairnessbench_analysis/target_selection.py @@ -3,11 +3,11 @@ import numpy as np import matplotlib.pyplot as plt import seaborn as sns +from path import CSV_FILES,GRAPHS - -os.chdir('csv_files') -res = pd.read_csv('Final_step_perfomance2025-08-13T10:44:02.216469.csv') -os.chdir('..') +# Loading useful dataframes +file = CSV_FILES/'Final_step_perfomance2025-09-18T00:48:26.025263.csv' +res = pd.read_csv(file) task_data_metric = res['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) wider_code = pd.concat([res, task_data_metric],axis=1) @@ -57,5 +57,5 @@ adrec_long['task_metric'] = adrec_long['task_metrics'].map(metric_rename) g = sns.catplot(data=adrec_long,kind='bar',x='task_metric',y='task_metric_value',hue='model',col='dem',height=4,aspect=1.5) -os.chdir('graphs/') -plt.savefig("adrec_allmetric.png",dpi=400,bbox_inches='tight') \ No newline at end of file +output= os.path.join(GRAPHS,"adrec_allmetric.png") +plt.savefig(output,dpi=400,bbox_inches='tight') \ No newline at end of file From 3d8e3f4a00dafdc30fc776991e18409ac846074b Mon Sep 17 00:00:00 2001 From: Ritta Neg mfa Date: Thu, 18 Sep 2025 03:51:04 +0000 Subject: [PATCH 4/5] readme file and removed results from the repo --- 
.../csv_files/Deepseek_cv.csv | 3 -- fairnessbench_analysis/csv_files/Gemma_cv.csv | 8 ---- .../csv_files/Granite_cv.csv | 3 -- fairnessbench_analysis/readme.md | 48 +++++++++++++++++++ 4 files changed, 48 insertions(+), 14 deletions(-) delete mode 100644 fairnessbench_analysis/csv_files/Deepseek_cv.csv delete mode 100644 fairnessbench_analysis/csv_files/Gemma_cv.csv delete mode 100644 fairnessbench_analysis/csv_files/Granite_cv.csv create mode 100644 fairnessbench_analysis/readme.md diff --git a/fairnessbench_analysis/csv_files/Deepseek_cv.csv b/fairnessbench_analysis/csv_files/Deepseek_cv.csv deleted file mode 100644 index 22fefe5..0000000 --- a/fairnessbench_analysis/csv_files/Deepseek_cv.csv +++ /dev/null @@ -1,3 +0,0 @@ -model,task,1. Data Collection and Processing,2. Bias Detection and Mitigation,3. Fairness Metric Selection,4. Model Selection and Training,5. Evaluation and Testing -claude-3-7-sonnet-20250219,adult_err_balance-race,1.1035347137172349,0.2793492449300636,0.1818753236896995,0.15465752264217267,0.23111601141098256 -qwen,adult_err_balance-race,0.0,0.29632197530102916,0.3853887243714261,0.4880266796833574,0.34656317045517515 diff --git a/fairnessbench_analysis/csv_files/Gemma_cv.csv b/fairnessbench_analysis/csv_files/Gemma_cv.csv deleted file mode 100644 index f9f85a4..0000000 --- a/fairnessbench_analysis/csv_files/Gemma_cv.csv +++ /dev/null @@ -1,8 +0,0 @@ -model,task,1. Data Collection and Processing,2. Bias Detection and Mitigation,3. Fairness Metric Selection,4. Model Selection and Training,5. Evaluation and Testing -claude-3-7-sonnet-20250219,adult_err_balance-race,0.15946509441945983,0.11649810905340587,0.0,0.15946509441945983,0.0 -claude-3-7-sonnet-20250219,adult_err_balance-sex,0.17494570236436235,0.0,0.0,0.1957400731715678,0.0 -gpt-4o,adult_err_balance-race,0.0,0.0,0.0,0.0,0.0 -gpt-4o,adult_err_balance-sex,0.0,0.0,0.0,0.0,0.0 -llama,adult_err_balance-race,0.20573779994945587,0.19716158838352976,0.0,0.12297509238026914,0.0 -llama,adult_err_balance-sex,0.3043212760213842,0.12426253043692712,0.0,0.16886551261045996,0.0 -qwen,adult_err_balance-sex,0.2138089935299395,0.19716158838352973,0.0,0.19716158838352973,0.0 diff --git a/fairnessbench_analysis/csv_files/Granite_cv.csv b/fairnessbench_analysis/csv_files/Granite_cv.csv deleted file mode 100644 index e0023d9..0000000 --- a/fairnessbench_analysis/csv_files/Granite_cv.csv +++ /dev/null @@ -1,3 +0,0 @@ -model,task,1. Data Collection and Processing,2. Bias Detection and Mitigation,3. Fairness Metric Selection,4. Model Selection and Training,5. Evaluation and Testing -claude-3-7-sonnet-20250219,adult_err_balance-race,0.04932502891543654,0.09584211726899525,0.14691056734678462,0.0,0.04844009143018392 -qwen,adult_err_balance-race,0.18672359914948844,1.282654434033444,0.2803402154503214,0.062112999374994156,0.28247912462432095 diff --git a/fairnessbench_analysis/readme.md b/fairnessbench_analysis/readme.md new file mode 100644 index 0000000..ebbca27 --- /dev/null +++ b/fairnessbench_analysis/readme.md @@ -0,0 +1,48 @@ +# fairnessbench analysis + +This folder contains all the code and data for analyzing the fairnessbench results. +The main analysis script is explode_results.py, which loads the raw results data and creates clean CSV files ready for analysis. + +# A. Setup: + +**Local path configuration** +1. Create `paths.py` at the repo root. +2. Create the csv_files and graphs directories, then set CSV_FILES and GRAPHS in paths.py to the absolute paths on your machine. 
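A minimal sketch of such a configuration module is shown below. The paths are placeholders to replace with your own, the values are assumed to be `pathlib.Path` objects, and the module is assumed to be importable as `path` (the analysis scripts use `from path import CSV_FILES, GRAPHS`). The variables it defines are described in the next section.

```python
# Local path configuration: a sketch with placeholder paths, not the project's
# actual locations. Adjust every value to your machine.
from pathlib import Path

# Directory containing all raw benchmark results (read by explode_results.py).
PROJECT_ROOT = Path("/abs/path/to/fairnessBench/results_final_total")

# Directory where explode_results.py writes the cleaned, timestamped CSVs.
CSV_FILES = Path("/abs/path/to/fairnessbench_analysis/csv_files")

# Directory where the analysis scripts save their figures.
GRAPHS = Path("/abs/path/to/fairnessbench_analysis/graphs")

# Directory holding the CSVs produced by the different evaluation models
# (read by cv_scores_evalmodels.py).
FILES = Path("/abs/path/to/fairnessbench_analysis/csv_files")
```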
+ +### Required variables in `paths.py` +- **PROJECT_ROOT** — Directory that contains all *raw results*. +- **CSV_FILES** — Directory that contains the *clean CSV files* produced by `explode_results.py`. +- **GRAPHS** — Directory where analysis scripts will save generated *figures/plots*. +- **FILES** — Directory that stores *CSV files from different evaluation models* (used by `cv_scores_evalmodels.py`). + +# B. Run Analysis: +**Run main file** +```python +python explode_results.py +``` +This will create the following files in the csv_files/ directory: +- Result_Final_code_clean*.csv: File contains raw scores and final scores from the llm evaluation on the training scripts(code). +- Result_Final_log_clean*: File contains raw scores and final scores from the llm evaluation on the reasoning process of the agent(log). +- Final_step_performance*.csv: File contains performance metric(e.g. accuracy,disparate impact etc.) scores of the models on each task. +These files are then used for futher analysis +**Analysis** +In FairnessBench we run several analyses on our results. Each `.py` file performs a different analysis and generates plots that are stored in the `graphs/` directory. +To run an analysis, change the input CSV filename in the script to the file required for that analysis. +**Example:** To analyze different types of fairness for the Adult dataset, run `adult_fairness.py`. Before running it, update the script’s input CSV to the new file generated in the `csv_files/` directory. + +```python +python ....py +``` +***Key files:*** +`- adult_di_code_llmeval.py: Analyzes disparate impact (DI) for the Adult dataset using the LLM code evaluation.`\ +`- adult_fairness.py: Analyze the fairness metrics used in the benchmark for the Adult dataset.`\ +`- balancing_fairness.py: Analyzes the fairness metrics used in the benchmark for the Balance research problem.`\ +`- code_log_llm_eval.py: Generates plots showing how rubric-section scores vary across tasks and datasets for both code and log evaluations.`\ +`- correlation_flake8_code.py: Analyzes the correlation between LLM code-evaluation scores and Flake8 (linter-based) scores.`\ +`- cv_scores_evalmodels.py: Analyzes the consistency of different evaluation models based on cross-validation (CV) scores.`\ +`- di_across_datasets.py: Analyzes disparate impact (DI) and accuracy across datasets and research problems.`\ +`- performance_flake8_code.py: Generates a scatter plot of Flake8 performance vs. LLM code-evaluation scores across models, datasets, and research problems.`\ +`- target_selection.py: Analyze the models performance and fairness metrics for the targey selection tasks.`\ +`- target10_sucess_rate.py: Generates a plot showing how agent performance differs from the baseline on the Target10 research problem, by dataset and model.`\ + + From 16cdbe1902f27b0b6a917b9c63403d79f0d62fc0 Mon Sep 17 00:00:00 2001 From: Ritta Neg mfa Date: Thu, 18 Sep 2025 03:57:40 +0000 Subject: [PATCH 5/5] font changes to readme --- fairnessbench_analysis/readme.md | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/fairnessbench_analysis/readme.md b/fairnessbench_analysis/readme.md index ebbca27..2ca8829 100644 --- a/fairnessbench_analysis/readme.md +++ b/fairnessbench_analysis/readme.md @@ -8,7 +8,7 @@ The main analysis script is explode_results.py, which loads the raw results data **Local path configuration** 1. Create `paths.py` at the repo root. 2. 
Create the csv_files and graphs directories, then set CSV_FILES and GRAPHS in paths.py to the absolute paths on your machine. - +3. `paths.py` is in `.gitignore`. ### Required variables in `paths.py` - **PROJECT_ROOT** — Directory that contains all *raw results*. - **CSV_FILES** — Directory that contains the *clean CSV files* produced by `explode_results.py`. - **GRAPHS** — Directory where analysis scripts will save generated *figures/plots*. - **FILES** — Directory that stores *CSV files from different evaluation models* (used by `cv_scores_evalmodels.py`). @@ -24,7 +24,9 @@ This will create the following files in the csv_files/ directory: - Result_Final_code_clean*.csv: File contains raw scores and final scores from the llm evaluation on the training scripts(code). - Result_Final_log_clean*: File contains raw scores and final scores from the llm evaluation on the reasoning process of the agent(log). - Final_step_performance*.csv: File contains performance metric(e.g. accuracy,disparate impact etc.) scores of the models on each task. -These files are then used for futher analysis + +These files are then used for further analysis. + **Analysis** In FairnessBench we run several analyses on our results. Each `.py` file performs a different analysis and generates plots that are stored in the `graphs/` directory. To run an analysis, change the input CSV filename in the script to the file required for that analysis. @@ -33,16 +35,16 @@ To run an analysis, change the input CSV filename in the script to the file requ ```python python ....py ``` -***Key files:*** -`- adult_di_code_llmeval.py: Analyzes disparate impact (DI) for the Adult dataset using the LLM code evaluation.`\ -`- adult_fairness.py: Analyze the fairness metrics used in the benchmark for the Adult dataset.`\ -`- balancing_fairness.py: Analyzes the fairness metrics used in the benchmark for the Balance research problem.`\ -`- code_log_llm_eval.py: Generates plots showing how rubric-section scores vary across tasks and datasets for both code and log evaluations.`\ -`- correlation_flake8_code.py: Analyzes the correlation between LLM code-evaluation scores and Flake8 (linter-based) scores.`\ -`- cv_scores_evalmodels.py: Analyzes the consistency of different evaluation models based on cross-validation (CV) scores.`\ -`- di_across_datasets.py: Analyzes disparate impact (DI) and accuracy across datasets and research problems.`\ -`- performance_flake8_code.py: Generates a scatter plot of Flake8 performance vs. LLM code-evaluation scores across models, datasets, and research problems.`\ -`- target_selection.py: Analyze the models performance and fairness metrics for the targey selection tasks.`\ -`- target10_sucess_rate.py: Generates a plot showing how agent performance differs from the baseline on the Target10 research problem, by dataset and model.`\ +**Key files:** +- adult_di_code_llmeval.py: Analyzes disparate impact (DI) for the Adult dataset using the LLM code evaluation. +- adult_fairness.py: Analyzes the fairness metrics used in the benchmark for the Adult dataset. +- balancing_fairness.py: Analyzes the fairness metrics used in the benchmark for the Balance research problem. +- code_log_llm_eval.py: Generates plots showing how rubric-section scores vary across tasks and datasets for both code and log evaluations. +- correlation_flake8_code.py: Analyzes the correlation between LLM code-evaluation scores and Flake8 (linter-based) scores. +- cv_scores_evalmodels.py: Analyzes the consistency of different evaluation models based on cross-validation (CV) scores. +- di_across_datasets.py: Analyzes disparate impact (DI) and accuracy across datasets and research problems. 
+- performance_flake8_code.py: Generates a scatter plot of Flake8 performance vs. LLM code-evaluation scores across models, datasets, and research problems. +- target_selection.py: Analyzes the models' performance and fairness metrics for the target selection tasks. +- target10_sucess_rate.py: Generates a plot showing how agent performance differs from the baseline on the Target10 research problem, by dataset and model.
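Each analysis script above reads one specific timestamped CSV by name, so every new run of `explode_results.py` means editing that filename. A small optional sketch (not part of the current scripts) of picking the newest cleaned performance file automatically, assuming `CSV_FILES` is a `pathlib.Path` as in the setup sketch and using the `Final_step_perfomance` prefix that `explode_results.py` actually writes:

```python
# Sketch: load the most recent cleaned performance CSV instead of hard-coding its name.
# ISO timestamps sort lexicographically, so max() over the matching paths picks the latest run.
import pandas as pd
from path import CSV_FILES  # local path-configuration module described in the Setup section

latest = max(CSV_FILES.glob("Final_step_perfomance*.csv"))  # prefix spelled as explode_results.py writes it
perf_df = pd.read_csv(latest)
print(f"Loaded {latest.name}: {len(perf_df)} rows")
```

The same pattern applies to the `Result_Final_code_clean*` and `Results_Final_log_clean*` files.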