7 changes: 4 additions & 3 deletions .gitignore
@@ -13,10 +13,11 @@ Run_scripts/*
run_experiments_*.sh
test*.txt
eval*.sh
fairnessbench_analysis/*/*.png
fairnessbench_analysis/*/*.csv




# path
path.py
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
59 changes: 59 additions & 0 deletions fairnessbench_analysis/adult_di_code_llmeval.py
@@ -0,0 +1,59 @@
import os
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from path import CSV_FILES,GRAPHS
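# NOTE: path.py is gitignored (see the .gitignore change above); it is assumed to expose
# CSV_FILES and GRAPHS as pathlib.Path objects pointing at the local CSV inputs and the
# graph output directory. A minimal sketch under that assumption (hypothetical layout):
#   from pathlib import Path
#   BASE = Path(__file__).resolve().parent
#   CSV_FILES = BASE / 'csv_files'
#   GRAPHS = BASE / 'graphs'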
# Loading useful dataframes
file = CSV_FILES/'Result_Final_code_clean2025-09-18T00:48:40.584077.csv'
code_eval = pd.read_csv(file)


# Drop rows with any missing values
code_eval = code_eval.dropna(how="any")

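# Task names are assumed to follow the pattern '<dataset>_<metric>_<research problem>-<demographic>',
# which the two splits below rely on (the filters in these scripts use values such as 'adult', 'di', and 'balance').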
task_data_metric = code_eval['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'})
task_data_dem = task_data_metric['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'})
wider_code = pd.concat([code_eval, task_data_metric,task_data_dem],axis=1)
wider_cols=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id','total_llm_score',"1. Data Collection and Processing","2. Bias Detection and Mitigation","3. Fairness Metric Selection","4. Model Selection and Training","5. Evaluation and Testing"]
wider_code = wider_code[wider_cols]

# filtering the adult dataset and the di task_metric
adult= wider_code[wider_code['task_dataset']=='adult']
adult_di=adult[adult['task_metric']=='di']

long_df = adult_di.melt(
    id_vars=['model','task_dataset','task_metric','resrch_prob','dem'],
    value_vars=[
        '1. Data Collection and Processing',
        '2. Bias Detection and Mitigation',
        '3. Fairness Metric Selection',
        '4. Model Selection and Training',
        '5. Evaluation and Testing'
    ],
    var_name='rubric_section',
    value_name='score'
)

sns.set_context(context='poster', font_scale=1.0)
# figure size is controlled by catplot's height/aspect arguments below
m = sns.catplot(
    data=long_df,
    x="rubric_section",
    y="score",
    hue="model",
    col="resrch_prob",
    row='dem',
    kind="bar",
    aspect=2
)
m.set_titles(template='{col_var}: {col_name}\n{row_var}: {row_name}')
for ax in m.axes.flatten():
    plt.setp(ax.get_xticklabels(), rotation=30)
    ax.axhline(y=4.0, color='black', linestyle='-.', alpha=0.3)

output = os.path.join(GRAPHS, 'adult_di_code_llm_eval.png')
plt.savefig(output, dpi=400 , bbox_inches='tight')
64 changes: 64 additions & 0 deletions fairnessbench_analysis/adult_fairness.py
@@ -0,0 +1,64 @@
import os
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from path import CSV_FILES,GRAPHS

# Loading useful dataframes
file = CSV_FILES/'Final_step_perfomance2025-09-18T00:48:26.025263.csv'
perf_alt = pd.read_csv(file)


# Drop rows missing all performance metrics; replace inf with NaN and zero-fill remaining NaNs
perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff',
'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio',
'false_omission_rate_diff']
perf_alt= perf_alt.dropna(subset=perf, how='all')
perf_alt = perf_alt.replace([np.inf, -np.inf], np.nan).fillna(0)

# expanding the task to sub columns
task_data_metric = perf_alt['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'})
wider_code = pd.concat([perf_alt, task_data_metric],axis=1)
wider_cols=['model','task','task_dataset','task_metric','task-dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','score_count']
wider_code = wider_code[wider_cols]

# Filtering only adult dataset from the dataframe
wider_adult = wider_code[wider_code['task_dataset']=='adult']
fairness_metrics= ['di','error_rate_ratio','statistical_parity_diff','equal_opp_diff','error_rate_diff','false_omission_rate_diff']
wider_ADULT = (
wider_adult.groupby(['model','task-dem','task_metric'])[fairness_metrics].mean()
).reset_index()
ad_df= wider_ADULT['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'})
wider_ADULT=pd.concat([wider_ADULT,ad_df],axis=1)
wider_ADULT=wider_ADULT.replace('claude-3-7-sonnet-20250219','claude-3-7-sonnet')

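# Map each task_metric abbreviation to the performance column holding that metric, so every
# group can report the value of the metric its task targets (used by the apply below).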
metric_map = {
'acc': 'acc',
'di': 'di',
'spd': 'statistical_parity_diff',
'eod': 'equal_opp_diff',
'err' : 'error_rate_ratio',
'erd' : 'error_rate_diff',
'ford': 'false_omission_rate_diff',
}
wider_ADULT.loc[:, 'task_metric_value'] = wider_ADULT.apply(lambda row: row[metric_map[row['task_metric']]], axis=1)

sns.set_context(context='poster',font_scale=1.0)
g = sns.catplot(data=wider_ADULT, x='resrch_prob', y='task_metric_value', hue='dem',
                row='task_metric', col='model', kind='bar', aspect=1)

g.set_titles(template='{col_var}: {col_name}\n{row_var}: {row_name}')
# add reference lines to each facet: 1.0 where the metric's ideal value is 1 (ratio metrics), 0.0 where it is 0 (difference metrics)
ax = g.axes
for i in range(ax.shape[1]):  # one column per model
    ax[0, i].axhline(y=1.0, color='black', linestyle='-.', alpha=0.3)
    ax[1, i].axhline(y=0.0, color='black', linestyle='-.', alpha=0.6)
    ax[2, i].axhline(y=0.0, color='black', linestyle='-.', alpha=0.6)
    ax[3, i].axhline(y=1.0, color='black', linestyle='-.', alpha=0.3)
    ax[4, i].axhline(y=0.0, color='black', linestyle='-.', alpha=0.6)
    ax[5, i].axhline(y=0.0, color='black', linestyle='-.', alpha=0.6)

output = os.path.join(GRAPHS, 'adult_fairness.png')
plt.savefig(output,dpi=300, bbox_inches='tight')
63 changes: 63 additions & 0 deletions fairnessbench_analysis/balancing_fairness.py
@@ -0,0 +1,63 @@
import os
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from path import CSV_FILES,GRAPHS

# Loading useful dataframes
file = CSV_FILES/'Final_step_perfomance2025-09-18T00:48:26.025263.csv'
perf_alt = pd.read_csv(file)

# Drop rows missing all performance metrics; replace inf with NaN and zero-fill remaining NaNs
perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff',
'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio',
'false_omission_rate_diff']
perf_alt= perf_alt.dropna(subset=perf, how='all')
perf_alt = perf_alt.replace([np.inf, -np.inf], np.nan).fillna(0)

# expanding the task to sub columns
task_data_metric = perf_alt['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'})
wider_code = pd.concat([perf_alt, task_data_metric],axis=1)
wider_cols=['model','task','task_dataset','task_metric','task-dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','score_count']
wider_code = wider_code[wider_cols]

task_task_dem = wider_code['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'rsch_prob',1:'dem'})
wider = pd.concat([wider_code, task_task_dem],axis=1)
cols= ['model','task','task_dataset','task_metric','task-dem','rsch_prob','dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','score_count']
wider=wider[cols]

# Filtering only balance task from the dataframe
wider_balance = wider[wider['rsch_prob']=='balance']
wider_balance=wider_balance.replace('claude-3-7-sonnet-20250219','claude-3-7-sonnet')

wider_balance= wider_balance.copy()
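# Map task_metric abbreviations to their performance columns so each run reports the metric its task targets.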
metric_map = {
'acc': 'acc',
'di': 'di',
'spd': 'statistical_parity_diff',
'eod': 'equal_opp_diff',
'err' : 'error_rate_ratio',
'erd' : 'error_rate_diff',
'ford': 'false_omission_rate_diff',
}
wider_balance.loc[:, 'task_metric_value'] = wider_balance.apply(lambda row: row[metric_map[row['task_metric']]], axis=1)

sns.set_context(context='poster',font_scale=1.0)
g = sns.relplot(data=wider_balance, x='acc', y='task_metric_value', hue='task_dataset', style='dem',
                row='task_metric', col='model', kind='scatter', aspect=1)
g.set_titles(template='{col_var}: {col_name}\n{row_var}: {row_name}')

# add reference lines: y=1 for the ratio-metric rows, y=0 for the difference-metric rows, and x=1 (perfect accuracy) in every facet
ax = g.axes
for i in range(len(ax)):
    for j in range(len(ax[0])):
        if i in [0, 3]:  # rows showing ratio metrics (ideal value 1)
            ax[i, j].axhline(y=1.0, color='black', linestyle='-.', alpha=0.3)
        elif i in [1, 2, 4, 5]:  # rows showing difference metrics (ideal value 0)
            ax[i, j].axhline(y=0.0, color='black', linestyle='-.', alpha=0.3)
        ax[i, j].axvline(x=1.0, color='black', linestyle='-.', alpha=0.2)

output = os.path.join(GRAPHS,'balancing_fairness.png')
plt.savefig(output,dpi=400,bbox_inches='tight')
54 changes: 54 additions & 0 deletions fairnessbench_analysis/code_log_llm_eval.py
@@ -0,0 +1,54 @@
import os
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from path import CSV_FILES,GRAPHS


# Loading useful dataframes
code_eval = pd.read_csv(CSV_FILES/'Result_Final_code_clean2025-09-18T00:48:40.584077.csv')
log_eval = pd.read_csv(CSV_FILES/'Results_Final_log_clean2025-09-18T00:48:52.486398.csv')
perf_df= pd.read_csv(CSV_FILES/'Final_step_perfomance2025-09-18T00:48:26.025263.csv')


# Remove rows with missing values
code_eval = code_eval.dropna(how="any")
log_eval = log_eval.dropna(how='any')
perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff',
'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio',
'false_omission_rate_diff']
perf_df= perf_df.dropna(subset=perf, how='all')
perf_df = perf_df.fillna(0)

task_data_metric = code_eval['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'})
task_data_dem = task_data_metric['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'})
wider_code = pd.concat([code_eval, task_data_metric,task_data_dem],axis=1)
wider_cols=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id','final_flake8_score',"1. Data Collection and Processing","2. Bias Detection and Mitigation","3. Fairness Metric Selection","4. Model Selection and Training","5. Evaluation and Testing"]
wider_code = wider_code[wider_cols]

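# Reshape to long format (one row per run and rubric section) so seaborn can colour the bars by rubric section.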
score_cols = ["1. Data Collection and Processing","2. Bias Detection and Mitigation","3. Fairness Metric Selection","4. Model Selection and Training","5. Evaluation and Testing"]
code_tall = wider_code.melt(id_vars=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id'],
                            value_vars=score_cols, var_name='rubric_section', value_name='score')
output = os.path.join(GRAPHS, 'codeval.png')
sns.catplot(code_tall, col='model', row='resrch_prob', x='task_dataset', y='score', hue='rubric_section', kind='bar').savefig(output)


# log eval
task_data_metric = log_eval['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'})
task_data_dem = task_data_metric['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'})
wider_log = pd.concat([log_eval, task_data_metric,task_data_dem],axis=1)
wider_cols=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id',"1. Model Overview", "2. Stakeholder Identification and Fairness Definition","3. Data Collection and Processing","4. Bias Detection and Mitigation","5. Fairness Metric Selection","6. Model Selection and Training","7. Evaluation and Testing"]
wider_log = wider_log[wider_cols]

score_cols = ["1. Model Overview", "2. Stakeholder Identification and Fairness Definition","3. Data Collection and Processing","4. Bias Detection and Mitigation","5. Fairness Metric Selection","6. Model Selection and Training","7. Evaluation and Testing"]
log_tall = wider_log.melt(id_vars=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id'],
                          value_vars=score_cols, var_name='rubric_section', value_name='score')

output = os.path.join(GRAPHS, 'logval.png')
sns.catplot(log_tall, col='model', x='resrch_prob', row='task_dataset', y='score', hue='rubric_section', kind='bar').savefig(output)

57 changes: 57 additions & 0 deletions fairnessbench_analysis/correlation_flake8_code.py
@@ -0,0 +1,57 @@
import os
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from path import CSV_FILES,GRAPHS

# Loading useful dataframes
file= CSV_FILES/'Result_Final_code_clean2025-09-18T00:48:40.584077.csv'
code_eval = pd.read_csv(file)

# Removing missing rows
code_eval= code_eval.dropna(how="any")

task_data_metric = code_eval['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'})
task_data_dem = task_data_metric['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'})
wider_code = pd.concat([code_eval, task_data_metric,task_data_dem],axis=1)
wider_cols=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id','total_llm_score',"1. Data Collection and Processing","2. Bias Detection and Mitigation","3. Fairness Metric Selection","4. Model Selection and Training","5. Evaluation and Testing",'final_flake8_score']
wider_code = wider_code[wider_cols]

# Correlation between the flake8 score and the LLM code-eval rubric sections for the claude / adult / balance / erd group
code_cols=['1. Data Collection and Processing','2. Bias Detection and Mitigation','3. Fairness Metric Selection','4. Model Selection and Training', '5. Evaluation and Testing']
group_cols = ["model", "task_dataset", "resrch_prob", "task_metric"]  # add 'dem' for a per-demographic breakdown if needed

def flake8_corr_matrix(group):
    # Compute the correlation between the flake8 score and each rubric section
    corrs = [group["final_flake8_score"].corr(group[rubric]) for rubric in code_cols]
    return pd.Series(corrs, index=code_cols)

corrs = (
    wider_code.groupby(group_cols)
    .apply(flake8_corr_matrix)
    .reset_index()
)
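# Correlation is NaN for groups where either series is constant; treat those as zero correlation.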
corrs=corrs.fillna(0)

group_filter = (
(corrs['model'] == 'claude-3-7-sonnet-20250219') &
(corrs['task_dataset'] == 'adult') &
(corrs['resrch_prob'] == 'balance') &
(corrs['task_metric'] == 'erd')
)
corr_row = corrs.loc[group_filter, code_cols]

plt.figure(figsize=(8, 2))
sns.heatmap(
corr_row.values.reshape(1, -1),
annot=True,
cmap='coolwarm',
xticklabels=code_cols,
yticklabels=['Flake8 score']
)
plt.title("Flake8 vs Rubric Correlation (claude-3, adult, balance, erd)")

output= os.path.join(GRAPHS,'flake8_vs_code_correlation.png')
plt.savefig(output,bbox_inches='tight')
20 changes: 20 additions & 0 deletions fairnessbench_analysis/cv_scores_evalmodels.py
@@ -0,0 +1,20 @@
import os
import pandas as pd
from path import FILES

gemma_df = pd.read_csv(FILES/'Gemma_cv.csv')
deepseek_df= pd.read_csv(FILES/'Deepseek_cv.csv')
granite_df=pd.read_csv(FILES/'Granite_cv.csv')

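# Tag each CV-score table with the evaluation model it came from before combining them.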
gemma_df['eval'] = 'gemma'
deepseek_df['eval'] = 'deepseek'
granite_df['eval'] = 'granite'

cols = ['eval', 'model', 'task'] + [c for c in gemma_df.columns if c not in ['eval', 'model', 'task']]
gemma_df = gemma_df[cols]
deepseek_df = deepseek_df[cols]
granite_df = granite_df[cols]

all_eval_cv = pd.concat([gemma_df, deepseek_df, granite_df], axis=0, ignore_index=True)

all_eval_cv.to_csv('cv_scores_evalmodel.csv',index=False)
48 changes: 48 additions & 0 deletions fairnessbench_analysis/di_across_datasets.py
@@ -0,0 +1,48 @@
import os
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from path import CSV_FILES,GRAPHS

# Loading useful dataframes
file= CSV_FILES/'Final_step_perfomance2025-09-18T00:48:26.025263.csv'
perf_alt = pd.read_csv(file)

# Drop rows missing all performance metrics; replace inf with NaN and zero-fill remaining NaNs
perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff',
'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio',
'false_omission_rate_diff']
perf_alt= perf_alt.dropna(subset=perf, how='all')
perf_alt = perf_alt.replace([np.inf, -np.inf], np.nan).fillna(0)

# expanding the task to sub columns
task_data_metric = perf_alt['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'})
wider_code = pd.concat([perf_alt, task_data_metric],axis=1)
wider_cols=['model','task','task_dataset','task_metric','task-dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','score_count']
wider_code = wider_code[wider_cols]

# Filtering only DI tasks from the dataframe
wider_di = wider_code[wider_code['task_metric']=='di']
dem_df = wider_di['task-dem'].str.split('-').apply(pd.Series).rename(columns={0:'resrch_prob',1:'dem'})
wider_DI = pd.concat([wider_di, dem_df], axis=1)
wider_DI = wider_DI.replace('claude-3-7-sonnet-20250219','claude-3-7-sonnet')

# plotting the DI vs accuracy scatter plot
sns.set_context(context='poster',font_scale=0.8)
g=sns.relplot(data=wider_DI,x='acc',y='di',hue='task_dataset',row='resrch_prob',style='dem',col='model',kind='scatter',aspect=1)
g.set_titles(template='{col_var}: {col_name}\n{row_var}: {row_name}')

# add reference lines to each facet: di=1 (parity) and acc=1 (perfect accuracy)
for ax in g.axes.flat:
    ax.axhline(y=1.0, color='black', linestyle='-.', alpha=0.2)
    ax.axvline(x=1.0, color='black', linestyle='-.', alpha=0.2)

# saving the plot
output= os.path.join(GRAPHS,'di_vs_acc_scatter.png')

plt.savefig(output, dpi=300, bbox_inches='tight')