1+ import os
2+ import sys
3+ import torch
4+ import random
5+ import argparse
6+ import numpy as np
7+ import pandas as pd
8+ from LIC import LIC
9+ from utils .text import CaptionProcessor
10+ from utils .datacreator_race import CaptionRaceDataset
11+ from attackerModels .NetModel import LSTM_ANN_Model , RNN_ANN_Model
12+
# --- Runtime / environment setup ---------------------------------------------

# Deterministic cuDNN kernels for reproducible GPU runs.
torch.backends.cudnn.deterministic = True

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Make project modules importable.
# NOTE(review): this append runs after the project imports above — confirm it
# is still needed, or move it before `from LIC import LIC`.
sys.path.append("/home/nshah96/DIC")
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

# Report which accelerator (if any) we ended up on.
cuda_available = torch.cuda.is_available()
print("GPU Available:", cuda_available)
if cuda_available:
    active_gpu = torch.cuda.current_device()
    print("Current GPU:", torch.cuda.get_device_name(active_gpu))
    print("GPU Device Index:", active_gpu)
    print("Total GPUs:", torch.cuda.device_count())

# Thresholds swept in contextual mode: 0.55, 0.60, 0.65, 0.70, 0.75.
contextual_thresholds = [round(step * 0.05, 2) for step in range(11, 16)]

# Race-indicative words that the caption processor masks with `race_token`.
race_words = [
    "white",
    "caucasian",
    "black",
    "african",
    "asian",
    "latino",
    "latina",
    "latinx",
    "hispanic",
    "native",
    "indigenous",
]
race_token = "race"
45+
46+
def calculate_lic(data_obj, processor, lic_model, mode="non-contextual", threshold=0.5):
    """Run the LIC attacker over the combined human/model captions and return its score.

    Args:
        data_obj: dataset object exposing ``getDataCombined()`` with columns
            ``caption_human``, ``caption_model`` and ``race``.
        processor: caption processor (currently unused here — kept for
            interface compatibility with callers).
        lic_model: LIC instance exposing ``getAmortizedLeakage``.
        mode: label used only in log messages ("contextual"/"non-contextual").
        threshold: contextual threshold; NOTE(review): only logged, never
            passed to the LIC model — confirm whether it should affect the run.

    Returns:
        Whatever ``lic_model.getAmortizedLeakage`` returns (the LIC score).
    """
    print(
        f"\nCalculating LIC for mode: {mode}, Threshold: {threshold if threshold else 'N/A'}"
    )

    combined = data_obj.getDataCombined()
    print("\nLoaded Combined Dataset:")
    print(f"Total Samples: {len(combined)}")

    # Split out the two caption columns and the race label as a (N, 1) tensor.
    human_captions = combined["caption_human"]
    model_captions = combined["caption_model"]
    race_labels = torch.tensor(
        combined["race"].values, dtype=torch.float, device=device
    ).reshape(-1, 1)

    print("\nPreprocessing Captions...")

    # Amortized leakage over human vs. model captions (unnormalized).
    score = lic_model.getAmortizedLeakage(
        race_labels, human_captions, model_captions, normalized=False
    )
    print(f"\nLIC Score for mode {mode}, Threshold {threshold}: {score}")
    return score
72+
73+
def main():
    """CLI entry point: compute LIC scores for race and save them to CSV.

    Parses arguments, seeds all RNGs, builds the dataset / caption processor /
    attacker model, then runs LIC once (non-contextual) or once per contextual
    threshold, writing all results to ``--output_file``.
    """
    parser = argparse.ArgumentParser(
        description="Test LIC and Contextual LIC calculations for race"
    )
    parser.add_argument(
        "--human_path", required=True, help="Path to human annotations pickle file"
    )
    parser.add_argument(
        "--model_path", required=True, help="Path to model annotations pickle file"
    )
    parser.add_argument(
        "--glove_path",
        required=True,
        help="Path to GloVe embeddings in word2vec format",
    )
    parser.add_argument(
        "--output_file", default="lic_scores_race.csv", help="Output file to save LIC scores"
    )
    parser.add_argument(
        "--mode",
        required=True,
        choices=["contextual", "non-contextual"],
        help="Choose mode: 'contextual' or 'non-contextual'",
    )
    parser.add_argument(
        "--use_rnn", action="store_true", help="Use RNN instead of LSTM"
    )
    parser.add_argument(
        "--bidirectional", action="store_true", help="Use bidirectional LSTM/RNN"
    )
    parser.add_argument(
        "--seed",
        type=int,  # BUG FIX: without type=int a CLI-supplied seed is a str and np.random.seed(str) raises TypeError
        default=0,
        help="Set random seed for the experiment. Helps ensure reproducability.",
    )
    args = parser.parse_args()

    # Seed every RNG in play for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Dataset of paired human/model captions, plus a processor that masks the
    # race words with the race token (the "gender_*" parameter names are the
    # processor's generic attribute-word interface).
    data_obj = CaptionRaceDataset(args.human_path, args.model_path)
    processor = CaptionProcessor(
        gender_words=race_words,
        obj_words=[],
        glove_path=args.glove_path,
        tokenizer="nltk",
        gender_token=race_token,
    )

    # Select the attacker architecture. The two hyperparameter dicts were
    # identical apart from the "lstm_"/"rnn_" key prefixes, so build one dict
    # from the chosen prefix instead of duplicating it.
    if args.use_rnn:
        model_type = RNN_ANN_Model
        prefix = "rnn"
    else:
        model_type = LSTM_ANN_Model
        prefix = "lstm"
    model_params = {
        "embedding_dim": 250,
        "pad_idx": 0,
        f"{prefix}_hidden_size": 256,
        f"{prefix}_num_layers": 2,
        f"{prefix}_bidirectional": args.bidirectional,
        "ann_output_size": 1,
        "num_ann_layers": 5,
        "ann_numFirst": 64,
    }

    # Initialize LIC with the chosen attacker and fixed training settings.
    lic_model = LIC(
        model_params={
            "attacker_class": model_type,
            "attacker_params": model_params,
        },
        train_params={
            "learning_rate": 0.01,
            "loss_function": "bce",
            "epochs": 50,
            "batch_size": 1024,
        },
        gender_words=race_words,
        obj_words=[],
        gender_token=race_token,
        obj_token="obj",
        glove_path=args.glove_path,
        device=device,
        eval_metric="bce",
    )

    # One result row per LIC run.
    results = []

    if args.mode == "non-contextual":
        non_contextual_lic = calculate_lic(
            data_obj, processor, lic_model, mode="non-contextual"
        )
        results.append(
            {
                "mode": "non-contextual",
                "threshold": "N/A",
                "lic_score_mean": non_contextual_lic["Mean"].item(),
                "lic_score_std_dev": non_contextual_lic["std"].item(),
                "Number of Trials": non_contextual_lic["num_trials"],
            }
        )

    elif args.mode == "contextual":
        # Sweep every contextual threshold and record a row for each.
        for threshold in contextual_thresholds:
            contextual_lic = calculate_lic(
                data_obj, processor, lic_model, mode="contextual", threshold=threshold
            )
            results.append(
                {
                    "mode": "contextual",
                    "threshold": threshold,
                    "lic_score_mean": contextual_lic["Mean"].item(),
                    "lic_score_std_dev": contextual_lic["std"].item(),
                    "Number of Trials": contextual_lic["num_trials"],
                }
            )

    # Save results to CSV. BUG FIX: the old string-split logic produced
    # output_dir == "" for a bare filename (the default!), and os.makedirs("")
    # raises FileNotFoundError; os.path.dirname + a truthiness guard with
    # exist_ok=True handles bare names, existing dirs, and races.
    output_dir = os.path.dirname(args.output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    results_df = pd.DataFrame(results)
    results_df.to_csv(args.output_file, index=False)
    print(f"\nResults saved to {args.output_file}")
212+
213+
# Run the CLI only when executed as a script (not on import).
if __name__ == "__main__":
    main()
# (removed web-scrape artifact: "0 commit comments")