diff --git a/data/batch_data.py b/data/batch_data.py new file mode 100644 index 0000000..49c5f22 --- /dev/null +++ b/data/batch_data.py @@ -0,0 +1,83 @@ +import dgl +import networkx as nx +from utils.query_model import query_trained_model +from data.generate_xy import generate_attack_xy_node,generate_attack_xy_graph,generate_attack_xy_node_graph, generate_attack_xy, generate_attack_xy_plus, generate_attack_xy_all, generate_attack_xy_plus2 + + +def get_batch(args, batch_pairs, g, k, i, label, mode): + + query_graph_batch, index_mapping_dict_batch = get_khop_query_graph_batch(g, batch_pairs, k=k) + index_update_batch = [node for _, nodes in index_mapping_dict_batch.items() for node in nodes] + posteriors_dict_batch = query_trained_model(args, index_update_batch, query_graph_batch, mode) + + print('Finish generating posteriors and mapping dict...') + return posteriors_dict_batch, index_mapping_dict_batch + + +def get_batch_posteriors(args, batch_pairs, g, k, i, label, mode): + posteriors_dict_batch, index_mapping_dict_batch = get_batch(args, batch_pairs, g, k, i, label, mode) + batch_features, batch_labels, batch_stat_dict = generate_attack_xy(args, batch_pairs, posteriors_dict_batch, label, index_mapping_dict_batch) + return batch_features, batch_labels, batch_stat_dict + + +def get_batch_posteriors_node(args, batch_pairs, g, k, i, label, mode): + posteriors_dict_batch, index_mapping_dict_batch = get_batch(args, batch_pairs, g, k, i, label, mode) + batch_node_features, batch_posteriors_features, batch_labels, batch_stat_dict = generate_attack_xy_plus(args, g, batch_pairs, posteriors_dict_batch, label, index_mapping_dict_batch) + return batch_node_features, batch_posteriors_features, batch_labels, batch_stat_dict + + +def get_batch_posteriors_graph(args, batch_pairs, g, k, i, label, mode): + posteriors_dict_batch, index_mapping_dict_batch = get_batch(args, batch_pairs, g, k, i, label, mode) + batch_posteriors_features, batch_graph_features, batch_labels, batch_stat_dict = generate_attack_xy_plus2(args, g, batch_pairs, posteriors_dict_batch, label, mode, index_mapping_dict_batch) + return batch_posteriors_features, batch_graph_features, batch_labels, batch_stat_dict + + +def get_batch_posteriors_node_graph(args, batch_pairs, g, k, i, label, mode): + posteriors_dict_batch, index_mapping_dict_batch = get_batch(args, batch_pairs, g, k, i, label, mode) + batch_node_features, batch_posteriors_features, batch_graph_features, batch_labels, batch_stat_dict = generate_attack_xy_all(args, g, batch_pairs, posteriors_dict_batch, label, mode, index_mapping_dict_batch) + return batch_node_features, batch_posteriors_features, batch_graph_features, batch_labels, batch_stat_dict + +def get_batch_node_only(args, batch_pairs, g, i, label, mode): + + batch_node_features, batch_labels, batch_stat_dict = generate_attack_xy_node(args, g, batch_pairs, label) + return batch_node_features, batch_labels, batch_stat_dict + +def get_batch_graph_only(args, batch_pairs, g, i, label, mode): + # Skip posteriors_dict and mapping: Not needed for graph-only + batch_graph_features, batch_labels, batch_stat_dict = generate_attack_xy_graph(args, g, batch_pairs, label, mode) + return batch_graph_features, batch_labels, batch_stat_dict + +def get_batch_node_graph_only(args, batch_pairs, g, i, label, mode): + # Skip posteriors_dict and mapping: Not needed for node+graph only + batch_node_features, batch_graph_features, batch_labels, batch_stat_dict = generate_attack_xy_node_graph(args, g, batch_pairs, label, mode) + return 
batch_node_features, batch_graph_features, batch_labels, batch_stat_dict + + + +def get_khop_query_graph_batch(g, pairs, k=2): + + nx_g = dgl.to_networkx(g, node_attrs=["features"]) + + subgraph_list = [] + index_mapping_dict = {} + bias = 0 + + for pair in pairs: + start_node = pair[0] + end_node = pair[1] + nx_g.remove_edges_from([(start_node, end_node), (end_node, start_node)]) + node_index = [] + for node in (start_node, end_node): + node_neighbor = list(nx.ego.ego_graph(nx_g, n=node, radius=k).nodes()) + node_neighbor_num = len(node_neighbor) + node_new_index = node_neighbor.index(node) + subgraph_k_hop = g.subgraph(node_neighbor) + subgraph_list.append(subgraph_k_hop) + node_index.append(node_new_index + bias) + bias += node_neighbor_num + nx_g.add_edges_from([(start_node, end_node), (end_node, start_node)]) + index_mapping_dict[(start_node, end_node)] = (node_index[0], node_index[1]) + update_query_graph = dgl.batch([row for row in subgraph_list]) + + print("Get k-hop query graph") + return update_query_graph, index_mapping_dict \ No newline at end of file diff --git a/data/generate_xy.py b/data/generate_xy.py new file mode 100644 index 0000000..533c257 --- /dev/null +++ b/data/generate_xy.py @@ -0,0 +1,335 @@ +from tqdm import tqdm +import numpy as np +import torch as th +import torch.nn.functional as F +from scipy.spatial import distance + +from utils.tradition_metrics import get_features +np.random.seed(0) +th.manual_seed(0) +th.backends.cudnn.deterministic = True +th.backends.cudnn.benchmark = False + + +def entropy(P): + """ Epsilon is used here to avoid conditional code for + checking that neither P nor Q is equal to 0. """ + epsilon = 0.00001 + P = P + epsilon + entropy_value = -np.sum(P * np.log(P)) + return np.array([entropy_value]) + + +def js_divergence(a, b): + return distance.jensenshannon(a, b, 2.0) + + +def cosine_sim(a, b): + return 1 - distance.cosine(a, b) + + +def correlation_dist(a, b): + return distance.correlation(a, b) + + +def pair_wise(a, b, edge_feature): + if edge_feature == 'simple': + return np.concatenate([a, b]) + elif edge_feature == 'add': + return a + b + elif edge_feature == 'hadamard': + return a * b + elif edge_feature == 'average': + return (a + b) / 2 + elif edge_feature == 'l1': + return abs(a - b) + elif edge_feature == 'l2': + return abs((a - b) * (a - b)) + elif edge_feature == 'all': + hadamard = a * b + average = (a + b) / 2 + weighted_l1 = abs(a - b) + weighted_l2 = abs((a - b) * (a - b)) + return np.concatenate([hadamard, average, weighted_l1, weighted_l2]) + + +def sim_metrics(a, b): + a_entropy = entropy(a) + b_entropy = entropy(b) + entr_feature = pair_wise(a_entropy, b_entropy, 'all') + sim_feature = np.array([js_divergence(a, b), cosine_sim(a,b), correlation_dist(a, b)]) + return np.concatenate([entr_feature, sim_feature]) + +def generate_attack_xy_node(args, g, pairs, label): + features = g.ndata['features'] + node_features = [] + stat_dict = {} + labels = [label] * len(pairs) + + for (start_id, end_id) in (pairs): + start_feature = features[start_id].cpu().numpy() + end_feature = features[end_id].cpu().numpy() + + if args.diff: + start_entropy = entropy(start_feature) + end_entropy = entropy(end_feature) + node_feature = sim_metrics(start_entropy, end_entropy) + node_feature[np.isnan(node_feature)] = 0 + else: + node_feature = pair_wise(start_feature, end_feature, 'hadamard') + + node_features.append(node_feature) + stat_dict[(start_id, end_id)] = { + 'node_ids': (start_id, end_id), + f'{args.node_topology}_node_feature': 
node_feature,
+            'label': label
+        }
+
+    print("Node features and labels of %d pairs have been generated" % len(labels))
+    return node_features, labels, stat_dict
+
+def generate_attack_xy_graph(args, g, pairs, label, mode):
+    graph_features = []
+    stat_dict = {}
+    jaccard_dict, attach_dict, neighbor_dict = get_features(args, g, pairs, label, mode)
+    labels = [label] * len(pairs)
+
+    for (start_id, end_id) in (pairs):
+        graph_feature = [
+            jaccard_dict[(start_id, end_id)],
+            attach_dict[(start_id, end_id)],
+            neighbor_dict[(start_id, end_id)]
+        ]
+        graph_features.append(graph_feature)
+
+        stat_dict[(start_id, end_id)] = {
+            'node_ids': (start_id, end_id),
+            f'{args.node_topology}_graph_feature': graph_feature,
+            'label': label
+        }
+
+    print("Graph features and labels of %d pairs have been generated" % len(labels))
+    return graph_features, labels, stat_dict
+
+def generate_attack_xy_node_graph(args, g, pairs, label, mode):
+    features = g.ndata['features']
+    node_features = []
+    graph_features = []
+    stat_dict = {}
+
+    jaccard_dict, attach_dict, neighbor_dict = get_features(args, g, pairs, label, mode)
+    labels = [label] * len(pairs)
+
+    for (start_id, end_id) in tqdm(pairs):
+        # Node feature
+        start_feature = features[start_id].cpu().numpy()
+        end_feature = features[end_id].cpu().numpy()
+
+        if args.diff:
+            start_entropy = entropy(start_feature)
+            end_entropy = entropy(end_feature)
+            node_feature = sim_metrics(start_entropy, end_entropy)
+            node_feature[np.isnan(node_feature)] = 0
+        else:
+            node_feature = pair_wise(start_feature, end_feature, 'hadamard')
+        node_features.append(node_feature)
+
+        # Graph feature
+        graph_feature = [
+            jaccard_dict[(start_id, end_id)],
+            attach_dict[(start_id, end_id)],
+            neighbor_dict[(start_id, end_id)]
+        ]
+        graph_features.append(graph_feature)
+
+        # Stat dict
+        stat_dict[(start_id, end_id)] = {
+            'node_ids': (start_id, end_id),
+            'start_node_feature': start_feature,
+            'end_node_feature': end_feature,
+            'node_feature': node_feature,
+            'graph_feature': graph_feature,
+            'label': label
+        }
+
+    print("Node + graph features and labels of %d pairs have been generated" % len(labels))
+    return node_features, graph_features, labels, stat_dict
+
+
+
+
+def generate_attack_xy(args, pairs, posteriors_dict, label, index_mapping_dict=None):
+    features = []
+    stat_dict = {}
+    for (start_id, end_id) in pairs:
+        if index_mapping_dict:
+            new_start_id, new_end_id = index_mapping_dict[(start_id, end_id)]
+            start_posterior = F.softmax(posteriors_dict[new_start_id], dim=0)
+            end_posterior = F.softmax(posteriors_dict[new_end_id], dim=0)
+        else:
+            start_posterior = F.softmax(posteriors_dict[start_id], dim=0)
+            end_posterior = F.softmax(posteriors_dict[end_id], dim=0)
+        if args.label_only:
+            start_posterior, end_posterior = start_posterior.numpy(), end_posterior.numpy()
+            label_dim = len(start_posterior)
+            start_label = np.eye(label_dim)[np.argmax(start_posterior)]
+            end_label = np.eye(label_dim)[np.argmax(end_posterior)]
+            feature = pair_wise(start_label, end_label, 'add')
+        elif args.diff:
+            start_posterior, end_posterior = start_posterior.numpy(), end_posterior.numpy()
+            feature = sim_metrics(start_posterior, end_posterior)
+            feature[np.isnan(feature)] = 0
+        elif args.soft_prob:
+            start_posterior = F.softmax(start_posterior/args.T, dim=0).numpy()
+            end_posterior = F.softmax(end_posterior/args.T, dim=0).numpy()
+            feature = pair_wise(start_posterior, end_posterior, args.edge_feature)
+        else:
+            start_posterior, end_posterior = start_posterior.numpy(), end_posterior.numpy()
+            feature =
pair_wise(start_posterior, end_posterior, args.edge_feature) + stat_dict[(start_id, end_id)] = {'node_ids':(start_id, end_id), f'{args.node_topology}_start_posterior': start_posterior, f'{args.node_topology}_end_posterior': end_posterior, f'{args.node_topology}_posterior_feature': feature, 'label': label} + features.append(feature) + print(start_posterior) + labels = [label] * len(features) + + print("features and labels of %d pairs have been generated" % len(labels)) + + return features, labels, stat_dict + + +def generate_attack_xy_plus(args, g, pairs, posteriors_dict, label, index_mapping_dict=None): + features = g.ndata['features'] + node_features = [] + posterior_features = [] + stat_dict = {} + labels = [label] * len(pairs) + for (start_id, end_id) in tqdm(pairs): + if index_mapping_dict: + new_start_id, new_end_id = index_mapping_dict[(start_id, end_id)] + start_posterior = F.softmax(posteriors_dict[new_start_id], dim=0 ).numpy() + end_posterior = F.softmax(posteriors_dict[new_end_id], dim=0 ).numpy() + else: + start_posterior = F.softmax(posteriors_dict[start_id], dim=0).numpy() + end_posterior = F.softmax(posteriors_dict[end_id], dim=0).numpy() + if args.label_only: + label_dim = len(start_posterior) + start_label = np.eye(label_dim)[np.argmax(start_posterior)] + end_label = np.eye(label_dim)[np.argmax(end_posterior)] + posterior_feature = pair_wise(start_label, end_label, 'add') + elif args.diff: + start_entropy = entropy(start_posterior) + end_entropy = entropy(end_posterior) + posterior_feature = sim_metrics(start_entropy, end_entropy) + posterior_feature[np.isnan(posterior_feature)] = 0 + else: + posterior_feature = pair_wise(start_posterior, end_posterior, args.edge_feature) + posterior_features.append(posterior_feature) + + start_feature = features[start_id].cpu().numpy() + end_feature = features[end_id].cpu().numpy() + if args.diff: + start_entropy = entropy(start_feature) + end_entropy = entropy(end_feature) + node_feature = sim_metrics(start_entropy, end_entropy) + node_feature[np.isnan(node_feature)] = 0 + else: + node_feature = pair_wise(start_feature, end_feature, 'hadamard') + node_features.append(node_feature) + + stat_dict[(start_id, end_id)] = {'node_ids':(start_id, end_id), f'{args.node_topology}_start_posterior': start_posterior, f'{args.node_topology}_end_posterior': end_posterior, f'{args.node_topology}_posterior_feature': posterior_feature, 'label': label} + + print("features and labels of %d pairs have been generated" % len(labels)) + + return node_features, posterior_features, labels, stat_dict + +def generate_attack_xy_plus2(args, g, pairs, posteriors_dict, label, mode, index_mapping_dict=None): + posterior_features = [] + graph_features = [] + stat_dict = {} + jaccard_dict, attach_dict, neighbor_dict = get_features(args, g, pairs, label, mode) + labels = [label] * len(pairs) + for (start_id, end_id) in tqdm(pairs): + if index_mapping_dict: + new_start_id, new_end_id = index_mapping_dict[(start_id, end_id)] + start_posterior = F.softmax(posteriors_dict[new_start_id], dim=0 ).numpy() + end_posterior = F.softmax(posteriors_dict[new_end_id], dim=0 ).numpy() + else: + start_posterior = F.softmax(posteriors_dict[start_id], dim=0).numpy() + end_posterior = F.softmax(posteriors_dict[end_id], dim=0).numpy() + + if args.label_only: + label_dim = len(start_posterior) + start_label = np.eye(label_dim)[np.argmax(start_posterior)] + end_label = np.eye(label_dim)[np.argmax(end_posterior)] + posterior_feature = pair_wise(start_label, end_label, 'add') + elif args.diff: + 
start_entropy = entropy(start_posterior) + end_entropy = entropy(end_posterior) + posterior_feature = sim_metrics(start_entropy, end_entropy) + posterior_feature[np.isnan(posterior_feature)] = 0 + else: + posterior_feature = pair_wise(start_posterior, end_posterior, args.edge_feature) + posterior_features.append(posterior_feature) + + + graph_feature = [jaccard_dict[(start_id, end_id)], attach_dict[(start_id, end_id)], neighbor_dict[(start_id, end_id)]] + graph_features.append(graph_feature) + + stat_dict[(start_id, end_id)] = {'node_ids':(start_id, end_id), f'{args.node_topology}_start_posterior': start_posterior, f'{args.node_topology}_end_posterior': end_posterior, f'{args.node_topology}_posterior_feature': posterior_feature, 'label': label} + + print("features and labels of %d pairs have been generated" % len(labels)) + + return posterior_features, graph_features, labels, stat_dict + + +def generate_attack_xy_all(args, g, pairs, posteriors_dict, label, mode, index_mapping_dict=None): + features = g.ndata['features'] + node_features = [] + posterior_features = [] + graph_features = [] + stat_dict = {} + jaccard_dict, attach_dict, neighbor_dict = get_features(args, g, pairs, label, mode) + labels = [label] * len(pairs) + for (start_id, end_id) in tqdm(pairs): + if index_mapping_dict: + new_start_id, new_end_id = index_mapping_dict[(start_id, end_id)] + start_posterior = F.softmax(posteriors_dict[new_start_id], dim=0 ).numpy() + end_posterior = F.softmax(posteriors_dict[new_end_id], dim=0 ).numpy() + else: + start_posterior = F.softmax(posteriors_dict[start_id], dim=0).numpy() + end_posterior = F.softmax(posteriors_dict[end_id], dim=0).numpy() + + if args.label_only: + label_dim = len(start_posterior) + start_label = np.eye(label_dim)[np.argmax(start_posterior)] + end_label = np.eye(label_dim)[np.argmax(end_posterior)] + posterior_feature = pair_wise(start_label, end_label, 'add') + elif args.diff: + start_entropy = entropy(start_posterior) + end_entropy = entropy(end_posterior) + posterior_feature = sim_metrics(start_entropy, end_entropy) + posterior_feature[np.isnan(posterior_feature)] = 0 + else: + posterior_feature = pair_wise(start_posterior, end_posterior, args.edge_feature) + + posterior_features.append(posterior_feature) + + start_feature = features[start_id].cpu().numpy() + end_feature = features[end_id].cpu().numpy() + if args.diff: + start_entropy = entropy(start_feature) + end_entropy = entropy(end_feature) + node_feature = sim_metrics(start_entropy, end_entropy) + node_feature[np.isnan(node_feature)] = 0 + else: + node_feature = pair_wise(start_feature, end_feature, 'hadamard') + node_features.append(node_feature) + + graph_feature = [jaccard_dict[(start_id, end_id)], attach_dict[(start_id, end_id)], neighbor_dict[(start_id, end_id)]] + graph_features.append(graph_feature) + + stat_dict[(start_id, end_id)] = {'node_ids':(start_id, end_id), f'{args.node_topology}_start_posterior': start_posterior, f'{args.node_topology}_end_posterior': end_posterior, f'{args.node_topology}_posterior_feature': posterior_feature, 'start_node_feature': start_feature, 'end_node_feature': end_feature, 'node_feature':node_feature, 'graph_feature': graph_feature, 'label': label} + + print("features and labels of %d pairs have been generated" % len(labels)) + + return node_features, posterior_features, graph_features, labels, stat_dict + diff --git a/data/inductive_split.py b/data/inductive_split.py new file mode 100644 index 0000000..6a05aac --- /dev/null +++ b/data/inductive_split.py @@ -0,0 +1,634 
@@ + +import numpy as np +import torch as th +import dgl +import os +from tqdm import tqdm +from multiprocessing import Pool +import sys + +from utils.query_model import query_trained_model +from data.batch_data import get_batch_posteriors, get_batch_posteriors_node, get_batch_posteriors_graph, get_batch_posteriors_node_graph,get_batch_node_only,get_batch_graph_only,get_batch_node_graph_only +from data.generate_xy import generate_attack_xy, generate_attack_xy_plus,generate_attack_xy_node,generate_attack_xy_graph,generate_attack_xy_node_graph + +np.random.seed(0) +th.manual_seed(0) +th.backends.cudnn.deterministic = True +th.backends.cudnn.benchmark = False + + +def generate_pairs(g, train_index): + start_ids, end_ids = g.edges() + postive_pairs = [] + negative_pairs = [] + for i in tqdm(range(len(start_ids))): + if start_ids[i] < end_ids[i]: + postive_pairs.append((start_ids[i].item(), end_ids[i].item())) + + num_pos_pairs = len(postive_pairs) + print("There are %d edges in the training graph!" % (num_pos_pairs)) + while True: + a, b = np.random.choice(list(train_index), 2, replace=False) + random_pair = (a, b) if a < b else (b, a) + if random_pair not in postive_pairs: + negative_pairs.append(random_pair) + if len(negative_pairs) == num_pos_pairs: + break + print("Finish Generating Pairs!") + return postive_pairs, negative_pairs + + +def make_dirs(): + os.makedirs('./data/pairs/', exist_ok=True) + os.makedirs('./data/posteriors/', exist_ok=True) + os.makedirs('./data/mapping/', exist_ok=True) + + +def remove_neighbor_edge(g): + """ + Remove all edges from a graph, only save self connection + """ + start_ids, end_ids = g.edges() + delete_eid = [] + for i in tqdm(range(len(start_ids))): + if start_ids[i] != end_ids[i]: + delete_eid.append(i) + g = dgl.remove_edges(g, th.tensor(delete_eid)) + + return g + + +def normalized(feature, scaler, mode): + if mode == 'shadow': + feature_scaled = scaler.fit_transform(feature) + return feature_scaled, scaler + else: + feature_scaled = scaler.transform(feature) + return feature_scaled, scaler + + +def inductive_split_posteriors(args, train_g, test_g): + make_dirs() + dataloaders = [] + stat_dicts = [] + count = 0 + + + + for g in (train_g, test_g): + if args.prop: + mode = 'shadow'+ str(args.prop) if count == 0 else 'target' + else: + mode = 'shadow' if count == 0 else 'target' + if args.diff: + args.dataset = args.target_dataset if mode == 'target' else args.shadow_dataset + model = args.target_model if mode == 'target' else args.shadow_model + index = np.arange(len(g.nodes())) + stat_dict = {} + + positive_pairs, negative_pairs = generate_pairs(g, index) + print(f"Finish Generating Pairs...") + + if args.node_topology == '0-hop': + zero_hop_g = remove_neighbor_edge(g) + + posteriors_dict = query_trained_model(args, index, zero_hop_g, mode) + print("Finish Generating Posteriors Dict...") + + positive_features, positive_labels, positive_stat_dict = generate_attack_xy(args, positive_pairs, posteriors_dict, 1) + negative_features, negative_labels, negative_stat_dict = generate_attack_xy(args, negative_pairs, posteriors_dict, 0) + + stat_dict = {**positive_stat_dict, **negative_stat_dict} + features = positive_features + negative_features + labels = positive_labels + negative_labels + + elif args.node_topology == '1-hop' or args.node_topology == '2-hop': + k = 1 if args.node_topology == '1-hop' else 2 + features = [] + labels = [] + flag = 1 + for pairs in (positive_pairs, negative_pairs): + label = flag + flag -= 1 + batch_size = 4096 + num_batch = 
len(pairs) // batch_size + pool = Pool(12) + results = [] + for i in tqdm(range(num_batch+1)): + if i == num_batch: + batch_pairs = pairs[i*batch_size:] + else: + batch_pairs = pairs[i*batch_size:(i+1)*batch_size] + batch_result = pool.apply_async(get_batch_posteriors, args=(args, batch_pairs, g, k, i, label, mode)) + results.append(batch_result) + pool.close() + pool.join() + for batch_result in results: + batch_result = batch_result.get() + features.extend(batch_result[0]) + labels.extend(batch_result[1]) + stat_dict.update(batch_result[2]) + + features = np.array(features).astype(np.float32) + features = th.from_numpy(features) + indices = th.from_numpy(np.array(positive_pairs+negative_pairs)) + labels = th.tensor(labels) + + dataset = th.utils.data.TensorDataset(indices, features, labels) + dataloader = th.utils.data.DataLoader(dataset, batch_size=args.batch_size, shuffle=True) + stat_dicts.append(stat_dict) + dataloaders.append(dataloader) + count += 1 + feature_dim = features[0].shape[0] + + return dataloaders[0], dataloaders[1], feature_dim, stat_dicts[1] +def inductive_split_node(args, train_g, test_g): + + make_dirs() + dataloaders = [] + stat_dicts = [] + count = 0 + + for g in (train_g, test_g): + if args.prop: + mode = 'shadow'+ str(args.prop) if count == 0 else 'target' + else: + mode = 'shadow' if count == 0 else 'target' + if args.diff: + args.dataset = args.target_dataset if mode == 'target' else args.shadow_dataset + index = np.arange(len(g.nodes())) + stat_dict = {} + + positive_pairs, negative_pairs = generate_pairs(g, index) + print("Finish Generating Pairs....") + + positive_node_features, positive_labels, positive_stat_dict = generate_attack_xy_node(args, g, positive_pairs, 1) + negative_node_features, negative_labels, negative_stat_dict = generate_attack_xy_node(args, g, negative_pairs, 0) + + stat_dict = {**positive_stat_dict, **negative_stat_dict} + node_features = positive_node_features + negative_node_features + labels = positive_labels + negative_labels + + node_features = np.array(node_features).astype(np.float32) + node_features = th.from_numpy(node_features) + + indices = th.from_numpy(np.array(positive_pairs + negative_pairs)) + labels = th.tensor(labels) + + dataset = th.utils.data.TensorDataset(indices, node_features, labels) + dataloader = th.utils.data.DataLoader(dataset, batch_size=args.batch_size, shuffle=True) + + dataloaders.append(dataloader) + stat_dicts.append(stat_dict) + + count += 1 + + feature_dim = node_features[0].shape[0] + return dataloaders[0], dataloaders[1], feature_dim, stat_dicts[1] + +def inductive_split_graph(args, train_g, test_g): + + make_dirs() + dataloaders = [] + stat_dicts = [] + count = 0 + for g in (train_g, test_g): + if args.prop: + mode = 'shadow'+ str(args.prop) if count == 0 else 'target' + else: + mode = 'shadow' if count == 0 else 'target' + if args.diff: + args.dataset = args.target_dataset if mode == 'target' else args.shadow_dataset + index = np.arange(len(g.nodes())) + stat_dict = {} + + positive_pairs, negative_pairs = generate_pairs(g, index) + print("Finish Generating Pairs....") + + graph_features = [] + labels = [] + flag = 1 + for pairs in (positive_pairs, negative_pairs): + label = flag + flag -= 1 + batch_size = 4096 + num_batch = len(pairs) // batch_size + pool = Pool(12) + results = [] + for i in tqdm(range(num_batch+1)): + if i == num_batch: + batch_pairs = pairs[i*batch_size:] + else: + batch_pairs = pairs[i*batch_size:(i+1)*batch_size] + batch_result = pool.apply_async(get_batch_graph_only, 
args=(args, batch_pairs, g, i, label, mode))
+                results.append(batch_result)
+            pool.close()
+            pool.join()
+            for batch_result in results:
+                batch_result = batch_result.get()
+                graph_features.extend(batch_result[0])
+                labels.extend(batch_result[1])
+                stat_dict.update(batch_result[2])
+
+        graph_features = np.array(graph_features).astype(np.float32)
+        graph_features = th.from_numpy(graph_features)
+
+        indices = th.from_numpy(np.array(positive_pairs + negative_pairs))
+        labels = th.tensor(labels)
+        print(graph_features.shape)
+
+        dataset = th.utils.data.TensorDataset(indices, graph_features, labels)
+        dataloader = th.utils.data.DataLoader(dataset, batch_size=args.batch_size, shuffle=True)
+
+        dataloaders.append(dataloader)
+        stat_dicts.append(stat_dict)
+
+        count += 1
+
+    feature_dim = graph_features[0].shape[0]
+    return dataloaders[0], dataloaders[1], feature_dim, stat_dicts[1]
+
+def inductive_split_node_graph(args, train_g, test_g):
+    make_dirs()
+    dataloaders = []
+    stat_dicts = []
+    count = 0
+    for g in (train_g, test_g):
+        if args.prop:
+            mode = 'shadow'+ str(args.prop) if count == 0 else 'target'
+        else:
+            mode = 'shadow' if count == 0 else 'target'
+        if args.diff:
+            args.dataset = args.target_dataset if mode == 'target' else args.shadow_dataset
+        all_index = np.arange(len(g.nodes()))
+        stat_dict = {}
+        positive_pairs, negative_pairs = generate_pairs(g, all_index)
+        print("Finish Generating Pairs....")
+
+        node_features = []
+        graph_features = []
+        labels = []
+        flag = 1
+        for pairs in (positive_pairs, negative_pairs):
+            label = flag
+            flag -= 1
+            batch_size = 4096
+            num_batch = len(pairs) // batch_size
+            pool = Pool(12)
+            results = []
+            for i in tqdm(range(num_batch+1)):
+                if i == num_batch:
+                    batch_pairs = pairs[i*batch_size:]
+                else:
+                    batch_pairs = pairs[i*batch_size:(i+1)*batch_size]
+
+                batch_result = pool.apply_async(get_batch_node_graph_only, args=(args, batch_pairs, g, i, label, mode))
+                results.append(batch_result)
+            pool.close()
+            pool.join()
+
+            for batch_result in results:
+                batch_result = batch_result.get()
+                node_features.extend(batch_result[0])
+                graph_features.extend(batch_result[1])
+                labels.extend(batch_result[2])
+                stat_dict.update(batch_result[3])
+
+        node_features = np.array(node_features).astype(np.float32)
+        node_features = th.from_numpy(node_features)
+
+        graph_features = np.array(graph_features).astype(np.float32)
+        graph_features = th.from_numpy(graph_features)
+
+        indices = th.from_numpy(np.array(positive_pairs + negative_pairs))
+        labels = th.tensor(labels)
+
+        count += 1
+
+        dataset = th.utils.data.TensorDataset(indices, node_features, graph_features, labels)
+        dataloader = th.utils.data.DataLoader(dataset, batch_size=args.batch_size, shuffle=True)
+        dataloaders.append(dataloader)
+        stat_dicts.append(stat_dict)
+
+    feature_dim = node_features[0].shape[0] + graph_features[0].shape[0]
+    return dataloaders[0], dataloaders[1], feature_dim, stat_dicts[1]
+
+
+
+
+
+
+
+
+
+def inductive_split_posteriors(args, train_g, test_g):
+    make_dirs()
+    dataloaders = []
+    stat_dicts = []
+    count = 0
+
+
+    for g in (train_g, test_g):
+        if args.prop:
+            mode = 'shadow'+ str(args.prop) if count == 0 else 'target'
+        else:
+            mode = 'shadow' if count == 0 else 'target'
+        if args.diff:
+            args.dataset = args.target_dataset if mode == 'target' else args.shadow_dataset
+            model = args.target_model if mode == 'target' else args.shadow_model
+        index = np.arange(len(g.nodes()))
+        stat_dict = {}
+
+        positive_pairs, negative_pairs = generate_pairs(g, index)
+        print(f"Finish Generating 
Pairs...") + + if args.node_topology == '0-hop': + zero_hop_g = remove_neighbor_edge(g) + + posteriors_dict = query_trained_model(args, index, zero_hop_g, mode) + print("Finish Generating Posteriors Dict...") + + positive_features, positive_labels, positive_stat_dict = generate_attack_xy(args, positive_pairs, posteriors_dict, 1) + negative_features, negative_labels, negative_stat_dict = generate_attack_xy(args, negative_pairs, posteriors_dict, 0) + + stat_dict = {**positive_stat_dict, **negative_stat_dict} + features = positive_features + negative_features + labels = positive_labels + negative_labels + + elif args.node_topology == '1-hop' or args.node_topology == '2-hop': + k = 1 if args.node_topology == '1-hop' else 2 + features = [] + labels = [] + flag = 1 + for pairs in (positive_pairs, negative_pairs): + label = flag + flag -= 1 + batch_size = 4096 + num_batch = len(pairs) // batch_size + pool = Pool(12) + results = [] + for i in tqdm(range(num_batch+1)): + if i == num_batch: + batch_pairs = pairs[i*batch_size:] + else: + batch_pairs = pairs[i*batch_size:(i+1)*batch_size] + batch_result = pool.apply_async(get_batch_posteriors, args=(args, batch_pairs, g, k, i, label, mode)) + results.append(batch_result) + pool.close() + pool.join() + for batch_result in results: + batch_result = batch_result.get() + features.extend(batch_result[0]) + labels.extend(batch_result[1]) + stat_dict.update(batch_result[2]) + + features = np.array(features).astype(np.float32) + features = th.from_numpy(features) + indices = th.from_numpy(np.array(positive_pairs+negative_pairs)) + labels = th.tensor(labels) + + dataset = th.utils.data.TensorDataset(indices, features, labels) + dataloader = th.utils.data.DataLoader(dataset, batch_size=args.batch_size, shuffle=True) + stat_dicts.append(stat_dict) + dataloaders.append(dataloader) + count += 1 + feature_dim = features[0].shape[0] + + return dataloaders[0], dataloaders[1], feature_dim, stat_dicts[1] + + +def inductive_split_plus(args, train_g, test_g): + + make_dirs() + dataloaders = [] + stat_dicts = [] + count = 0 + + for g in (train_g, test_g): + if args.prop: + mode = 'shadow'+ str(args.prop) if count == 0 else 'target' + else: + mode = 'shadow' if count == 0 else 'target' + if args.diff: + args.dataset = args.target_dataset if mode == 'target' else args.shadow_dataset + index = np.arange(len(g.nodes())) + stat_dict = {} + + positive_pairs, negative_pairs = generate_pairs(g, index) + print("Finish Generating Pairs....") + + if args.node_topology == '0-hop': + zero_hop_g = remove_neighbor_edge(g) + posteriors_dict = query_trained_model(args, index, zero_hop_g, mode) + print("Finish Generating Posteriors Dict...") + + positive_node_features, positive_posterior_features, positive_labels, positive_stat_dict = generate_attack_xy_plus(args, g, positive_pairs, posteriors_dict, 1) + negative_node_features, negative_posterior_features, negative_labels, negative_stat_dict = generate_attack_xy_plus(args, g, negative_pairs, posteriors_dict, 0) + + stat_dict = {**positive_stat_dict, **negative_stat_dict} + + node_features = positive_node_features + negative_node_features + posterior_features = positive_posterior_features + negative_posterior_features + labels = positive_labels + negative_labels + elif args.node_topology == '1-hop' or args.node_topology == '2-hop': + k = 1 if args.node_topology == '1-hop' else 2 + node_features = [] + posterior_features = [] + labels = [] + flag = 1 + for pairs in (positive_pairs, negative_pairs): + label = flag + flag -= 1 + batch_size = 
4096 + num_batch = len(pairs) // batch_size + pool = Pool(12) + results = [] + for i in tqdm(range(num_batch+1)): + if i == num_batch: + batch_pairs = pairs[i*batch_size:] + else: + batch_pairs = pairs[i*batch_size:(i+1)*batch_size] + batch_result = pool.apply_async(get_batch_posteriors_node, args=(args, batch_pairs, g, k, i, label, mode)) + results.append(batch_result) + pool.close() + pool.join() + for batch_result in results: + batch_result = batch_result.get() + node_features.extend(batch_result[0]) + posterior_features.extend(batch_result[1]) + labels.extend(batch_result[2]) + stat_dict.update(batch_result[3]) + + node_features = np.array(node_features).astype(np.float32) + node_features = th.from_numpy(node_features) + + posterior_features = np.array(posterior_features).astype(np.float32) + posterior_features = th.from_numpy(posterior_features) + + indices = th.from_numpy(np.array(positive_pairs+negative_pairs)) + labels = th.tensor(labels) + + dataset = th.utils.data.TensorDataset(indices, node_features, posterior_features, labels) + dataloader = th.utils.data.DataLoader(dataset, batch_size=args.batch_size, shuffle=True) + + dataloaders.append(dataloader) + stat_dicts.append(stat_dict) + + count += 1 + + + posterior_feature_dim = posterior_features[0].shape[0] + + return dataloaders[0], dataloaders[1], posterior_feature_dim, stat_dicts[1] + +def inductive_split_plus2(args, train_g, test_g): + # train_g is the shadow graph + # test_g is the target graph + make_dirs() + dataloaders = [] + stat_dicts = [] + count = 0 + for g in (train_g, test_g): + if args.prop: + mode = 'shadow'+ str(args.prop) if count == 0 else 'target' + else: + mode = 'shadow' if count == 0 else 'target' + if args.diff: + args.dataset = args.target_dataset if mode == 'target' else args.shadow_dataset + index = np.arange(len(g.nodes())) + stat_dict = {} + + positive_pairs, negative_pairs = generate_pairs(g, index) + print("Finish Generating Pairs....") + + if args.node_topology == '0-hop': + print("Wrong Action") + sys.exit(0) + + k = 1 if args.node_topology == '1-hop' else 2 + graph_features = [] + posterior_features = [] + labels = [] + flag = 1 + for pairs in (positive_pairs, negative_pairs): + label = flag + flag -= 1 + batch_size = 4096 + num_batch = len(pairs) // batch_size + pool = Pool(12) + results = [] + for i in tqdm(range(num_batch+1)): + if i == num_batch: + batch_pairs = pairs[i*batch_size:] + else: + batch_pairs = pairs[i*batch_size:(i+1)*batch_size] + batch_result = pool.apply_async(get_batch_posteriors_graph, args=(args, batch_pairs, g, k, i, label, mode)) + results.append(batch_result) + pool.close() + pool.join() + for batch_result in results: + batch_result = batch_result.get() + posterior_features.extend(batch_result[0]) + graph_features.extend(batch_result[1]) + labels.extend(batch_result[2]) + stat_dict.update(batch_result[3]) + + graph_features = np.array(graph_features).astype(np.float32) + graph_features = th.from_numpy(graph_features) + + posterior_features = np.array(posterior_features).astype(np.float32) + posterior_features = th.from_numpy(posterior_features) + + indices = th.from_numpy(np.array(positive_pairs+negative_pairs)) + labels = th.tensor(labels) + print(graph_features.shape) + + dataset = th.utils.data.TensorDataset(indices, graph_features, posterior_features, labels) + dataloader = th.utils.data.DataLoader(dataset, batch_size=args.batch_size, shuffle=True) + + dataloaders.append(dataloader) + stat_dicts.append(stat_dict) + + count += 1 + + posterior_feature_dim = 
posterior_features[0].shape[0] + + return dataloaders[0], dataloaders[1], posterior_feature_dim, stat_dicts[1] + + +def inductive_split_all(args, train_g, test_g): + make_dirs() + dataloaders = [] + stat_dicts = [] + count = 0 + for g in (train_g, test_g): + if args.prop: + mode = 'shadow'+ str(args.prop) if count == 0 else 'target' + else: + mode = 'shadow' if count == 0 else 'target' + if args.diff: + args.dataset = args.target_dataset if mode == 'target' else args.shadow_dataset + all_index = np.arange(len(g.nodes())) + stat_dict = {} + positive_pairs, negative_pairs = generate_pairs(g, all_index) + print("Finish Generating Pairs....") + + if args.node_topology == '0-hop': + print("wrong action") + sys.exit(0) + + k = 1 if args.node_topology == '1-hop' else 2 + node_features = [] + posterior_features = [] + graph_features = [] + labels = [] + flag = 1 + for pairs in (positive_pairs, negative_pairs): + label = flag + flag -= 1 + batch_size = 4096 + num_batch = len(pairs) // batch_size + pool = Pool(12) + results = [] + for i in tqdm(range(num_batch+1)): + if i == num_batch: + batch_pairs = pairs[i*batch_size:] + else: + batch_pairs = pairs[i*batch_size:(i+1)*batch_size] + + batch_result = pool.apply_async(get_batch_posteriors_node_graph, args=(args, batch_pairs, g, k, i, label, mode)) + results.append(batch_result) + pool.close() + pool.join() + + for batch_result in results: + batch_result = batch_result.get() + node_features.extend(batch_result[0]) + posterior_features.extend(batch_result[1]) + graph_features.extend(batch_result[2]) + labels.extend(batch_result[3]) + stat_dict.update(batch_result[4]) + + node_features = np.array(node_features).astype(np.float32) + node_features = th.from_numpy(node_features) + + graph_features = np.array(graph_features).astype(np.float32) + graph_features = th.from_numpy(graph_features) + + posterior_features = np.array(posterior_features).astype(np.float32) + posterior_features = th.from_numpy(posterior_features) + + indices = th.from_numpy(np.array(positive_pairs+negative_pairs)) + labels = th.tensor(labels) + + count += 1 + + dataset = th.utils.data.TensorDataset(indices, node_features, posterior_features, graph_features, labels) + + dataloader = th.utils.data.DataLoader(dataset, batch_size=args.batch_size, shuffle=True) + dataloaders.append(dataloader) + stat_dicts.append(stat_dict) + + posterior_feature_dim = posterior_features[0].shape[0] + + return dataloaders[0], dataloaders[1], posterior_feature_dim, stat_dicts[1] diff --git a/data/load_graph.py b/data/load_graph.py new file mode 100644 index 0000000..07e6f18 --- /dev/null +++ b/data/load_graph.py @@ -0,0 +1,122 @@ +import numpy as np +import torch as th +import dgl +from tqdm import tqdm +import networkx as nx + +from graphgallery.datasets import NPZDataset, KarateClub, Reddit + +np.random.seed(0) +th.manual_seed(0) +th.backends.cudnn.deterministic = True +th.backends.cudnn.benchmark = False + + +def load_graphgallery_data(dataset): + + if dataset in ['deezer', 'lastfm']: + data = KarateClub(dataset) + elif dataset in ['reddit']: + data = Reddit(dataset) + else: + data = NPZDataset(dataset, verbose=False) + graph = data.graph + nx_g = nx.from_scipy_sparse_array(graph.adj_matrix) + for node_id, node_data in nx_g.nodes(data=True): + node_data["features"] = graph.feat[node_id].astype(np.float32) + if dataset in ['blogcatalog', 'flickr']: + node_data["labels"] = graph.y[node_id].astype(np.longlong) - 1 + else: + node_data["labels"] = graph.y[node_id].astype(np.longlong) + dgl_graph = 
dgl.from_networkx(nx_g, node_attrs=['features', 'labels'])
+    dgl_graph = dgl.add_self_loop(dgl_graph)
+    dgl_graph = dgl.to_simple(dgl_graph, copy_ndata=True)
+    dgl_graph = dgl.to_bidirected(dgl_graph, copy_ndata=True)
+
+    print(nx.density(dgl_graph.to_networkx()))
+    print("Classes:%d" % (graph.num_classes))
+    print("Feature dim: %d" % (dgl_graph.ndata['features'].shape[1]))
+    print(f"Graph has {dgl_graph.number_of_nodes()} nodes, {dgl_graph.number_of_edges()} edges.")
+
+    return dgl_graph, graph.num_classes
+
+
+def node_sample(g, prop=0.5):
+    '''
+    Randomly split the node indices into two parts according to prop (1:1 by default)
+    '''
+    node_number = len(g.nodes())
+    node_index_list = np.arange(node_number)
+    np.random.shuffle(node_index_list)
+    split_length = int(node_number * prop)
+
+    train_index = np.sort(node_index_list[:split_length])
+    test_index = np.sort(node_index_list[split_length:])
+
+    return train_index, test_index
+
+
+def remove_neighbor_edge_by_prop(g, prop=0.2):
+    """
+    Randomly remove a proportion (prop) of the real edges from a graph;
+    return the pruned graph and the removed (test) pairs
+    """
+    real_pairs = []
+    test_pairs = []
+    start_ids, end_ids = g.edges()
+
+    for i in tqdm(range(len(start_ids))):
+        if start_ids[i] < end_ids[i]:
+            real_pairs.append([i, start_ids[i].item(), end_ids[i].item()])
+
+    delete_edge_num = int(len(real_pairs) * prop)
+    print("Real Pairs Number (no self-loop & reverse edge): %d" % (len(real_pairs)))
+    print("Delete real pairs number (no self-loop & reverse edge): %d" % (delete_edge_num))
+    delete_ids_1d = np.random.choice(len(real_pairs), delete_edge_num, replace=False)
+    delete_eids = []
+    for i in delete_ids_1d:
+        eid, start_id, end_id = real_pairs[i]
+        eid_2 = g.edge_ids(end_id, start_id)
+        delete_eids += [eid, eid_2]
+        test_pairs.append((start_id, end_id))
+
+    print("All edge numbers: %d" % (len(start_ids)))
+    g = dgl.remove_edges(g, th.tensor(delete_eids))
+
+    print("Delete %d edges" % (len(delete_eids)))
+    print("Left %d edges" % (len(g.edges()[0])))
+    return g, test_pairs
+
+
+def split_target_shadow(g):
+
+    target_index, shadow_index = node_sample(g, 0.5)
+
+    target_g = g.subgraph(target_index)
+    shadow_g = g.subgraph(shadow_index)
+
+    return target_g, shadow_g
+
+
+def split_target_shadow_by_prop(args, g):
+
+    target_index, shadow_index = node_sample(g, 0.5)
+
+    target_g = g.subgraph(target_index)
+    shadow_g = g.subgraph(shadow_index)
+    shadow_index_prop, _ = node_sample(shadow_g, args.prop*0.01)
+    shadow_g = shadow_g.subgraph(shadow_index_prop)
+
+    return target_g, shadow_g
+
+
+def split_train_test(g):
+
+    train_index, test_index = node_sample(g, 0.8)
+
+    train_g = g.subgraph(train_index)
+    test_g = g.subgraph(test_index)
+
+    return train_g, test_g
+
+
diff --git a/mlp_attack.py b/mlp_attack.py
new file mode 100644
index 0000000..6a3c13f
--- /dev/null
+++ b/mlp_attack.py
@@ -0,0 +1,161 @@
+
+import os
+import torch as th
+import networkx as nx
+import datetime
+import argparse
+import numpy as np
+import random
+
+
+from data.load_graph import split_target_shadow, split_train_test, load_graphgallery_data, split_target_shadow_by_prop
+from data.inductive_split import inductive_split_posteriors, inductive_split_plus, inductive_split_plus2, inductive_split_all,inductive_split_node,inductive_split_graph,inductive_split_node_graph
+from train.attack import run_attack, run_attack_two_features, run_attack_three_features,run_b0_attack,run_b1_attack,run_b2_attack
+
+th.set_num_threads(1)
+
+def arg_parse():
+    argparser = argparse.ArgumentParser("multi-gpu training")
+    argparser.add_argument('--gpu', type=int, default=-1,
+                           help="GPU 
device ID. Use -1 for CPU training") + argparser.add_argument('--dataset', type=str, default='Cora') + argparser.add_argument('--node_topology', type=str, help="node topology used to query the model 0-hop, 2-hop") + argparser.add_argument('--num_epochs', type=int, default=200) + argparser.add_argument('--edge_feature', type=str, default='all') + argparser.add_argument('--n_hidden', type=int, default=128) + argparser.add_argument('--mlp_layers', type=int, default=3) + argparser.add_argument('--gnn_layers', type=int, default=2) + argparser.add_argument('--batch_size', type=int, default=1000) + argparser.add_argument('--lr', type=float, default=0.001) + argparser.add_argument('--log-every', type=int, default=20) + argparser.add_argument('--eval-every', type=int, default=10) + argparser.add_argument('--dropout', type=float, default=0.5) + argparser.add_argument("--seed", type=int, default=0, help="seed",) + argparser.add_argument('--optim', type=str, default='adam') + argparser.add_argument('--target_model', type=str, default='graphsage') + argparser.add_argument('--shadow_model', type=str, default='graphsage') + argparser.add_argument('--baseline', type=str, default = "b0") + argparser.add_argument('--node_only', action='store_true') + argparser.add_argument('--graph_only', action='store_true') + argparser.add_argument('--node_graph_both', action='store_true') + argparser.add_argument('--num_workers', type=int, default=0, + help="Number of sampling processes. Use 0 for no extra process.") + argparser.add_argument('--model_save_path', type=str, default='../data/save_model/gnn/') + argparser.add_argument('--attack_model_save_path', type=str, default='../data/save_model/mlp/') + argparser.add_argument('--load_trained', type=str, default='no') + argparser.add_argument('--plus', action='store_true') + argparser.add_argument('--plus2', action='store_true') + argparser.add_argument('--all', action='store_true') + argparser.add_argument('--scheduler', action='store_true') + argparser.add_argument('--perturb_type', type=str, default='discrete') + argparser.add_argument('--dp', action='store_true') + argparser.add_argument('--epsilon', type=int, default=8) + argparser.add_argument('--label_only', action='store_true') + argparser.add_argument('--soft_prob', action='store_true') + argparser.add_argument('--T', type=int, default=20) + argparser.add_argument('--prop', type=int, + help="use a specified propotion of the shadow dataset") + args = argparser.parse_args() + + if args.gpu >= 0: + args.device = th.device('cuda:%d' % args.gpu) + else: + args.device = th.device('cpu') + args.trad = False + return args + +if __name__ == '__main__': + args = arg_parse() + args.model_save_path = './data/save_model/gnn/' + args.data_save_path = './data/' + log_dir = 'output/logs/' + + random.seed(args.seed) + np.random.seed(args.seed) + th.manual_seed(args.seed) + + begin = datetime.datetime.now() + g, n_classes = load_graphgallery_data(args.dataset) + print(nx.density(g.to_networkx())) + + args.diff = False + args.in_feats = g.ndata['features'].shape[1] + args.node_feature_dim = args.in_feats + args.graph_feature_dim = 3 + args.n_classes = n_classes + args.setting = 'inductive' + + if args.prop: + target_g, shadow_g = split_target_shadow_by_prop(args, g) + print(f'Target Graph Num of Edges: {len(target_g.edges()[0])}') + print(f'Shadow Graph Num of Edges: {len(shadow_g.edges()[0])}') + else: + target_g, shadow_g = split_target_shadow(g) + + target_train_g, target_test_g = split_train_test(target_g) + shadow_train_g, 
shadow_test_g = split_train_test(shadow_g) + target_train_g.create_formats_() + shadow_train_g.create_formats_() + + print("Target Train Graph Num of Edges %d" % (len(target_train_g.edges()[0]))) + print("Target Train Graph Num of Nodes %d" % (len(target_train_g.nodes()))) + print("Target Train Graph Density: %.5f" % (nx.density(target_train_g.to_networkx()))) + + print("Shadow Train Graph Num of Edges %d" % (len(shadow_train_g.edges()[0]))) + print("Shadow Train Graph Num of Nodes %d" % (len(shadow_train_g.nodes()))) + print("Shadow Train Graph Density: %.5f" % (nx.density(shadow_train_g.to_networkx()))) + print("Classes:%d" % (n_classes)) + print("Feature dim: %d" % (args.in_feats)) + + if args.node_only: + args.feature = 'label_node' if args.label_only else 'node' + args.method = f'{args.feature}' + train_dataloader, test_dataloader, feature_dim, stat_dict = inductive_split_node(args, shadow_train_g, target_train_g) + model, train_acc, train_auc, test_acc, test_auc, stat_dict = run_b0_attack(args, train_dataloader, test_dataloader, stat_dict) + + elif args.graph_only: + args.feature = 'label_graph' if args.label_only else 'graph' + args.method = f'{args.feature}' + train_dataloader, test_dataloader, feature_dim, stat_dict = inductive_split_graph(args, shadow_train_g, target_train_g) + model, train_acc, train_auc, test_acc, test_auc, stat_dict = run_b1_attack(args, train_dataloader, test_dataloader, stat_dict) + + elif args.node_graph_both: + args.feature = 'label_node_graph' if args.label_only else 'node_graph' + args.method = f'{args.feature}' + train_dataloader, test_dataloader, feature_dim, stat_dict = inductive_split_node_graph(args, shadow_train_g, target_train_g) + model, train_acc, train_auc, test_acc, test_auc, stat_dict = run_b2_attack(args, train_dataloader, test_dataloader, stat_dict) + + elif args.plus: + args.feature = 'label_node' if args.label_only else 'posteriors_node' + args.method = f'{args.node_topology}_{args.feature}' + train_dataloader, test_dataloader, posterior_feature_dim, stat_dict = inductive_split_plus(args, shadow_train_g, target_train_g) + model, train_acc, train_auc, test_acc, test_auc, stat_dict = run_attack_two_features(args, posterior_feature_dim, train_dataloader, test_dataloader, stat_dict) + + elif args.plus2: + args.feature = 'label_graph' if args.label_only else 'posteriors_graph' + args.method = f'{args.node_topology}_{args.feature}' + train_dataloader, test_dataloader, posterior_feature_dim, stat_dict = inductive_split_plus2(args, shadow_train_g, target_train_g) + model, train_acc, train_auc, test_acc, test_auc, stat_dict = run_attack_two_features(args, posterior_feature_dim, train_dataloader, test_dataloader, stat_dict) + + + elif args.all: + args.feature = 'label_node_graph' if args.label_only else 'posteriors_node_graph' + args.method = f'{args.node_topology}_{args.feature}' + train_dataloader, test_dataloader, posterior_feature_dim, stat_dict = inductive_split_all(args, shadow_train_g, target_train_g) + model, train_acc, train_auc, test_acc, test_auc, stat_dict = run_attack_three_features(args, posterior_feature_dim, train_dataloader, test_dataloader, stat_dict) + + else: + args.feature = 'label' if args.label_only else 'posteriors' + args.method = f'{args.node_topology}_{args.feature}' + train_dataloader, test_dataloader, feature_dim, stat_dict = inductive_split_posteriors(args, shadow_train_g, target_train_g) + model, train_acc, train_auc, test_acc, test_auc, stat_dict = run_attack(args, feature_dim, train_dataloader, test_dataloader, 
stat_dict) + + is_scheduled = 1 if args.scheduler else 0 + end = datetime.datetime.now() + k = (end - begin).seconds + + pickle_path = os.path.join(args.data_save_path, f'{args.setting}_{args.dataset}_{args.target_model}_{args.shadow_model}_{args.method}.pickle') + th.save(stat_dict, pickle_path) + + with open(os.path.join(log_dir, "attack_performance.txt"), "a") as wf: + wf.write(f"{args.dataset}, {args.target_model}, {args.shadow_model}, {args.edge_feature}, {is_scheduled}, {args.optim}, {args.lr}, {args.method}, {train_acc:.3f}, {train_auc:.3f}, {test_acc:.3f}, {test_auc:.3f}, {str(datetime.timedelta(seconds=k))}, {args.seed}\n") diff --git a/models/gnn.py b/models/gnn.py new file mode 100644 index 0000000..979a278 --- /dev/null +++ b/models/gnn.py @@ -0,0 +1,390 @@ +import torch as th +th.manual_seed(0) +import torch.nn as nn +import dgl +import dgl.nn.pytorch as dglnn + + +class SAGE(nn.Module): + def __init__(self, + in_feats, + n_hidden, + n_classes, + n_layers, + activation, + batch_size, + num_workers, + dropout): + super().__init__() + self.n_layers = n_layers + self.n_hidden = n_hidden + self.n_classes = n_classes + self.layers = nn.ModuleList() + self.layers.append(dglnn.SAGEConv(in_feats, n_hidden, 'mean')) + for i in range(1, n_layers - 1): + self.layers.append(dglnn.SAGEConv(n_hidden, n_hidden, 'mean')) + self.layers.append(dglnn.SAGEConv(n_hidden, n_classes, 'mean')) + self.dropout = nn.Dropout(dropout) + self.activation = activation + self.batch_size = batch_size + self.num_workers = num_workers + + def forward(self, blocks, x): + h = x + for l, (layer, block) in enumerate(zip(self.layers, blocks)): + h = layer(block, h) + if l != len(self.layers) - 1: + h = self.activation(h) + h = self.dropout(h) + return h + + + def inference(self, g, x, device): + for l, layer in enumerate(self.layers): + y = th.zeros(g.number_of_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes) + + sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1) + dataloader = dgl.dataloading.DataLoader( + g, + th.arange(g.number_of_nodes()), + sampler, + batch_size=self.batch_size, + shuffle=True, + drop_last=False, + num_workers=self.num_workers) + + for input_nodes, output_nodes, blocks in dataloader: + block = blocks[0] + + block = block.int().to(device) + h = x[input_nodes].to(device) + h = layer(block, h) + if l != len(self.layers) - 1: + h = self.activation(h) + h = self.dropout(h) + + y[output_nodes] = h.cpu() + + x = y + return y + + def extract_embedding(self, g, x, device): + for l, layer in enumerate(self.layers): + y = th.zeros(g.number_of_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes) + + sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1) + dataloader = dgl.dataloading.NodeDataLoader( + g, + th.arange(g.number_of_nodes()), + sampler, + batch_size=self.batch_size, + shuffle=True, + drop_last=False, + num_workers=self.num_workers) + + for input_nodes, output_nodes, blocks in dataloader: + block = blocks[0] + + block = block.int().to(device) + h = x[input_nodes].to(device) + h = layer(block, h) + if l != len(self.layers) - 1: + h = self.activation(h) + h = self.dropout(h) + + y[output_nodes] = h.cpu() + x = y + # return the embedding after the first layer; + break + return y + + + +class GCN(nn.Module): + def __init__(self, + in_feats, + n_hidden, + n_classes, + n_layers, + activation, + batch_size, + num_workers, + dropout): + super().__init__() + self.n_layers = n_layers + self.n_hidden = n_hidden + self.n_classes = n_classes + 
self.layers = nn.ModuleList() + # self.layers.append(dglnn.SAGEConv(in_feats, n_hidden, 'gcn')) + # for i in range(1, n_layers - 1): + # self.layers.append(dglnn.SAGEConv(n_hidden, n_hidden, 'gcn')) + # self.layers.append(dglnn.SAGEConv(n_hidden, n_classes, 'gcn')) + self.layers.append(dglnn.GraphConv(in_feats, n_hidden, allow_zero_in_degree=True)) + for i in range(1, n_layers - 1): + self.layers.append(dglnn.GraphConv(n_hidden, n_hidden, allow_zero_in_degree=True)) + self.layers.append(dglnn.GraphConv(n_hidden, n_classes, allow_zero_in_degree=True)) + self.dropout = nn.Dropout(dropout) + self.activation = activation + self.batch_size = batch_size + self.num_workers = num_workers + + def forward(self, blocks, x): + h = x + for l, (layer, block) in enumerate(zip(self.layers, blocks)): + h = layer(block, h) + if l != len(self.layers) - 1: + h = self.activation(h) + h = self.dropout(h) + return h + + def inference(self, g, x, device): + for l, layer in enumerate(self.layers): + y = th.zeros(g.number_of_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes) + + sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1) + dataloader = dgl.dataloading.DataLoader( + g, + th.arange(g.number_of_nodes()), + sampler, + batch_size=self.batch_size, + shuffle=True, + drop_last=False, + num_workers=self.num_workers) + + for input_nodes, output_nodes, blocks in dataloader: + block = blocks[0] + + block = block.int().to(device) + h = x[input_nodes].to(device) + h = layer(block, h) + if l != len(self.layers) - 1: + h = self.activation(h) + h = self.dropout(h) + + y[output_nodes] = h.cpu() + + x = y + return y + + def extract_embedding(self, g, x, device): + for l, layer in enumerate(self.layers): + y = th.zeros(g.number_of_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes) + + sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1) + dataloader = dgl.dataloading.DataLoader( + g, + th.arange(g.number_of_nodes()), + sampler, + batch_size=self.batch_size, + shuffle=True, + drop_last=False, + num_workers=self.num_workers) + + for input_nodes, output_nodes, blocks in dataloader: + block = blocks[0] + + block = block.int().to(device) + h = x[input_nodes].to(device) + h = layer(block, h) + if l != len(self.layers) - 1: + h = self.activation(h) + h = self.dropout(h) + + y[output_nodes] = h.cpu() + x = y + # return the embedding after the first layer; + break + return y + + + +class GAT(nn.Module): + def __init__(self, + in_feats, + n_hidden, + n_classes, + n_layers, + activation, + batch_size, + num_workers, + dropout,num_heads=2): + super().__init__() + self.n_layers = n_layers + self.n_hidden = n_hidden + self.n_classes = n_classes + self.num_heads = num_heads + self.layers = nn.ModuleList() + self.layers.append(dglnn.GATConv(in_feats, n_hidden, num_heads, allow_zero_in_degree=True)) + for i in range(1, n_layers - 1): + self.layers.append(dglnn.GATConv(n_hidden*num_heads, n_hidden, num_heads, allow_zero_in_degree=True)) + self.layers.append(dglnn.GATConv(n_hidden*num_heads, n_classes, 1, allow_zero_in_degree=True)) + self.dropout = nn.Dropout(dropout) + self.activation = activation + self.batch_size = batch_size + self.num_workers = num_workers + + def forward(self, blocks, x): + h = x + for l, (layer, block) in enumerate(zip(self.layers, blocks)): + h = layer(block, h).flatten(1) + if l != len(self.layers) - 1: + h = self.activation(h) + h = self.dropout(h) + logits = h + return logits + + def inference(self, g, x, device): + + for l, layer in enumerate(self.layers): + 
y = th.zeros(g.number_of_nodes(), self.n_hidden*self.num_heads if l != len(self.layers) - 1 else self.n_classes)
+
+            sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
+            dataloader = dgl.dataloading.DataLoader(
+                g,
+                th.arange(g.number_of_nodes()),
+                sampler,
+                batch_size=self.batch_size,
+                shuffle=True,
+                drop_last=False,
+                num_workers=self.num_workers)
+
+            for input_nodes, output_nodes, blocks in dataloader:
+                block = blocks[0]
+
+                block = block.int().to(device)
+                h = x[input_nodes].to(device)
+                h = layer(block, h).flatten(1)
+                if l != len(self.layers) - 1:
+                    h = self.activation(h)
+                    h = self.dropout(h)
+
+                y[output_nodes] = h.cpu()
+
+            x = y
+        return y
+
+    def extract_embedding(self, g, x, device):
+        for l, layer in enumerate(self.layers):
+            y = th.zeros(g.number_of_nodes(), self.n_hidden*self.num_heads if l != len(self.layers) - 1 else self.n_classes)
+
+            sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
+            dataloader = dgl.dataloading.DataLoader(
+                g,
+                th.arange(g.number_of_nodes()),
+                sampler,
+                batch_size=self.batch_size,
+                shuffle=True,
+                drop_last=False,
+                num_workers=self.num_workers)
+
+            for input_nodes, output_nodes, blocks in dataloader:
+                block = blocks[0]
+
+                block = block.int().to(device)
+                h = x[input_nodes].to(device)
+                h = layer(block, h).flatten(1)
+                if l != len(self.layers) - 1:
+                    h = self.activation(h)
+                    h = self.dropout(h)
+
+                y[output_nodes] = h.cpu()
+            x = y
+            # return the embedding after the first layer;
+            break
+        return y
+
+class GIN(nn.Module):
+    def __init__(self,
+                 in_feats,
+                 n_hidden,
+                 n_classes,
+                 n_layers,
+                 activation,
+                 batch_size,
+                 num_workers,
+                 dropout):
+        super().__init__()
+        self.n_layers = n_layers
+        self.n_hidden = n_hidden
+        self.n_classes = n_classes
+        self.layers = nn.ModuleList()
+        linear = nn.Linear(in_feats, n_hidden)
+        self.layers.append(dglnn.GINConv(linear, 'sum'))
+        for i in range(1, n_layers-1):
+            linear = nn.Linear(n_hidden, n_hidden)
+            self.layers.append(dglnn.GINConv(linear, 'sum'))
+        linear = nn.Linear(n_hidden, n_classes)
+        self.layers.append(dglnn.GINConv(linear, 'mean'))
+
+        self.dropout = nn.Dropout(dropout)
+        self.activation = activation
+        self.batch_size = batch_size
+        self.num_workers = num_workers
+
+    def forward(self, blocks, x):
+        h = x
+        for l, (layer, block) in enumerate(zip(self.layers, blocks)):
+            h = layer(block, h)
+            if l != len(self.layers) - 1:
+                h = self.activation(h)
+                h = self.dropout(h)
+        return h
+
+    def inference(self, g, x, device):
+        for l, layer in enumerate(self.layers):
+            y = th.zeros(g.number_of_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes)
+
+            sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
+            dataloader = dgl.dataloading.DataLoader(
+                g,
+                th.arange(g.number_of_nodes()),
+                sampler,
+                batch_size=self.batch_size,
+                shuffle=True,
+ drop_last=False, + num_workers=self.num_workers) + + for input_nodes, output_nodes, blocks in dataloader: + block = blocks[0] + + block = block.int().to(device) + h = x[input_nodes].to(device) + h = layer(block, h) + if l != len(self.layers) - 1: + h = self.activation(h) + h = self.dropout(h) + + y[output_nodes] = h.cpu() + x = y + # return the embedding after the first layer; + break + return y \ No newline at end of file diff --git a/models/mlp.py b/models/mlp.py new file mode 100644 index 0000000..9de5f55 --- /dev/null +++ b/models/mlp.py @@ -0,0 +1,212 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class MLP_ATTACK(nn.Module): + def __init__(self, dim_in): + super(MLP_ATTACK, self).__init__() + self.dim_in = dim_in + self.fc1 = nn.Linear(self.dim_in, 128) + self.fc2 = nn.Linear(128, 32) + self.fc3 = nn.Linear(32, 2) + + def forward(self, x): + x = x.view(-1,self.dim_in) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + out = F.relu(self.fc3(x)) + return out + + +class MLP_ATTACK_PLUS(nn.Module): + def __init__(self, dim_in_1, dim_in_2): + super(MLP_ATTACK_PLUS, self).__init__() + self.dim_in_1 = dim_in_1 + self.dim_in_2 = dim_in_2 + + self.fc1 = nn.Linear(self.dim_in_1, 128) + self.fc2 = nn.Linear(128, 64) + self.fc3 = nn.Linear(64, 16) + + + self.fc4 = nn.Linear(self.dim_in_2, 64) + self.fc5 = nn.Linear(64, 16) + self.fc6 = nn.Linear(32, 2) + + + def forward(self, x1, x2): + x1 = x1.view(-1, self.dim_in_1) + x2 = x2.view(-1, self.dim_in_2) + + x1 = F.relu(self.fc1(x1)) + x1 = F.relu(self.fc2(x1)) + x1 = F.relu(self.fc3(x1)) + + x2 = F.relu(self.fc4(x2)) + x2 = F.relu(self.fc5(x2)) + + combine = torch.cat([x1, x2], dim=1) + out = F.relu(self.fc6(combine)) + + return out + + +class MLP_ATTACK_PLUS2(nn.Module): + def __init__(self, dim_in_1, dim_in_2): + super(MLP_ATTACK_PLUS2, self).__init__() + self.dim_in_1 = dim_in_1 + self.dim_in_2 = dim_in_2 + + self.fc1 = nn.Linear(self.dim_in_1, 16) + self.fc2 = nn.Linear(16, 4) + + self.fc3 = nn.Linear(self.dim_in_2, 128) + self.fc4 = nn.Linear(128, 64) + self.fc5 = nn.Linear(64, 16) + + self.fc6 = nn.Linear(20, 2) + + + def forward(self, x1, x2): + x1 = x1.view(-1, self.dim_in_1) + x2 = x2.view(-1, self.dim_in_2) + + x1 = F.relu(self.fc1(x1)) + x1 = F.relu(self.fc2(x1)) + + x2 = F.relu(self.fc3(x2)) + x2 = F.relu(self.fc4(x2)) + x2 = F.relu(self.fc5(x2)) + + combine = torch.cat([x1, x2], dim=1) + out = F.relu(self.fc6(combine)) + + return out + + +class MLP_ATTACK_ALL(nn.Module): + def __init__(self, dim_in_1, dim_in_2, dim_in_3): + super(MLP_ATTACK_ALL, self).__init__() + self.dim_in_1 = dim_in_1 + self.dim_in_2 = dim_in_2 + self.dim_in_3 = dim_in_3 + + self.fc1 = nn.Linear(self.dim_in_1, 128) + self.fc2 = nn.Linear(128, 64) + self.fc3 = nn.Linear(64, 16) + + self.fc4 = nn.Linear(self.dim_in_2, 128) + self.fc5 = nn.Linear(128, 64) + self.fc6 = nn.Linear(64, 16) + + self.fc7 = nn.Linear(self.dim_in_3, 4) + + self.fc8 = nn.Linear(36, 2) + + + def forward(self, x1, x2, x3): + x1 = x1.view(-1, self.dim_in_1) + x2 = x2.view(-1, self.dim_in_2) + x3 = x3.view(-1, self.dim_in_3) + + x1 = F.relu(self.fc1(x1)) + x1 = F.relu(self.fc2(x1)) + x1 = F.relu(self.fc3(x1)) + + x2 = F.relu(self.fc4(x2)) + x2 = F.relu(self.fc5(x2)) + x2 = F.relu(self.fc6(x2)) + + + x3 = F.relu(self.fc7(x3)) + + combine = torch.cat([x1, x2, x3], dim=1) + out = F.relu(self.fc8(combine)) + + return out + + +class MLP_Target(nn.Module): + def __init__(self, dim_in, dim_out): + super(MLP_Target, self).__init__() + self.dim_in = dim_in + self.dim_out 
= dim_out
+
+        self.fc1 = nn.Linear(dim_in, 32)
+        self.fc2 = nn.Linear(32, dim_out)
+
+    def forward(self, x):
+        x = x.view(-1,self.dim_in)
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        return x
+
+class Baseline0_MLP(nn.Module):
+    def __init__(self, dim_in):
+        super(Baseline0_MLP, self).__init__()
+        self.dim_in = dim_in
+
+        self.fc1 = nn.Linear(dim_in, 128)
+        self.fc2 = nn.Linear(128, 32)
+        self.fc3 = nn.Linear(32, 2)
+
+        self.dropout = nn.Dropout(0.5)
+
+    def forward(self, x):
+        x = x.view(-1, self.dim_in)
+        x = F.relu(self.fc1(x))
+        x = self.dropout(x)
+        x = F.relu(self.fc2(x))
+        x = self.dropout(x)
+        out = self.fc3(x)
+        return out
+
+class Baseline1_MLP(nn.Module):
+    def __init__(self, dim_in):
+        super().__init__()
+        self.fc1 = nn.Linear(dim_in, 16)
+        self.dropout = nn.Dropout(0.5)
+        self.fc2 = nn.Linear(16, 2)
+
+    def forward(self, x):
+        x = self.dropout(F.relu(self.fc1(x)))
+        return self.fc2(x)
+
+
+
+
+class Baseline2_MLP(nn.Module):
+    def __init__(self, dim_in_1, dim_in_2):  # dim_in_1 = node features, dim_in_2 = graph features
+        super(Baseline2_MLP, self).__init__()
+        self.dim_in_1 = dim_in_1
+        self.dim_in_2 = dim_in_2
+
+
+        self.fc1 = nn.Linear(self.dim_in_1, 256)
+        self.fc2 = nn.Linear(256, 64)
+        self.fc3 = nn.Linear(64, 8)
+
+        self.fc4 = nn.Linear(self.dim_in_2, 1)
+
+        self.fc5 = nn.Linear(9, 2)
+
+        self.dropout = nn.Dropout(0.5)
+
+    def forward(self, x1, x2):  # x1 = node features, x2 = graph features
+        x1 = x1.view(-1, self.dim_in_1)
+        x2 = x2.view(-1, self.dim_in_2)
+
+        h1 = F.relu(self.fc1(x1))
+        h1 = self.dropout(h1)
+        h1 = F.relu(self.fc2(h1))
+        h1 = self.dropout(h1)
+        h1 = F.relu(self.fc3(h1))
+
+        h2 = F.relu(self.fc4(x2))
+        combined = torch.cat([h1, h2], dim=1)
+        output = self.fc5(combined)
+
+        return output
+
+
+
diff --git a/train/attack.py b/train/attack.py
new file mode 100644
index 0000000..c1b02d2
--- /dev/null
+++ b/train/attack.py
@@ -0,0 +1,544 @@
+import os
+import numpy as np
+import torch as th
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.optim.lr_scheduler import CosineAnnealingLR
+from sklearn.metrics import roc_auc_score
+th.manual_seed(0)
+
+from models.mlp import MLP_ATTACK, MLP_ATTACK_PLUS, MLP_ATTACK_PLUS2, MLP_ATTACK_ALL, Baseline0_MLP, Baseline1_MLP, Baseline2_MLP
+
+def _weights_init_normal(m):
+
+    classname = m.__class__.__name__
+    # for every Linear layer in a model
+    if classname.find('Linear') != -1:
+        y = m.in_features
+        # m.weight.data should be taken from a normal distribution
+        m.weight.data.normal_(0.0, 1/np.sqrt(y))
+        m.bias.data.fill_(0)
+
+
+def save_attack_model(args, model):
+    if not os.path.exists(args.attack_model_save_path):
+        os.makedirs(args.attack_model_save_path)
+    if args.prop:
+        save_name = os.path.join(args.attack_model_save_path, f'attack_model_{args.dataset}_{args.target_model}_{args.shadow_model}_{args.prop}_{args.node_topology}_{args.feature}_{args.edge_feature}.pth')
+    elif args.diff:
+        save_name = os.path.join(args.attack_model_save_path, f'diff_attack_model_{args.target_dataset}_{args.shadow_dataset}_{args.target_model}_{args.shadow_model}_{args.node_topology}_{args.feature}_{args.edge_feature}.pth')
+    else:
+        save_name = os.path.join(args.attack_model_save_path, f'attack_model_{args.dataset}_{args.target_model}_{args.shadow_model}_{args.node_topology}_{args.feature}_{args.edge_feature}.pth')
+    th.save(model.state_dict(), save_name)
+    print("Finish training, save model to %s" % (save_name))
+
+
+def load_attack_model(model, model_path, device):
+    print("load model from: ", model_path)
+
state_dict = th.load(model_path, map_location=device) + model.load_state_dict(state_dict) + return model + + +def test_one_feature(args, epoch, model, test_dataloader, stat_dict=None): + device = args.device + test_acc = 0.0 + correct = 0 + total = 0 + scores = [] + targets = [] + if not stat_dict: + stat_dict = {} + model.eval() + + with th.no_grad(): + for indice, feature, label in test_dataloader: + indice, feature, label = indice.to(device), feature.to(device), label.to(device) + + outputs = model(feature) + posteriors = F.softmax(outputs, dim=1) + _, predicted = posteriors.max(1) + total += label.size(0) + correct += predicted.eq(label).sum().item() + if epoch == args.num_epochs - 1 and not args.diff and not args.label_only and (args.mlp_layers == 3) and not args.soft_prob: + for i, posterior in zip(indice, posteriors): + stat_dict[tuple(i.cpu().numpy())][f'{args.method}_attack_posterior'] = posterior.cpu().numpy() + + targets.extend(label.cpu().numpy().tolist()) + scores.extend([i.cpu().numpy()[1] for i in posteriors]) + targets = np.array(targets) + scores = np.array(scores) + scores[np.isnan(scores)] = 0 + test_acc = correct / total + test_auc = roc_auc_score(targets, scores) + print('Test Acc: %.3f%% (%d/%d) AUC Score: %.3f' % (100. * test_acc, correct, total, test_auc)) + + return test_acc, test_auc, stat_dict + + +def test_two_features(args, epoch, model, test_dataloader, stat_dict=None): + device = args.device + test_acc = 0.0 + correct = 0 + total = 0 + scores = [] + targets = [] + stat_dict = {} if stat_dict is None else stat_dict + model.eval() + + with th.no_grad(): + for indice, feature1, feature2, label in test_dataloader: + indice, feature1, feature2, label = indice.to(device), feature1.to(device), feature2.to(device), label.to(device) + + outputs = model(feature1, feature2) + posteriors = F.softmax(outputs, dim=1) + _, predicted = posteriors.max(1) + total += label.size(0) + correct += predicted.eq(label).sum().item() + if epoch == args.num_epochs - 1 and not args.diff: + for i, posterior in zip(indice, posteriors): + stat_dict[tuple(i.cpu().numpy())][f'{args.method}_attack_posterior'] = posterior.cpu().numpy() + targets.extend(label.cpu().numpy().tolist()) + scores.extend([i.cpu().numpy()[1] for i in posteriors]) + + test_acc = correct / total + test_auc = roc_auc_score(targets, scores) + print('Test Acc: %.3f%% (%d/%d) AUC Score: %.3f' % (100. * test_acc, correct, total, test_auc)) + return test_acc, test_auc, stat_dict + + +def test_three_features(args, epoch, model, test_dataloader, stat_dict=None): + device = args.device + test_acc = 0.0 + correct = 0 + total = 0 + scores = [] + targets = [] + model.eval() + + with th.no_grad(): + for indice, feature1, feature2, feature3, label in test_dataloader: + indice, feature1, feature2, feature3, label = indice.to(device), feature1.to(device), feature2.to(device), feature3.to(device), label.to(device) + + outputs = model(feature1, feature2, feature3) + posteriors = F.softmax(outputs, dim=1) + _, predicted = posteriors.max(1) + total += label.size(0) + correct += predicted.eq(label).sum().item() + if epoch == args.num_epochs - 1 and not args.diff: + for i, posterior in zip(indice, posteriors): + stat_dict[tuple(i.cpu().numpy())][f'{args.method}_attack_posterior'] = posterior.cpu().numpy() + targets.extend(label.cpu().numpy().tolist()) + scores.extend([i.cpu().numpy()[1] for i in posteriors]) + test_acc = correct / total + test_auc = roc_auc_score(targets, scores) + print('Test Acc: %.3f%% (%d/%d) AUC Score: %.3f' % (100. 
* test_acc, correct, total, test_auc)) + + return test_acc, test_auc, stat_dict + +def test_gorn_feature(args, epoch, model, test_dataloader, stat_dict=None): + device = args.device + test_acc = 0.0 + correct = 0 + total = 0 + scores = [] + targets = [] + stat_dict = {} if stat_dict is None else stat_dict + model.eval() + + with th.no_grad(): + for indice, feature, label in test_dataloader: + indice = indice.to(device) + feature = feature.to(device) + label = label.to(device) + + outputs = model(feature) + posteriors = F.softmax(outputs, dim=1) + _, predicted = posteriors.max(1) + total += label.size(0) + correct += predicted.eq(label).sum().item() + + targets.extend(label.cpu().numpy().tolist()) + scores.extend([i.cpu().numpy()[1] for i in posteriors]) + + targets = np.array(targets) + scores = np.array(scores) + scores[np.isnan(scores)] = 0 + test_acc = correct / total + test_auc = roc_auc_score(targets, scores) + print('Test Acc: %.3f%% (%d/%d) AUC Score: %.3f' % (100. * test_acc, correct, total, test_auc)) + + return test_acc, test_auc, stat_dict + + + +def test_graph_node_features(args, epoch, model, test_dataloader, stat_dict=None): + device = args.device + test_acc = 0.0 + correct = 0 + total = 0 + scores = [] + targets = [] + stat_dict = {} if stat_dict is None else stat_dict + model.eval() + + with th.no_grad(): + for indice, feature1, feature2, label in test_dataloader: + indice = indice.to(device) + feature1 = feature1.to(device) + feature2 = feature2.to(device) + label = label.to(device) + + outputs = model(feature1, feature2) + posteriors = F.softmax(outputs, dim=1) + _, predicted = posteriors.max(1) + total += label.size(0) + correct += predicted.eq(label).sum().item() + + targets.extend(label.cpu().numpy().tolist()) + scores.extend([i.cpu().numpy()[1] for i in posteriors]) + + test_acc = correct / total + test_auc = roc_auc_score(targets, scores) + print('Test Acc: %.3f%% (%d/%d) AUC Score: %.3f' % (100. * test_acc, correct, total, test_auc)) + + return test_acc, test_auc, stat_dict + + + + +def run_attack(args, in_dim, train_dataloader, test_dataloader, stat_dict): + epoch = args.num_epochs + device = args.device + model = MLP_ATTACK(in_dim) + model = model.to(args.device) + model.apply(_weights_init_normal) + loss_fcn = nn.CrossEntropyLoss() + loss_fcn = loss_fcn.to(args.device) + + if args.optim == 'adam': + optimizer = th.optim.Adam(model.parameters(), lr=args.lr) + elif args.optim == 'sgd': + optimizer = th.optim.SGD(model.parameters(), lr=args.lr) + scheduler3 = CosineAnnealingLR(optimizer, T_max=args.num_epochs, eta_min=0) + train_acc = 0.0 + + for e in range(epoch): + + correct = 0 + total = 0 + targets = [] + scores = [] + model.train() + for _, feature, label in train_dataloader: + optimizer.zero_grad() + feature, label = feature.to(device), label.to(device) + outputs = model(feature) + posteriors = F.softmax(outputs, dim=1) + + loss = loss_fcn(posteriors, label) + loss.backward() + optimizer.step() + _, predicted = posteriors.max(1) + total += label.size(0) + correct += predicted.eq(label).sum().item() + targets.extend(label.cpu().detach().numpy().tolist()) + scores.extend([i.cpu().detach().numpy()[1] for i in posteriors]) + if args.scheduler: + scheduler3.step() + print(scheduler3.get_last_lr()) + train_acc = correct / total + targets = np.array(targets) + scores = np.array(scores) + scores[np.isnan(scores)] = 0 + train_auc = roc_auc_score(targets, scores) + print('[Epoch %d] Train Acc: %.3f%% (%d/%d) AUC Score: %.3f' % (e, 100. 
* train_acc, correct, total, train_auc)) + + if e == epoch - 1: + test_acc, test_auc, stat_dict = test_one_feature(args, e, model, test_dataloader, stat_dict) + save_attack_model(args, model) + else: + test_acc, test_auc, _ = test_one_feature(args, e, model, test_dataloader) + + return model, train_acc, train_auc, test_acc, test_auc, stat_dict + + +def run_attack_two_features(args, posterior_feature_dim, train_dataloader, test_dataloader, stat_dict): + epoch = args.num_epochs + device = args.device + + if (args.feature == 'posteriors_graph') or (args.feature == 'label_graph') : + model = MLP_ATTACK_PLUS2(args.graph_feature_dim, posterior_feature_dim) + elif (args.feature == 'posteriors_node') or (args.feature == 'label_node'): + model = MLP_ATTACK_PLUS(args.node_feature_dim, posterior_feature_dim) + model = model.to(args.device) + model.apply(_weights_init_normal) + + loss_fcn = nn.CrossEntropyLoss() + loss_fcn = loss_fcn.to(args.device) + if args.optim == 'adam': + optimizer = th.optim.Adam(model.parameters(), lr=args.lr) + elif args.optim == 'sgd': + optimizer = th.optim.SGD(model.parameters(), lr=args.lr) + + scheduler3 = CosineAnnealingLR(optimizer, T_max=5, eta_min=0) + train_acc = 0.0 + + for e in range(epoch): + + correct = 0 + total = 0 + targets = [] + scores = [] + model.train() + for _, origin_feature, posterior_feature, label in train_dataloader: + optimizer.zero_grad() + origin_feature, posterior_feature, label = origin_feature.to(device), posterior_feature.to(device), label.to(device) + outputs = model(origin_feature, posterior_feature) + posteriors = F.softmax(outputs, dim=1) + loss = loss_fcn(posteriors, label) + loss.backward() + optimizer.step() + _, predicted = posteriors.max(1) + total += label.size(0) + correct += predicted.eq(label).sum().item() + targets.extend(label.cpu().detach().numpy().tolist()) + scores.extend([i.cpu().detach().numpy()[1] for i in posteriors]) + if args.scheduler: + scheduler3.step() + print(scheduler3.get_last_lr()) + + train_acc = correct / total + train_auc = roc_auc_score(targets, scores) + print('[Epoch %d] Train Acc: %.3f%% (%d/%d) AUC Score: %.3f' % (e, 100. 
* train_acc, correct, total, train_auc)) + + if e == epoch - 1: + test_acc, test_auc, stat_dict = test_two_features(args, e, model, test_dataloader, stat_dict) + save_attack_model(args, model) + else: + test_acc, test_auc, _ = test_two_features(args, e, model, test_dataloader) + + return model, train_acc, train_auc, test_acc, test_auc, stat_dict + + +def run_attack_three_features(args, posterior_feature_dim, train_dataloader, test_dataloader, stat_dict): + epoch = args.num_epochs + device = args.device + model = MLP_ATTACK_ALL(args.node_feature_dim, posterior_feature_dim, args.graph_feature_dim) + model = model.to(args.device) + model.apply(_weights_init_normal) + + loss_fcn = nn.CrossEntropyLoss() + loss_fcn = loss_fcn.to(args.device) + if args.optim == 'adam': + optimizer = th.optim.Adam(model.parameters(), lr=args.lr) + elif args.optim == 'sgd': + optimizer = th.optim.SGD(model.parameters(), lr=args.lr) + + scheduler3 = CosineAnnealingLR(optimizer, T_max=5, eta_min=0) + train_acc = 0.0 + + for e in range(epoch): + targets = [] + scores = [] + correct = 0 + total = 0 + model.train() + for _, node_feature, posterior_feature, graph_feature, label in train_dataloader: + optimizer.zero_grad() + node_feature, posterior_feature, graph_feature, label = node_feature.to(device), posterior_feature.to(device), graph_feature.to(device), label.to(device) + + outputs = model(node_feature, posterior_feature, graph_feature) + posteriors = F.softmax(outputs, dim=1) + loss = loss_fcn(posteriors, label) + loss.backward() + optimizer.step() + _, predicted = posteriors.max(1) + total += label.size(0) + correct += predicted.eq(label).sum().item() + targets.extend(label.cpu().detach().numpy().tolist()) + scores.extend([i.cpu().detach().numpy()[1] for i in posteriors]) + if args.scheduler: + scheduler3.step() + print(scheduler3.get_last_lr()) + train_acc = correct / total + train_auc = roc_auc_score(targets, scores) + print('[Epoch %d] Train Acc: %.3f%% (%d/%d) AUC Score: %.3f' % (e, 100. 
* train_acc, correct, total, train_auc)) + + if e == epoch - 1: + test_acc, test_auc, stat_dict = test_three_features(args, e, model, test_dataloader, stat_dict) + save_attack_model(args, model) + else: + test_acc, test_auc, _ = test_three_features(args, e, model, test_dataloader) + + return model, train_acc, train_auc, test_acc, test_auc, stat_dict + + + +def run_b0_attack(args, train_dataloader, test_dataloader, stat_dict): + epoch = args.num_epochs + device = args.device + model = Baseline0_MLP(args.node_feature_dim) + model = model.to(args.device) + model.apply(_weights_init_normal) + loss_fcn = nn.CrossEntropyLoss() + loss_fcn = loss_fcn.to(args.device) + + if args.optim == 'adam': + optimizer = th.optim.Adam(model.parameters(), lr=args.lr) + elif args.optim == 'sgd': + optimizer = th.optim.SGD(model.parameters(), lr=args.lr) + scheduler3 = CosineAnnealingLR(optimizer, T_max=args.num_epochs, eta_min=0) + train_acc = 0.0 + + for e in range(epoch): + + correct = 0 + total = 0 + targets = [] + scores = [] + model.train() + + for _, node_feature, label in train_dataloader: + optimizer.zero_grad() + node_feature, label = node_feature.to(device), label.to(device) + outputs = model(node_feature) + posteriors = F.softmax(outputs, dim=1) + + loss = loss_fcn(posteriors, label) + loss.backward() + optimizer.step() + _, predicted = posteriors.max(1) + total += label.size(0) + correct += predicted.eq(label).sum().item() + targets.extend(label.cpu().detach().numpy().tolist()) + scores.extend([i.cpu().detach().numpy()[1] for i in posteriors]) + if args.scheduler: + scheduler3.step() + print(scheduler3.get_last_lr()) + train_acc = correct / total + targets = np.array(targets) + scores = np.array(scores) + scores[np.isnan(scores)] = 0 + train_auc = roc_auc_score(targets, scores) + print('[Epoch %d] Train Acc: %.3f%% (%d/%d) AUC Score: %.3f' % (e, 100. 
* train_acc, correct, total, train_auc)) + + if e == epoch - 1: + test_acc, test_auc, stat_dict = test_gorn_feature(args, e, model, test_dataloader, stat_dict) + save_attack_model(args, model) + else: + test_acc, test_auc, _ = test_gorn_feature(args, e, model, test_dataloader) + + return model, train_acc, train_auc, test_acc, test_auc, stat_dict + +def run_b1_attack(args, train_dataloader, test_dataloader, stat_dict): + epoch = args.num_epochs + device = args.device + model = Baseline1_MLP(args.graph_feature_dim) + model = model.to(args.device) + model.apply(_weights_init_normal) + loss_fcn = nn.CrossEntropyLoss() + loss_fcn = loss_fcn.to(args.device) + + if args.optim == 'adam': + optimizer = th.optim.Adam(model.parameters(), lr=args.lr) + elif args.optim == 'sgd': + optimizer = th.optim.SGD(model.parameters(), lr=args.lr) + scheduler3 = CosineAnnealingLR(optimizer, T_max=args.num_epochs, eta_min=0) + train_acc = 0.0 + + for e in range(epoch): + + correct = 0 + total = 0 + targets = [] + scores = [] + model.train() + + for _, graph_feature, label in train_dataloader: + optimizer.zero_grad() + graph_feature, label = graph_feature.to(device), label.to(device) + outputs = model(graph_feature) + posteriors = F.softmax(outputs, dim=1) + + loss = loss_fcn(posteriors, label) + loss.backward() + optimizer.step() + _, predicted = posteriors.max(1) + total += label.size(0) + correct += predicted.eq(label).sum().item() + targets.extend(label.cpu().detach().numpy().tolist()) + scores.extend([i.cpu().detach().numpy()[1] for i in posteriors]) + if args.scheduler: + scheduler3.step() + print(scheduler3.get_last_lr()) + train_acc = correct / total + targets = np.array(targets) + scores = np.array(scores) + scores[np.isnan(scores)] = 0 + train_auc = roc_auc_score(targets, scores) + print('[Epoch %d] Train Acc: %.3f%% (%d/%d) AUC Score: %.3f' % (e, 100. 
* train_acc, correct, total, train_auc)) + + if e == epoch - 1: + test_acc, test_auc, stat_dict = test_gorn_feature(args, e, model, test_dataloader, stat_dict) + save_attack_model(args, model) + else: + test_acc, test_auc, _ = test_gorn_feature(args, e, model, test_dataloader) + + return model, train_acc, train_auc, test_acc, test_auc, stat_dict + + + + + +def run_b2_attack(args, train_dataloader, test_dataloader, stat_dict): + epoch = args.num_epochs + device = args.device + model = Baseline2_MLP(args.node_feature_dim, args.graph_feature_dim) + model = model.to(args.device) + model.apply(_weights_init_normal) + + loss_fcn = nn.CrossEntropyLoss() + loss_fcn = loss_fcn.to(args.device) + if args.optim == 'adam': + optimizer = th.optim.Adam(model.parameters(), lr=args.lr) + elif args.optim == 'sgd': + optimizer = th.optim.SGD(model.parameters(), lr=args.lr) + + scheduler3 = CosineAnnealingLR(optimizer, T_max=5, eta_min=0) + train_acc = 0.0 + + for e in range(epoch): + targets = [] + scores = [] + correct = 0 + total = 0 + model.train() + for _, node_feature, graph_feature, label in train_dataloader: + optimizer.zero_grad() + node_feature, graph_feature, label = node_feature.to(device), graph_feature.to(device), label.to(device) + + outputs = model(node_feature, graph_feature) + posteriors = F.softmax(outputs, dim=1) + loss = loss_fcn(posteriors, label) + loss.backward() + optimizer.step() + _, predicted = posteriors.max(1) + total += label.size(0) + correct += predicted.eq(label).sum().item() + targets.extend(label.cpu().detach().numpy().tolist()) + scores.extend([i.cpu().detach().numpy()[1] for i in posteriors]) + if args.scheduler: + scheduler3.step() + print(scheduler3.get_last_lr()) + train_acc = correct / total + train_auc = roc_auc_score(targets, scores) + print('[Epoch %d] Train Acc: %.3f%% (%d/%d) AUC Score: %.3f' % (e, 100. 
* train_acc, correct, total, train_auc)) + + if e == epoch - 1: + test_acc, test_auc, stat_dict = test_graph_node_features(args, e, model, test_dataloader, stat_dict) + save_attack_model(args, model) + else: + test_acc, test_auc, _ = test_graph_node_features(args, e, model, test_dataloader) + + return model, train_acc, train_auc, test_acc, test_auc, stat_dict \ No newline at end of file diff --git a/train/target.py b/train/target.py new file mode 100644 index 0000000..d111f16 --- /dev/null +++ b/train/target.py @@ -0,0 +1,89 @@ +import os +import dgl +import time +import numpy as np +import torch as th +import torch.nn.functional as F +import torch.optim as optim +import torch.nn as nn +th.manual_seed(1) + +from utils.load_model import get_gnn_model +from utils.metrics__ import compute_acc, evaluate + +def run_gnn(args, data): + train_g, test_g = data + + train_nid = th.tensor(range(0, len(train_g.nodes()))) + test_nid = th.tensor(range(0, len(test_g.nodes()))) + sampler = dgl.dataloading.MultiLayerFullNeighborSampler(2) + dataloader = dgl.dataloading.DataLoader( + train_g, + train_nid, + sampler, + batch_size=args.batch_size, + shuffle=True, + drop_last=False, + num_workers=args.num_workers) + + # Define model and optimizer + model = get_gnn_model(args) + print(model) + model = model.to(args.device) + loss_fcn = nn.CrossEntropyLoss() + loss_fcn = loss_fcn.to(args.device) + optimizer = optim.Adam(model.parameters(), lr=args.lr) + + # Training loop + avg = 0 + iter_tput = [] + for epoch in range(args.num_epochs): + tic = time.time() + + tic_step = time.time() + for step, (_, seeds, blocks) in enumerate(dataloader): + blocks = [block.int().to(args.device) for block in blocks] + batch_inputs = blocks[0].srcdata['features'] + batch_labels = blocks[-1].dstdata['labels'].to(device=args.device, dtype=th.long) + + # Compute loss and prediction + batch_pred = model(blocks, batch_inputs) + batch_pred = F.softmax(batch_pred, dim=1) + loss = loss_fcn(batch_pred, batch_labels) + optimizer.zero_grad() + loss.backward() + optimizer.step() + + iter_tput.append(len(seeds) / (time.time() - tic_step)) + if step % args.log_every == 0: + acc = compute_acc(batch_pred, batch_labels) + print('Epoch {:05d} | Step {:05d} | Loss {:.4f} | Train Acc {:.4f} | Speed (samples/sec) {:.4f}'.format( + epoch, step, loss.item(), acc.item(), np.mean(iter_tput[3:]))) + tic_step = time.time() + + toc = time.time() + print('Epoch %d, Time(s):%.4f'%(epoch, toc - tic)) + if epoch >= 5: + avg += toc - tic + if epoch % args.eval_every == 0 and epoch != 0: + train_acc, _ = evaluate(model, train_g, train_g.ndata['features'], train_g.ndata['labels'], train_nid, args.device) + print('Train Acc {:.4f}'.format(train_acc)) + + test_acc, _ = evaluate(model, test_g, test_g.ndata['features'], test_g.ndata['labels'], test_nid, args.device) + print('Test Acc: {:.4f}'.format(test_acc)) + + if args.prop and args.mode == 'shadow': + saving_path = os.path.join(args.model_save_path, '%s_%s_%s_%s%d.pth'%(args.setting, args.dataset, args.model, args.mode, args.prop)) + else: + saving_path = os.path.join(args.model_save_path, '%s_%s_%s_%s.pth'%(args.setting, args.dataset, args.model, args.mode)) + print("Finish training, save model to %s"%(saving_path)) + th.save(model.state_dict(), saving_path) + + #finish training + train_acc, _ = evaluate(model, train_g, train_g.ndata['features'], train_g.ndata['labels'], train_nid, args.device) + print('Final Train Acc {:.4f}'.format(train_acc)) + + test_acc, _ = evaluate(model, test_g, test_g.ndata['features'], 
test_g.ndata['labels'], test_nid, args.device) + print('Final Test Acc {:.4f}'.format(test_acc)) + + return train_acc, test_acc diff --git a/train_gnn.py b/train_gnn.py new file mode 100644 index 0000000..35576c6 --- /dev/null +++ b/train_gnn.py @@ -0,0 +1,91 @@ +import torch as th +import argparse +import os + +from train.target import run_gnn +from data.load_graph import split_target_shadow, load_graphgallery_data, split_train_test, split_target_shadow_by_prop +th.set_num_threads(1) + + +def arg_parse(): + argparser = argparse.ArgumentParser("multi-gpu training") + argparser.add_argument('--gpu', type=int, default=0, + help="GPU device ID. Use -1 for CPU training") + argparser.add_argument('--dataset', type=str, default='Cora') + argparser.add_argument('--num_epochs', type=int, default=200) + argparser.add_argument('--n_hidden', type=int, default=128) + argparser.add_argument('--gnn_layers', type=int, default=2) + argparser.add_argument('--batch_size', type=int, default=1000) + argparser.add_argument('--lr', type=float, default=0.001) + argparser.add_argument('--dropout', type=float, default=0.5) + argparser.add_argument('--log-every', type=int, default=20) + argparser.add_argument('--eval-every', type=int, default=5) + argparser.add_argument('--model', type=str, default='graphsage') + argparser.add_argument('--mode', type=str, default='target') + argparser.add_argument('--fan-out', type=str, default='10,25') + argparser.add_argument('--num_workers', type=int, default=4, + help="Number of sampling processes. Use 0 for no extra process.") + argparser.add_argument('--model_save_path', type=str, default='../data/save_model/gnn/') + argparser.add_argument('--attack_model_save_path', type=str, default='../data/save_model/mlp/') + argparser.add_argument('--load_trained', type=str, default='no') + argparser.add_argument('--dp', action='store_true') + argparser.add_argument('--epsilon', type=int, default=8) + argparser.add_argument('--delta', type=float, default=1e-5) + argparser.add_argument('--noise_seed', type=int, default=42) + argparser.add_argument('--noise_type', type=str, default='laplace') + argparser.add_argument('--perturb_type', type=str, default='continuous') + argparser.add_argument('--prop', type=int, + help="use a specified propotion of the shadow dataset") + argparser.add_argument("--seed", type=int, default=0, help="seed",) + args = argparser.parse_args() + + if args.gpu >= 0: + args.device = th.device('cuda:%d' % args.gpu) + else: + args.device = th.device('cpu') + + return args + + +if __name__ == '__main__': + + args = arg_parse() + args.model_save_path = f'./data/save_model/gnn/' + args.data_save_path = f'./data/' + log_dir = 'output/logs/' + os.makedirs(args.model_save_path, exist_ok=True) + os.makedirs(log_dir, exist_ok=True) + g, n_classes = load_graphgallery_data(args.dataset) + + in_feats = g.ndata['features'].shape[1] + args.in_feats = in_feats + args.n_classes = n_classes + args.setting = 'inductive' + + if args.prop: + target_g, shadow_g = split_target_shadow_by_prop(args, g) + else: + target_g, shadow_g = split_target_shadow(g) + + + if args.mode == 'target': + target_train_g, target_test_g = split_train_test(target_g) + + target_train_g.create_formats_() + target_test_g.create_formats_() + + run_data = target_train_g, target_test_g + + elif args.mode == 'shadow': + shadow_train_g, shadow_test_g = split_train_test(shadow_g) + + shadow_train_g.create_formats_() + shadow_test_g.create_formats_() + + run_data = shadow_train_g, shadow_test_g + + train_acc, test_acc = 
run_gnn(args, run_data)
+    prop = args.prop if args.prop else 100
+    with open(os.path.join(log_dir, "target_performance.txt"), "a") as wf:
+        wf.write("%s, %s, %s, %d, %.3f, %.3f, %.3d\n" % (args.dataset, args.model, args.mode, prop, train_acc, test_acc, args.seed))
+
diff --git a/utils/load_model.py b/utils/load_model.py
new file mode 100644
index 0000000..ccebf83
--- /dev/null
+++ b/utils/load_model.py
@@ -0,0 +1,21 @@
+import torch.nn.functional as F
+import torch as th
+
+from model.gnn import SAGE, GAT, GIN, GCN
+
+def get_gnn_model(config):
+    if config.model == 'graphsage':
+        model = SAGE(config.in_feats, config.n_hidden, config.n_classes, config.gnn_layers, F.relu, config.batch_size, config.num_workers, config.dropout)
+    elif config.model == 'gat':
+        model = GAT(config.in_feats, config.n_hidden, config.n_classes, config.gnn_layers, F.relu, config.batch_size, config.num_workers, config.dropout)
+    elif config.model == 'gin':
+        model = GIN(config.in_feats, config.n_hidden, config.n_classes, config.gnn_layers, F.relu, config.batch_size, config.num_workers, config.dropout)
+    elif config.model == 'gcn':
+        model = GCN(config.in_feats, config.n_hidden, config.n_classes, config.gnn_layers, F.relu, config.batch_size, config.num_workers, config.dropout)
+    return model
+
+def load_trained_gnn_model(model, model_path, device):
+    print("load model from: ", model_path)
+    state_dict = th.load(model_path, map_location=device)
+    model.load_state_dict(state_dict)
+    return model
\ No newline at end of file
diff --git a/utils/metrics__.py b/utils/metrics__.py
new file mode 100644
index 0000000..60f507f
--- /dev/null
+++ b/utils/metrics__.py
@@ -0,0 +1,28 @@
+import torch as th
+from scipy.special import softmax
+
+
+def compute_acc(pred, labels):
+    """
+    Compute the accuracy of prediction given the labels.
+    """
+    labels = labels.long()
+    return (th.argmax(pred, dim=1) == labels).float().sum() / len(pred)
+
+
+def evaluate(model, g, inputs, labels, val_nid, device):
+    """
+    Evaluate the model on the validation set specified by ``val_nid``.
+    g : The entire graph.
+    inputs : The features of all the nodes.
+    labels : The labels of all the nodes.
+    val_nid : the node Ids for validation.
+    batch_size : Number of nodes to compute at the same time.
+    device : The GPU device to evaluate on.
+    """
+    model.eval()
+    with th.no_grad():
+        pred = model.inference(g, inputs, device)
+    model.train()
+    return compute_acc(pred[val_nid], labels[val_nid]), softmax(pred[val_nid].detach().cpu().numpy(), axis=1)
+
diff --git a/utils/query_model.py b/utils/query_model.py
new file mode 100644
index 0000000..17a12bb
--- /dev/null
+++ b/utils/query_model.py
@@ -0,0 +1,27 @@
+import os
+import torch as th
+
+from utils.load_model import get_gnn_model, load_trained_gnn_model
+
+def query_trained_model(args, train_index, g, mode):
+    '''
+    query trained model using 0-hop training graph nodes
+    '''
+    if args.diff:
+        args.in_feats = args.target_in_feats if mode == 'target' else args.shadow_in_feats
+        args.n_classes = args.target_classes if mode == 'target' else args.shadow_classes
+        args.model = args.target_model if mode == 'target' else args.shadow_model
+    model = get_gnn_model(args).to(args.device)
+    model_save_path = os.path.join(args.model_save_path, '%s_%s_%s_%s.pth' % (args.setting, args.dataset, args.model, mode))
+    print(args.model_save_path)
+    print(f'Load {mode} model from: {model_save_path}')
+    model = load_trained_gnn_model(model, model_save_path, args.device)
+
+    model.eval()
+    with th.no_grad():
+        train_pred = model.inference(g, g.ndata['features'], args.device)
+    res_dict = {}
+    for i in range(len(train_index)):
+        res_dict[train_index[i]] = train_pred[train_index[i]]
+    print("Finish Querying %s Model!" % (mode))
+    return res_dict
\ No newline at end of file
diff --git a/utils/tradition_metrics.py b/utils/tradition_metrics.py
new file mode 100644
index 0000000..96ad430
--- /dev/null
+++ b/utils/tradition_metrics.py
@@ -0,0 +1,51 @@
+import networkx as nx
+import dgl
+
+def get_jaccard(nx_g, pairs):
+    jaccard_dict = {}
+    jaccard_tuple = nx.jaccard_coefficient(nx_g, pairs)
+    for u, v, p in jaccard_tuple:
+        jaccard_dict[(u, v)] = round(p, 3)
+    return jaccard_dict
+
+
+def get_attach(nx_g, pairs):
+    attach_dict = {}
+    attach_tuple = nx.preferential_attachment(nx_g, pairs)
+    for u, v, p in attach_tuple:
+        attach_dict[(u, v)] = round(p, 3)
+    return attach_dict
+
+def get_common_neighbors(nx_g, pairs):
+    neighbors_dict = {}
+    for start_id, end_id in pairs:
+        neighbors_dict[(start_id, end_id)] = len(list(nx.common_neighbors(nx_g, start_id, end_id)))
+    return neighbors_dict
+
+
+def get_features(args, g, pairs, label, mode):
+    k = 1
+
+    jaccard_dict = {}
+    attach_dict = {}
+    neighbors_dict = {}
+    nx_g = nx.Graph(dgl.to_networkx(g, node_attrs=["features"]))
+    for pair in pairs:
+        start_subgraph_nodes = list(nx.ego.ego_graph(nx_g, n=pair[0], radius=k).nodes())
+        end_subgraph_nodes = list(nx.ego.ego_graph(nx_g, n=pair[1], radius=k).nodes())
+        subgraph_nodes = start_subgraph_nodes + end_subgraph_nodes
+        subgraph = nx_g.subgraph(subgraph_nodes).copy()
+        start_id = pair[0]
+        end_id = pair[1]
+        if label == 1:
+            subgraph.remove_edge(start_id, end_id)
+        jaccard_tuple = nx.jaccard_coefficient(subgraph, [(start_id, end_id)])
+        for _, _, p in jaccard_tuple:
+            jaccard_dict[pair] = round(p, 3)
+        attach_tuple = nx.preferential_attachment(subgraph, [(start_id, end_id)])
+        for _, _, p in attach_tuple:
+            attach_dict[pair] = round(p, 3)
+        neighbors_dict[pair] = len(list(nx.common_neighbors(subgraph, start_id, end_id)))
+    print("Finish Generating trad_feature_dict...")
+    return jaccard_dict, attach_dict, neighbors_dict
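
A minimal usage sketch of the structural-feature helper added in utils/tradition_metrics.py (not part of the patch above; the toy graph, candidate pairs, and label value are illustrative assumptions):

import dgl
import torch as th

from utils.tradition_metrics import get_features

# Toy 4-node cycle; get_features() expects node features under the 'features' key
# because it calls dgl.to_networkx(g, node_attrs=["features"]) internally.
g = dgl.graph(([0, 1, 2, 3], [1, 2, 3, 0]))
g.ndata['features'] = th.randn(4, 8)

pairs = [(0, 1), (1, 3)]   # candidate node pairs to score
label = 0                  # 0: treat the pairs as non-edges, so no edge is removed first

jaccard, attach, common = get_features(None, g, pairs, label, mode='target')
print(jaccard, attach, common)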