diff --git a/examples/examples.py b/examples/examples.py index 6509d67..e0a4112 100644 --- a/examples/examples.py +++ b/examples/examples.py @@ -135,3 +135,6 @@ def test(): elif (dataset_name == 3): defense = Watermark_sage(PubMed(), 0.25) defense.watermark_attack(PubMed(), attack_name, dataset_name) + + +test() \ No newline at end of file diff --git a/new_code/attack/__init__.py b/new_code/attack/__init__.py new file mode 100644 index 0000000..36cf2d0 --- /dev/null +++ b/new_code/attack/__init__.py @@ -0,0 +1,2 @@ +from .attack import * +from .target import * diff --git a/new_code/attack/attack.py b/new_code/attack/attack.py new file mode 100644 index 0000000..fa437b3 --- /dev/null +++ b/new_code/attack/attack.py @@ -0,0 +1,116 @@ +import os +import numpy as np +import torch as th +import torch.nn as nn +import torch.nn.functional as F +from torch.optim.lr_scheduler import CosineAnnealingLR +from sklearn.metrics import roc_auc_score +th.manual_seed(0) + +from model.mlp import MLP_ATTACK, MLP_ATTACK_PLUS, MLP_ATTACK_PLUS2, MLP_ATTACK_ALL + +def _weights_init_normal(m): + classname = m.__class__.__name__ + if classname.find('Linear') != -1: + y = m.in_features + m.weight.data.normal_(0.0, 1/np.sqrt(y)) + m.bias.data.fill_(0) + +def save_attack_model(args, model): + if not os.path.exists(args.attack_model_save_path): + os.makedirs(args.attack_model_save_path) + save_name = os.path.join(args.attack_model_save_path, f'{args.attack_model_prefix}_{args.dataset}_{args.target_model}_{args.shadow_model}_{args.node_topology}_{args.feature}_{args.edge_feature}.pth') + th.save(model.state_dict(), save_name) + print(f"Finish training, save model to {save_name}") + +def load_attack_model(model, model_path, device): + print("load model from:", model_path) + state_dict = th.load(model_path, map_location=device) + model.load_state_dict(state_dict) + return model + +def test_features(args, epoch, model, test_dataloader, num_features, stat_dict=None): + device = args.device + test_acc, correct, total, scores, targets = 0.0, 0, 0, [], [] + stat_dict = stat_dict or {} + model.eval() + + with th.no_grad(): + for data in test_dataloader: + inputs = [x.to(device) for x in data[:-1]] + label = data[-1].to(device) + + outputs = model(*inputs) + posteriors = F.softmax(outputs, dim=1) + _, predicted = posteriors.max(1) + total += label.size(0) + correct += predicted.eq(label).sum().item() + + if epoch == args.num_epochs - 1 and not args.diff: + for i, posterior in zip(data[0], posteriors): + stat_dict[tuple(i.cpu().numpy())][f'{args.method}_attack_posterior'] = posterior.cpu().numpy() + + targets.extend(label.cpu().numpy().tolist()) + scores.extend([i.cpu().numpy()[1] for i in posteriors]) + + test_acc = correct / total + test_auc = roc_auc_score(targets, scores) + print(f'Test Acc: {100. 
* test_acc:.3f}% ({correct}/{total}) AUC Score: {test_auc:.3f}') + + return test_acc, test_auc, stat_dict + +def run_attack(args, in_dim, train_dataloader, test_dataloader, stat_dict, model_class, model_args=()): + epoch, device = args.num_epochs, args.device + model = model_class(*model_args).to(device) + model.apply(_weights_init_normal) + loss_fcn = nn.CrossEntropyLoss().to(device) + optimizer = th.optim.Adam(model.parameters(), lr=args.lr) if args.optim == 'adam' else th.optim.SGD(model.parameters(), lr=args.lr) + scheduler = CosineAnnealingLR(optimizer, T_max=epoch, eta_min=0) + train_acc = 0.0 + + for e in range(epoch): + correct, total, targets, scores = 0, 0, [], [] + model.train() + + for _, *features, label in train_dataloader: + optimizer.zero_grad() + features = [f.to(device) for f in features] + label = label.to(device) + + outputs = model(*features) + posteriors = F.softmax(outputs, dim=1) + loss = loss_fcn(posteriors, label) + loss.backward() + optimizer.step() + + _, predicted = posteriors.max(1) + total += label.size(0) + correct += predicted.eq(label).sum().item() + targets.extend(label.cpu().detach().numpy().tolist()) + scores.extend([i.cpu().detach().numpy()[1] for i in posteriors]) + + if args.scheduler: + scheduler.step() + + train_acc = correct / total + train_auc = roc_auc_score(targets, scores) + print(f'[Epoch {e}] Train Acc: {100. * train_acc:.3f}% ({correct}/{total}) AUC Score: {train_auc:.3f}') + + if e == epoch - 1: + test_acc, test_auc, stat_dict = test_features(args, e, model, test_dataloader, len(model_args), stat_dict) + save_attack_model(args, model) + else: + test_acc, test_auc, _ = test_features(args, e, model, test_dataloader, len(model_args)) + + return model, train_acc, train_auc, test_acc, test_auc, stat_dict + +# Example for running attack with different feature combinations +def run_attack_one_feature(args, in_dim, train_dataloader, test_dataloader, stat_dict): + return run_attack(args, in_dim, train_dataloader, test_dataloader, stat_dict, MLP_ATTACK) + +def run_attack_two_features(args, posterior_feature_dim, train_dataloader, test_dataloader, stat_dict): + model_class = MLP_ATTACK_PLUS2 if args.feature == 'posteriors_graph' or args.feature == 'label_graph' else MLP_ATTACK_PLUS + return run_attack(args, posterior_feature_dim, train_dataloader, test_dataloader, stat_dict, model_class, (args.graph_feature_dim, posterior_feature_dim) if model_class == MLP_ATTACK_PLUS2 else (args.node_feature_dim, posterior_feature_dim)) + +def run_attack_three_features(args, posterior_feature_dim, train_dataloader, test_dataloader, stat_dict): + return run_attack(args, posterior_feature_dim, train_dataloader, test_dataloader, stat_dict, MLP_ATTACK_ALL, (args.node_feature_dim, posterior_feature_dim, args.graph_feature_dim)) diff --git a/new_code/attack/target.py b/new_code/attack/target.py new file mode 100644 index 0000000..7fd6523 --- /dev/null +++ b/new_code/attack/target.py @@ -0,0 +1,123 @@ +import os +import dgl +import time +import numpy as np +import torch as th +import torch.nn.functional as F +import torch.optim as optim +import torch.nn as nn +th.manual_seed(1) + +from utils.load_model import get_gnn_model +from utils.model_evaluation import compute_accuracy, evaluate_model + +def get_dataloader(train_graph, args): + # Set up the data loader for training + train_node_ids = th.tensor(range(0, len(train_graph.nodes()))) + sampler = dgl.dataloading.MultiLayerFullNeighborSampler(2) # Sampling neighbors + return dgl.dataloading.DataLoader( + train_graph, + 
train_node_ids, + sampler, + batch_size=args.batch_size, + shuffle=True, + drop_last=False, + num_workers=args.num_workers + ) + +def initialize_model_and_optimizer(args): + # Initialize model, loss function, and optimizer + model = get_gnn_model(args).to(args.device) + loss_function = nn.CrossEntropyLoss().to(args.device) + optimizer = optim.Adam(model.parameters(), lr=args.lr) + return model, loss_function, optimizer + +def training_step(model, blocks, batch_inputs, batch_labels, loss_function, optimizer, args, tic_step): + # One training step: forward, loss calculation, and backpropagation + blocks = [block.int().to(args.device) for block in blocks] + batch_pred = model(blocks, batch_inputs) # Model's prediction + batch_pred = F.softmax(batch_pred, dim=1) # Apply softmax for classification + loss = loss_function(batch_pred, batch_labels) # Compute loss + optimizer.zero_grad() # Zero out gradients + loss.backward() # Backpropagate + optimizer.step() # Update weights + return loss, batch_pred + +def log_info(epoch, step, loss, batch_pred, batch_labels, iter_tput): + # Print training details like loss, accuracy, and throughput + acc = compute_accuracy(batch_pred, batch_labels) + print(f'Epoch {epoch:05d} | Step {step:05d} | Loss {loss.item():.4f} | ' + f'Train Acc {acc.item():.4f} | Speed (samples/sec) {np.mean(iter_tput[3:]):.4f}') + +def evaluate_and_log(model, train_graph, test_graph, train_node_ids, test_node_ids, args): + # Evaluate the model on train and test datasets + train_acc, _ = evaluate_model(model, train_graph, train_graph.ndata['features'], + train_graph.ndata['labels'], train_node_ids, args.device) + print(f'Train Acc {train_acc:.4f}') + + test_acc, _ = evaluate_model(model, test_graph, test_graph.ndata['features'], + test_graph.ndata['labels'], test_node_ids, args.device) + print(f'Test Acc: {test_acc:.4f}') + +def save_trained_model(model, args): + # Save the trained model to disk + model_save_path = os.path.join( + args.model_save_path, + f'{args.setting}_{args.dataset}_{args.model}_{args.mode}' + f'_{args.prop if args.prop else ""}.pth' + ) + print(f"Training complete, model saved to {model_save_path}") + th.save(model.state_dict(), model_save_path) + +def run_gnn(args, data): + train_graph, test_graph = data + train_node_ids = th.tensor(range(0, len(train_graph.nodes()))) + test_node_ids = th.tensor(range(0, len(test_graph.nodes()))) + + # Initialize DataLoader for training + dataloader = get_dataloader(train_graph, args) + + # Initialize model, loss function, and optimizer + model, loss_function, optimizer = initialize_model_and_optimizer(args) + + iter_tput = [] # List for tracking throughput during training + avg_epoch_time = 0 # Average time taken per epoch + + # Training loop + for epoch in range(args.num_epochs): + epoch_start_time = time.time() + + # Loop through batches + for step, (_, seeds, blocks) in enumerate(dataloader): + batch_inputs = blocks[0].srcdata['features'] + batch_labels = blocks[-1].dstdata['labels'].to(device=args.device, dtype=th.long) + + # Perform a training step (forward, backward, and optimization) + loss, batch_pred = training_step(model, blocks, batch_inputs, batch_labels, loss_function, optimizer, args, epoch_start_time) + + # Track throughput and log training info every few steps + iter_tput.append(len(seeds) / (time.time() - epoch_start_time)) + if step % args.log_every == 0: + log_info(epoch, step, loss, batch_pred, batch_labels, iter_tput) + + epoch_start_time = time.time() + + # Log time taken for each epoch + epoch_end_time = 
time.time() + print(f'Epoch {epoch}, Time(s): {epoch_end_time - epoch_start_time:.4f}') + + # Calculate average time after the first few epochs + if epoch >= 5: + avg_epoch_time += epoch_end_time - epoch_start_time + + # Evaluate the model periodically + if epoch % args.eval_every == 0 and epoch != 0: + evaluate_and_log(model, train_graph, test_graph, train_node_ids, test_node_ids, args) + + # Save the trained model + save_trained_model(model, args) + + # Final evaluation + evaluate_and_log(model, train_graph, test_graph, train_node_ids, test_node_ids, args) + + return evaluate_and_log(model, train_graph, test_graph, train_node_ids, test_node_ids, args) diff --git a/new_code/load_data/__init__.py b/new_code/load_data/__init__.py new file mode 100644 index 0000000..7ad26a8 --- /dev/null +++ b/new_code/load_data/__init__.py @@ -0,0 +1,4 @@ +from .load_graph import * +from .inductive_split import * +from .generate_xy import * +from .batch_data import * \ No newline at end of file diff --git a/new_code/load_data/batch_data.py b/new_code/load_data/batch_data.py new file mode 100644 index 0000000..fe09782 --- /dev/null +++ b/new_code/load_data/batch_data.py @@ -0,0 +1,113 @@ +import dgl +import networkx as nx +from utils.model_query import query_trained_model +from load_data.generate_xy import generate_features + + +def get_batch(args, batch_pairs, g, k, mode): + """ + Generates a batch of subgraphs and queries posteriors for node pairs. + + Args: + args: Arguments for the model and batch generation. + batch_pairs: List of node pairs for which the batch is generated. + g: The graph object (DGL graph). + k: The number of hops to consider for subgraphs. + mode: Mode for querying the trained model. + + Returns: + posteriors_dict_batch: A dictionary of posteriors. + index_mapping_dict_batch: A dictionary of index mappings for nodes. + """ + query_graph_batch, index_mapping_dict_batch = get_khop_query_graph_batch(g, batch_pairs, k) + index_update_batch = [node for _, nodes in index_mapping_dict_batch.items() for node in nodes] + posteriors_dict_batch = query_trained_model(args, index_update_batch, query_graph_batch, mode) + + print('Finish generating posteriors and mapping dict...') + return posteriors_dict_batch, index_mapping_dict_batch + + +def generate_batch_features(args, batch_pairs, g, k, mode, feature_type): + """ + A generic function to generate features for different types of graph-based batches. + + Args: + args: Arguments for the model and batch generation. + batch_pairs: List of node pairs for which the batch is generated. + g: The graph object (DGL graph). + k: The number of hops to consider for subgraphs. + mode: Mode for querying the trained model. + feature_type: The type of features to generate ('node', 'graph', or 'all'). + + Returns: + Features, labels, and statistical data for the batch. 
+ """ + posteriors_dict_batch, index_mapping_dict_batch = get_batch(args, batch_pairs, g, k, mode) + + if feature_type == 'node': + return generate_features(args, g, batch_pairs, posteriors_dict_batch, index_mapping_dict_batch) + elif feature_type == 'graph': + return generate_features(args, g, batch_pairs, posteriors_dict_batch, mode, index_mapping_dict_batch) + elif feature_type == 'all': + return generate_features(args, g, batch_pairs, posteriors_dict_batch, mode, index_mapping_dict_batch) + else: + return generate_features(args, batch_pairs, posteriors_dict_batch, index_mapping_dict_batch) + + +def get_khop_query_graph_batch(g, pairs, k=2): + """ + Generates a k-hop subgraph for each node pair and returns a batched graph. + + Args: + g: The graph object (DGL graph). + pairs: List of node pairs for which k-hop neighborhoods are generated. + k: The number of hops to consider for subgraphs. + + Returns: + A batched graph containing the k-hop subgraphs and a mapping of node indices. + """ + nx_g = dgl.to_networkx(g, node_attrs=["features"]) + subgraph_list = [] + index_mapping_dict = {} + bias = 0 + + for pair in pairs: + start_node, end_node = pair + nx_g.remove_edges_from([(start_node, end_node), (end_node, start_node)]) + + node_index = [] + for node in (start_node, end_node): + node_neighbors = list(nx.ego.ego_graph(nx_g, n=node, radius=k).nodes()) + node_new_index = node_neighbors.index(node) + subgraph_k_hop = g.subgraph(node_neighbors) + subgraph_list.append(subgraph_k_hop) + node_index.append(node_new_index + bias) + bias += len(node_neighbors) + + nx_g.add_edges_from([(start_node, end_node), (end_node, start_node)]) + index_mapping_dict[(start_node, end_node)] = (node_index[0], node_index[1]) + + update_query_graph = dgl.batch(subgraph_list) + print("Get k-hop query graph") + return update_query_graph, index_mapping_dict + + +# Wrapper functions for specific batch feature types +def get_batch_posteriors(args, batch_pairs, g, k, mode): + batch_features, batch_labels, batch_stat_dict = generate_batch_features(args, batch_pairs, g, k, mode, 'default') + return batch_features, batch_labels, batch_stat_dict + + +def get_batch_posteriors_node(args, batch_pairs, g, k, mode): + batch_node_features, batch_posteriors_features, batch_labels, batch_stat_dict = generate_batch_features(args, batch_pairs, g, k, mode, 'node') + return batch_node_features, batch_posteriors_features, batch_labels, batch_stat_dict + + +def get_batch_posteriors_graph(args, batch_pairs, g, k, mode): + batch_posteriors_features, batch_graph_features, batch_labels, batch_stat_dict = generate_batch_features(args, batch_pairs, g, k, mode, 'graph') + return batch_posteriors_features, batch_graph_features, batch_labels, batch_stat_dict + + +def get_batch_posteriors_node_graph(args, batch_pairs, g, k, mode): + batch_node_features, batch_posteriors_features, batch_graph_features, batch_labels, batch_stat_dict = generate_batch_features(args, batch_pairs, g, k, mode, 'all') + return batch_node_features, batch_posteriors_features, batch_graph_features, batch_labels, batch_stat_dict diff --git a/new_code/load_data/generate_xy.py b/new_code/load_data/generate_xy.py new file mode 100644 index 0000000..b4b41ff --- /dev/null +++ b/new_code/load_data/generate_xy.py @@ -0,0 +1,162 @@ +from tqdm import tqdm +import numpy as np +import torch as th +import torch.nn.functional as F +from scipy.spatial import distance +from utils.graph_features import generate_graph_features + +# Set random seeds for reproducibility +np.random.seed(0) 
+th.manual_seed(0) +th.backends.cudnn.deterministic = True +th.backends.cudnn.benchmark = False + + +def entropy(P: np.ndarray) -> np.ndarray: + """Calculate the entropy of a probability distribution.""" + epsilon = 1e-5 # Avoid division by zero + P = P + epsilon + return np.array([-np.sum(P * np.log(P))]) + + +def js_divergence(a: np.ndarray, b: np.ndarray) -> float: + """Calculate the Jensen-Shannon divergence.""" + return distance.jensenshannon(a, b, base=2.0) + + +def cosine_sim(a: np.ndarray, b: np.ndarray) -> float: + """Calculate the cosine similarity.""" + return 1 - distance.cosine(a, b) + + +def correlation_dist(a: np.ndarray, b: np.ndarray) -> float: + """Calculate the correlation distance.""" + return distance.correlation(a, b) + + +def pair_wise(a: np.ndarray, b: np.ndarray, edge_feature: str) -> np.ndarray: + """Generate pair-wise features based on the specified edge feature type.""" + if edge_feature == 'simple': + return np.concatenate([a, b]) + elif edge_feature == 'add': + return a + b + elif edge_feature == 'hadamard': + return a * b + elif edge_feature == 'average': + return (a + b) / 2 + elif edge_feature == 'l1': + return np.abs(a - b) + elif edge_feature == 'l2': + return (a - b) ** 2 + elif edge_feature == 'all': + return np.concatenate([ + a * b, + (a + b) / 2, + np.abs(a - b), + (a - b) ** 2 + ]) + else: + raise ValueError(f"Unknown edge feature type: {edge_feature}") + + +def sim_metrics(a: np.ndarray, b: np.ndarray) -> np.ndarray: + """Calculate similarity metrics and entropy-based features.""" + a_entropy = entropy(a) + b_entropy = entropy(b) + entr_feature = pair_wise(a_entropy, b_entropy, 'all') + sim_feature = np.array([js_divergence(a, b), cosine_sim(a, b), correlation_dist(a, b)]) + return np.concatenate([entr_feature, sim_feature]) + + +def process_pair( + start_id: int, + end_id: int, + posteriors_dict: dict, + features: np.ndarray, + args, + index_mapping_dict=None +) -> dict: + """Process a single pair of nodes to generate features.""" + if index_mapping_dict: + start_id, end_id = index_mapping_dict[(start_id, end_id)] + + start_posterior = F.softmax(posteriors_dict[start_id], dim=0).numpy() + end_posterior = F.softmax(posteriors_dict[end_id], dim=0).numpy() + + if args.label_only: + label_dim = len(start_posterior) + start_label = np.eye(label_dim)[np.argmax(start_posterior)] + end_label = np.eye(label_dim)[np.argmax(end_posterior)] + posterior_feature = pair_wise(start_label, end_label, 'add') + elif args.diff: + posterior_feature = sim_metrics(start_posterior, end_posterior) + posterior_feature[np.isnan(posterior_feature)] = 0 + else: + posterior_feature = pair_wise(start_posterior, end_posterior, args.edge_feature) + + node_feature = None + if features is not None: + start_feature = features[start_id].cpu().numpy() + end_feature = features[end_id].cpu().numpy() + if args.diff: + node_feature = sim_metrics(start_feature, end_feature) + node_feature[np.isnan(node_feature)] = 0 + else: + node_feature = pair_wise(start_feature, end_feature, 'hadamard') + + return { + "posterior_feature": posterior_feature, + "node_feature": node_feature, + "start_posterior": start_posterior, + "end_posterior": end_posterior, + } + + +def generate_features( + args, + pairs, + posteriors_dict, + label, + g=None, + index_mapping_dict=None, + mode=None +): + """Generate features and labels for all pairs.""" + features = g.ndata['features'] if g else None + stat_dict = {} + node_features, posterior_features, graph_features = [], [], [] + labels = [label] * len(pairs) + + if 
mode: + jaccard_dict, attach_dict, neighbor_dict = generate_graph_features(args, g, pairs, label, mode) + + for start_id, end_id in tqdm(pairs): + pair_result = process_pair( + start_id, end_id, posteriors_dict, features, args, index_mapping_dict + ) + posterior_features.append(pair_result["posterior_feature"]) + if pair_result["node_feature"] is not None: + node_features.append(pair_result["node_feature"]) + if mode: + graph_feature = [ + jaccard_dict[(start_id, end_id)], + attach_dict[(start_id, end_id)], + neighbor_dict[(start_id, end_id)], + ] + graph_features.append(graph_feature) + stat_dict[(start_id, end_id)] = { + 'node_ids': (start_id, end_id), + f'{args.node_topology}_start_posterior': pair_result["start_posterior"], + f'{args.node_topology}_end_posterior': pair_result["end_posterior"], + f'{args.node_topology}_posterior_feature': pair_result["posterior_feature"], + 'label': label, + } + + print(f"Features and labels of {len(labels)} pairs have been generated") + return { + "node_features": node_features, + "posterior_features": posterior_features, + "graph_features": graph_features if mode else None, + "labels": labels, + "stat_dict": stat_dict, + } diff --git a/new_code/load_data/inductive_split.py b/new_code/load_data/inductive_split.py new file mode 100644 index 0000000..d5c0be2 --- /dev/null +++ b/new_code/load_data/inductive_split.py @@ -0,0 +1,131 @@ +import numpy as np +import torch as th +import dgl +import os +from tqdm import tqdm +from multiprocessing import Pool +import sys +from utils.model_query import query_trained_model + + +from load_data.batch_data import get_batch_posteriors +from load_data.generate_xy import generate_features + + + +# Helper functions to avoid redundancy +def make_dirs(): + os.makedirs('./data/pairs/', exist_ok=True) + os.makedirs('./data/posteriors/', exist_ok=True) + os.makedirs('./data/mapping/', exist_ok=True) + +def generate_pairs(g, train_index): + start_ids, end_ids = g.edges() + postive_pairs = [] + negative_pairs = [] + + for i in tqdm(range(len(start_ids))): + if start_ids[i] < end_ids[i]: + postive_pairs.append((start_ids[i].item(), end_ids[i].item())) + + num_pos_pairs = len(postive_pairs) + print(f"There are {num_pos_pairs} edges in the training graph!") + + while len(negative_pairs) < num_pos_pairs: + a, b = np.random.choice(list(train_index), 2, replace=False) + random_pair = (a, b) if a < b else (b, a) + if random_pair not in postive_pairs: + negative_pairs.append(random_pair) + + print("Finish Generating Pairs!") + return postive_pairs, negative_pairs + +def remove_neighbor_edge(g): + """ + Remove all edges from a graph, only save self connection + """ + start_ids, end_ids = g.edges() + delete_eid = [i for i in tqdm(range(len(start_ids))) if start_ids[i] != end_ids[i]] + g = dgl.remove_edges(g, th.tensor(delete_eid)) + return g + +def process_graph_pairs(args, g, mode, positive_pairs, negative_pairs, stat_dict): + if args.node_topology == '0-hop': + zero_hop_g = remove_neighbor_edge(g) + posteriors_dict = query_trained_model(args, np.arange(len(g.nodes())), zero_hop_g, mode) + + positive_features, positive_labels, positive_stat_dict = generate_features(args, positive_pairs, posteriors_dict, 1) + negative_features, negative_labels, negative_stat_dict = generate_features(args, negative_pairs, posteriors_dict, 0) + + stat_dict.update({**positive_stat_dict, **negative_stat_dict}) + features = positive_features + negative_features + labels = positive_labels + negative_labels + + elif args.node_topology in ['1-hop', '2-hop']: + k 
= 1 if args.node_topology == '1-hop' else 2 + features, labels = [], [] + flag = 1 + for pairs in (positive_pairs, negative_pairs): + label = flag + flag -= 1 + batch_size = 4096 + num_batch = len(pairs) // batch_size + pool = Pool(12) + results = [] + + for i in tqdm(range(num_batch+1)): + batch_pairs = pairs[i*batch_size:(i+1)*batch_size] if i < num_batch else pairs[i*batch_size:] + batch_result = pool.apply_async(get_batch_posteriors, args=(args, batch_pairs, g, k, i, label, mode)) + results.append(batch_result) + + pool.close() + pool.join() + for batch_result in results: + batch_result = batch_result.get() + features.extend(batch_result[0]) + labels.extend(batch_result[1]) + stat_dict.update(batch_result[2]) + + return np.array(features).astype(np.float32), th.from_numpy(np.array(features)), labels + +def inductive_split(args, train_g, test_g, mode_func): + make_dirs() + dataloaders = [] + stat_dicts = [] + for count, g in enumerate([train_g, test_g]): + mode = f"shadow{str(args.prop)}" if args.prop else "shadow" + mode = mode if count == 0 else "target" + if args.diff: + args.dataset = args.target_dataset if mode == 'target' else args.shadow_dataset + index = np.arange(len(g.nodes())) + stat_dict = {} + + positive_pairs, negative_pairs = generate_pairs(g, index) + print(f"Finish Generating Pairs...") + + features, features_tensor, labels = mode_func(args, g, mode, positive_pairs, negative_pairs, stat_dict) + + indices = th.from_numpy(np.array(positive_pairs + negative_pairs)) + labels = th.tensor(labels) + + dataset = th.utils.data.TensorDataset(indices, features_tensor, labels) + dataloader = th.utils.data.DataLoader(dataset, batch_size=args.batch_size, shuffle=True) + + dataloaders.append(dataloader) + stat_dicts.append(stat_dict) + + feature_dim = features[0].shape[0] + return dataloaders[0], dataloaders[1], feature_dim, stat_dicts[1] + + +def inductive_split_posteriors(args, train_g, test_g): + return inductive_split(args, train_g, test_g, process_graph_pairs) + +def inductive_split_plus(args, train_g, test_g): + return inductive_split(args, train_g, test_g, process_graph_pairs) + +def inductive_split_plus2(args, train_g, test_g): + return inductive_split(args, train_g, test_g, process_graph_pairs) + +def inductive_split_all(args, train_g, test_g): + return inductive_split(args, train_g, test_g, process_graph_pairs) diff --git a/new_code/load_data/load_graph.py b/new_code/load_data/load_graph.py new file mode 100644 index 0000000..724e09e --- /dev/null +++ b/new_code/load_data/load_graph.py @@ -0,0 +1,77 @@ +import numpy as np +import torch as th +import dgl +import networkx as nx +from tqdm import tqdm +from dgl.data import CiteseerGraphDataset, CoraGraphDataset, PubmedGraphDataset + +# Set random seeds for reproducibility +np.random.seed(0) +th.manual_seed(0) +th.backends.cudnn.deterministic = True +th.backends.cudnn.benchmark = False + +def load_dataset(dataset_name): + if dataset_name == 'citeseer': + data = CiteseerGraphDataset() + elif dataset_name == 'cora': + data = CoraGraphDataset() + elif dataset_name == 'pubmed': + data = PubmedGraphDataset() + else: + raise ValueError("Unsupported dataset name") + + graph = data[0] + nx_g = nx.Graph(graph.to_networkx()) + + for node_id in nx_g.nodes(): + nx_g.nodes[node_id]["features"] = graph.ndata['feat'][node_id].numpy() + nx_g.nodes[node_id]["labels"] = graph.ndata['label'][node_id].item() + + dgl_graph = dgl.from_networkx(nx_g, node_attrs=['features', 'labels']) + dgl_graph = dgl.add_self_loop(dgl_graph) + dgl_graph = 
dgl.to_simple(dgl_graph, copy_ndata=True) + dgl_graph = dgl.to_bidirected(dgl_graph, copy_ndata=True) + + print(f"Graph density: {nx.density(nx_g)}") + print(f"Classes: {len(set(graph.ndata['label'].tolist()))}") + print(f"Feature dim: {graph.ndata['feat'].shape[1]}") + print(f"Graph has {dgl_graph.number_of_nodes()} nodes and {dgl_graph.number_of_edges()} edges.") + + return dgl_graph, len(set(graph.ndata['label'].tolist())) + +def node_sample(g, prop=0.5): + node_number = g.number_of_nodes() + node_indices = np.random.permutation(node_number) + split_length = int(node_number * prop) + return np.sort(node_indices[:split_length]), np.sort(node_indices[split_length:]) + +def remove_neighbor_edge_by_prop(g, prop=0.2): + real_pairs = [(i, start.item(), end.item()) for i, (start, end) in enumerate(zip(*g.edges())) if start < end] + delete_edge_num = int(len(real_pairs) * prop) + + print(f"Real Pairs Number (no self-loop & reverse edge): {len(real_pairs)}") + print(f"Delete real pairs number (no self-loop & reverse edge): {delete_edge_num}") + + delete_ids = np.random.choice(len(real_pairs), delete_edge_num, replace=False) + delete_eids = [g.edge_ids(start, end) for i in delete_ids for start, end in real_pairs[i][1:]] + + g = dgl.remove_edges(g, th.tensor(delete_eids)) + + print(f"Deleted {len(delete_eids)} edges") + print(f"Remaining edges: {g.number_of_edges()}") + return g, [(real_pairs[i][1], real_pairs[i][2]) for i in delete_ids] + +def split_target_shadow(g): + target_index, shadow_index = node_sample(g) + return g.subgraph(target_index), g.subgraph(shadow_index) + +def split_target_shadow_by_prop(args, g): + target_g, shadow_g = split_target_shadow(g) + shadow_index_prop, _ = node_sample(shadow_g, args.prop * 0.01) + shadow_g = shadow_g.subgraph(shadow_index_prop) + return target_g, shadow_g + +def split_train_test(g): + train_index, test_index = node_sample(g, 0.8) + return g.subgraph(train_index), g.subgraph(test_index) diff --git a/new_code/mlp_attack.py b/new_code/mlp_attack.py new file mode 100644 index 0000000..4ea42e9 --- /dev/null +++ b/new_code/mlp_attack.py @@ -0,0 +1,159 @@ +import os +import torch as th +import networkx as nx +import datetime +import argparse +import numpy as np +import random + +from load_data.load_graph import load_dataset, split_target_shadow, split_target_shadow_by_prop, split_train_test +from load_data.inductive_split import inductive_split_posteriors, inductive_split_plus, inductive_split_plus2, inductive_split_all +from attack.attack import run_attack, run_attack_three_features, run_attack_two_features + + +th.set_num_threads(1) + +def arg_parse(): + argparser = argparse.ArgumentParser("multi-gpu training") + argparser.add_argument('--gpu', type=int, default=-1, help="GPU device ID. 
Use -1 for CPU training") + argparser.add_argument('--dataset', type=str, default='Cora') + argparser.add_argument('--node_topology', type=str, default='0-hop', help="node topology used to query the model 0-hop, 2-hop") + argparser.add_argument('--num_epochs', type=int, default=200) + argparser.add_argument('--edge_feature', type=str, default='all') + argparser.add_argument('--n_hidden', type=int, default=128) + argparser.add_argument('--mlp_layers', type=int, default=3) + argparser.add_argument('--gnn_layers', type=int, default=2) + argparser.add_argument('--batch_size', type=int, default=1000) + argparser.add_argument('--lr', type=float, default=0.001) + argparser.add_argument('--log-every', type=int, default=20) + argparser.add_argument('--eval-every', type=int, default=10) + argparser.add_argument('--dropout', type=float, default=0.5) + argparser.add_argument("--seed", type=int, default=0, help="seed") + argparser.add_argument('--optim', type=str, default='adam') + argparser.add_argument('--target_model', type=str, default='graphsage') + argparser.add_argument('--shadow_model', type=str, default='graphsage') + argparser.add_argument('--num_workers', type=int, default=0, help="Number of sampling processes. Use 0 for no extra process.") + argparser.add_argument('--model_save_path', type=str, default='../data/save_model/gnn/') + argparser.add_argument('--attack_model_save_path', type=str, default='../data/save_model/mlp/') + argparser.add_argument('--load_trained', type=str, default='no') + argparser.add_argument('--plus', action='store_true') + argparser.add_argument('--plus2', action='store_true') + argparser.add_argument('--all', action='store_true') + argparser.add_argument('--scheduler', action='store_true') + argparser.add_argument('--perturb_type', type=str, default='discrete') + argparser.add_argument('--dp', action='store_true') + argparser.add_argument('--epsilon', type=int, default=8) + argparser.add_argument('--label_only', action='store_true') + argparser.add_argument('--soft_prob', action='store_true') + argparser.add_argument('--T', type=int, default=20) + argparser.add_argument('--prop', type=int, help="use a specified proportion of the shadow dataset") + args = argparser.parse_args() + + # args.device = th.device('cuda:%d' % args.gpu if args.gpu >= 0 else 'cpu') + args.device = th.device('cpu') + args.trad = False + return args + +def initialize_seeds(args): + random.seed(args.seed) + np.random.seed(args.seed) + th.manual_seed(args.seed) + +def prepare_graph_data(args): + g, n_classes = load_dataset(args.dataset) + print(nx.density(g.to_networkx())) + + args.diff = False + args.in_feats = g.ndata['features'].shape[1] + args.node_feature_dim = args.in_feats + args.graph_feature_dim = 3 + args.n_classes = n_classes + args.setting = 'inductive' + + if args.prop: + target_g, shadow_g = split_target_shadow_by_prop(args, g) + else: + target_g, shadow_g = split_target_shadow(g) + + target_train_g, target_test_g = split_train_test(target_g) + shadow_train_g, shadow_test_g = split_train_test(shadow_g) + target_train_g.create_formats_() + shadow_train_g.create_formats_() + + return target_train_g, shadow_train_g, n_classes + +def print_graph_statistics(target_train_g, shadow_train_g, n_classes, args): + print(f"Target Train Graph Num of Edges: {len(target_train_g.edges()[0])}") + print(f"Target Train Graph Num of Nodes: {len(target_train_g.nodes())}") + print(f"Target Train Graph Density: {nx.density(target_train_g.to_networkx()):.5f}") + + print(f"Shadow Train Graph Num of Edges: 
{len(shadow_train_g.edges()[0])}") + print(f"Shadow Train Graph Num of Nodes: {len(shadow_train_g.nodes())}") + print(f"Shadow Train Graph Density: {nx.density(shadow_train_g.to_networkx()):.5f}") + + print(f"Classes: {n_classes}") + print(f"Feature dim: {args.in_feats}") + +def perform_attack(args, target_train_g, shadow_train_g): + if args.plus: + return perform_plus_attack(args, target_train_g, shadow_train_g) + elif args.plus2: + return perform_plus2_attack(args, target_train_g, shadow_train_g) + elif args.all: + return perform_all_attack(args, target_train_g, shadow_train_g) + else: + return perform_default_attack(args, target_train_g, shadow_train_g) + +def perform_plus_attack(args, target_train_g, shadow_train_g): + args.feature = 'label_node' if args.label_only else 'posteriors_node' + args.method = f'{args.node_topology}_{args.feature}' + train_dataloader, test_dataloader, posterior_feature_dim, stat_dict = inductive_split_plus(args, shadow_train_g, target_train_g) + return run_attack_two_features(args, posterior_feature_dim, train_dataloader, test_dataloader, stat_dict) + +def perform_plus2_attack(args, target_train_g, shadow_train_g): + args.feature = 'label_graph' if args.label_only else 'posteriors_graph' + args.method = f'{args.node_topology}_{args.feature}' + train_dataloader, test_dataloader, posterior_feature_dim, stat_dict = inductive_split_plus2(args, shadow_train_g, target_train_g) + return run_attack_two_features(args, posterior_feature_dim, train_dataloader, test_dataloader, stat_dict) + +def perform_all_attack(args, target_train_g, shadow_train_g): + args.feature = 'label_node_graph' if args.label_only else 'posteriors_node_graph' + args.method = f'{args.node_topology}_{args.feature}' + train_dataloader, test_dataloader, posterior_feature_dim, stat_dict = inductive_split_all(args, shadow_train_g, target_train_g) + return run_attack_three_features(args, posterior_feature_dim, train_dataloader, test_dataloader, stat_dict) + +def perform_default_attack(args, target_train_g, shadow_train_g): + args.feature = 'label' if args.label_only else 'posteriors' + args.method = f'{args.node_topology}_{args.feature}' + train_dataloader, test_dataloader, feature_dim, stat_dict = inductive_split_posteriors(args, shadow_train_g, target_train_g) + return run_attack(args, feature_dim, train_dataloader, test_dataloader, stat_dict) + +def save_results(args, stat_dict, log_dir, k, train_acc, train_auc, test_acc, test_auc): + pickle_path = os.path.join(args.data_save_path, f'{args.setting}_{args.dataset}_{args.target_model}_{args.shadow_model}_{args.method}.pickle') + th.save(stat_dict, pickle_path) + + is_scheduled = 1 if args.scheduler else 0 + with open(os.path.join(log_dir, "attack_performance.txt"), "a") as wf: + wf.write(f"{args.dataset}, {args.target_model}, {args.shadow_model}, {args.edge_feature}, {is_scheduled}, {args.optim}, {args.lr}, {args.method}, {train_acc:.3f}, {train_auc:.3f}, {test_acc:.3f}, {test_auc:.3f}, {str(datetime.timedelta(seconds=k))}, {args.seed}\n") + +def main(): + args = arg_parse() + args.model_save_path = './data/save_model/gnn/' + args.data_save_path = './data/' + log_dir = 'output/logs/' + + initialize_seeds(args) + begin = datetime.datetime.now() + + target_train_g, shadow_train_g, n_classes = prepare_graph_data(args) + print_graph_statistics(target_train_g, shadow_train_g, n_classes, args) + + model, train_acc, train_auc, test_acc, test_auc, stat_dict = perform_attack(args, target_train_g, shadow_train_g) + + end = datetime.datetime.now() + k = (end - 
begin).seconds + + save_results(args, stat_dict, log_dir, k, train_acc, train_auc, test_acc, test_auc) + +if __name__ == '__main__': + main() diff --git a/new_code/model/__init__.py b/new_code/model/__init__.py new file mode 100644 index 0000000..2688cfa --- /dev/null +++ b/new_code/model/__init__.py @@ -0,0 +1,2 @@ +from .gnn import * +from .mlp import * diff --git a/new_code/model/gnn.py b/new_code/model/gnn.py new file mode 100644 index 0000000..2bbed33 --- /dev/null +++ b/new_code/model/gnn.py @@ -0,0 +1,402 @@ +import torch as th +th.manual_seed(0) +import torch.nn as nn +import dgl +import dgl.nn.pytorch as dglnn + + +class SAGE(nn.Module): + def __init__(self, + in_feats, + n_hidden, + n_classes, + n_layers, + activation, + batch_size, + num_workers, + dropout): + super().__init__() + self.n_layers = n_layers + self.n_hidden = n_hidden + self.n_classes = n_classes + self.layers = nn.ModuleList() + self.layers.append(dglnn.SAGEConv(in_feats, n_hidden, 'mean')) + for i in range(1, n_layers - 1): + self.layers.append(dglnn.SAGEConv(n_hidden, n_hidden, 'mean')) + self.layers.append(dglnn.SAGEConv(n_hidden, n_classes, 'mean')) + self.dropout = nn.Dropout(dropout) + self.activation = activation + self.batch_size = batch_size + self.num_workers = num_workers + + def forward(self, blocks, x): + h = x + for l, (layer, block) in enumerate(zip(self.layers, blocks)): + h = layer(block, h) + if l != len(self.layers) - 1: + h = self.activation(h) + h = self.dropout(h) + return h + + + def inference(self, g, x, device): + for l, layer in enumerate(self.layers): + y = th.zeros(g.number_of_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes) + + sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1) + dataloader = dgl.dataloading.DataLoader( + g, + th.arange(g.number_of_nodes()), + sampler, + batch_size=self.batch_size, + shuffle=True, + drop_last=False, + num_workers=self.num_workers) + + for input_nodes, output_nodes, blocks in dataloader: + block = blocks[0] + + block = block.int().to(device) + h = x[input_nodes].to(device) + h = layer(block, h) + if l != len(self.layers) - 1: + h = self.activation(h) + h = self.dropout(h) + + y[output_nodes] = h.cpu() + + x = y + return y + + def extract_embedding(self, g, x, device): + for l, layer in enumerate(self.layers): + y = th.zeros(g.number_of_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes) + + sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1) + dataloader = dgl.dataloading.NodeDataLoader( + g, + th.arange(g.number_of_nodes()), + sampler, + batch_size=self.batch_size, + shuffle=True, + drop_last=False, + num_workers=self.num_workers) + + for input_nodes, output_nodes, blocks in dataloader: + block = blocks[0] + + block = block.int().to(device) + h = x[input_nodes].to(device) + h = layer(block, h) + if l != len(self.layers) - 1: + h = self.activation(h) + h = self.dropout(h) + + y[output_nodes] = h.cpu() + x = y + # return the embedding after the first layer; + break + return y + + + +class GCN(nn.Module): + def __init__(self, + in_feats, + n_hidden, + n_classes, + n_layers, + activation, + batch_size, + num_workers, + dropout): + super().__init__() + self.n_layers = n_layers + self.n_hidden = n_hidden + self.n_classes = n_classes + self.layers = nn.ModuleList() + # self.layers.append(dglnn.SAGEConv(in_feats, n_hidden, 'gcn')) + # for i in range(1, n_layers - 1): + # self.layers.append(dglnn.SAGEConv(n_hidden, n_hidden, 'gcn')) + # self.layers.append(dglnn.SAGEConv(n_hidden, n_classes, 'gcn')) + 
self.layers.append(dglnn.GraphConv(in_feats, n_hidden, allow_zero_in_degree=True)) + for i in range(1, n_layers - 1): + self.layers.append(dglnn.GraphConv(n_hidden, n_hidden, allow_zero_in_degree=True)) + self.layers.append(dglnn.GraphConv(n_hidden, n_classes, allow_zero_in_degree=True)) + self.dropout = nn.Dropout(dropout) + self.activation = activation + self.batch_size = batch_size + self.num_workers = num_workers + + def forward(self, blocks, x): + h = x + for l, (layer, block) in enumerate(zip(self.layers, blocks)): + h = layer(block, h) + if l != len(self.layers) - 1: + h = self.activation(h) + h = self.dropout(h) + return h + + def inference(self, g, x, device): + for l, layer in enumerate(self.layers): + y = th.zeros(g.number_of_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes) + + sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1) + dataloader = dgl.dataloading.DataLoader( + g, + th.arange(g.number_of_nodes()), + sampler, + batch_size=self.batch_size, + shuffle=True, + drop_last=False, + num_workers=self.num_workers) + + for input_nodes, output_nodes, blocks in dataloader: + block = blocks[0] + + block = block.int().to(device) + h = x[input_nodes].to(device) + h = layer(block, h) + if l != len(self.layers) - 1: + h = self.activation(h) + h = self.dropout(h) + + y[output_nodes] = h.cpu() + + x = y + return y + + def extract_embedding(self, g, x, device): + for l, layer in enumerate(self.layers): + y = th.zeros(g.number_of_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes) + + sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1) + dataloader = dgl.dataloading.DataLoader( + g, + th.arange(g.number_of_nodes()), + sampler, + batch_size=self.batch_size, + shuffle=True, + drop_last=False, + num_workers=self.num_workers) + + for input_nodes, output_nodes, blocks in dataloader: + block = blocks[0] + + block = block.int().to(device) + h = x[input_nodes].to(device) + h = layer(block, h) + if l != len(self.layers) - 1: + h = self.activation(h) + h = self.dropout(h) + + y[output_nodes] = h.cpu() + x = y + # return the embedding after the first layer; + break + return y + + + +class GAT(nn.Module): + def __init__(self, + in_feats, + n_hidden, + n_classes, + n_layers, + activation, + batch_size, + num_workers, + dropout,num_heads=2): + super().__init__() + self.n_layers = n_layers + self.n_hidden = n_hidden + self.n_classes = n_classes + self.num_heads = num_heads + self.layers = nn.ModuleList() + self.layers.append(dglnn.GATConv(in_feats, n_hidden, num_heads, allow_zero_in_degree=True)) + for i in range(1, n_layers - 1): + self.layers.append(dglnn.GATConv(n_hidden*num_heads, n_hidden, num_heads, allow_zero_in_degree=True)) + self.layers.append(dglnn.GATConv(n_hidden*num_heads, n_classes, 1, allow_zero_in_degree=True)) + self.dropout = nn.Dropout(dropout) + self.activation = activation + self.batch_size = batch_size + self.num_workers = num_workers + + def forward(self, blocks, x): + h = x + for l, (layer, block) in enumerate(zip(self.layers, blocks)): + h = layer(block, h).flatten(1) + if l != len(self.layers) - 1: + h = self.activation(h) + h = self.dropout(h) + logits = h + return logits + + def inference(self, g, x, device): + """ + Inference with the GraphSAGE model on full neighbors (i.e. without neighbor sampling). + g : the entire graph. + x : the input of entire node set. + + The inference code is written in a fashion that it could handle any number of nodes and + layers. 
+ """ + # During inference with sampling, multi-layer blocks are very inefficient because + # lots of computations in the first few layers are repeated. + # Therefore, we compute the representation of all nodes layer by layer. The nodes + # on each layer are of course splitted in batches. + # TODO: can we standardize this? + for l, layer in enumerate(self.layers): + y = th.zeros(g.number_of_nodes(), self.n_hidden*self.num_heads if l != len(self.layers) - 1 else self.n_classes) + + sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1) + dataloader = dgl.dataloading.DataLoader( + g, + th.arange(g.number_of_nodes()), + sampler, + batch_size=self.batch_size, + shuffle=True, + drop_last=False, + num_workers=self.num_workers) + + for input_nodes, output_nodes, blocks in dataloader: + block = blocks[0] + + block = block.int().to(device) + h = x[input_nodes].to(device) + h = layer(block, h).flatten(1) + if l != len(self.layers) - 1: + h = self.activation(h) + h = self.dropout(h) + + y[output_nodes] = h.cpu() + + x = y + return y + + def extract_embedding(self, g, x, device): + for l, layer in enumerate(self.layers): + y = th.zeros(g.number_of_nodes(), self.n_hidden*self.num_heads if l != len(self.layers) - 1 else self.n_classes) + + sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1) + dataloader = dgl.dataloading.DataLoader( + g, + th.arange(g.number_of_nodes()), + sampler, + batch_size=self.batch_size, + shuffle=True, + drop_last=False, + num_workers=self.num_workers) + + for input_nodes, output_nodes, blocks in dataloader: + block = blocks[0] + + block = block.int().to(device) + h = x[input_nodes].to(device) + h = layer(block, h).flatten(1) + if l != len(self.layers) - 1: + h = self.activation(h) + h = self.dropout(h) + + y[output_nodes] = h.cpu() + x = y + # return the embedding after the first layer; + break + return y + +class GIN(nn.Module): + def __init__(self, + in_feats, + n_hidden, + n_classes, + n_layers, + activation, + batch_size, + num_workers, + dropout): + super().__init__() + self.n_layers = 2 + self.n_hidden = n_hidden + self.n_classes = n_classes + self.layers = nn.ModuleList() + linear = nn.Linear(in_feats, n_hidden) + self.layers.append(dglnn.GINConv(linear, 'sum')) + for i in range(1, n_layers-1): + linear = nn.Linear(n_hidden, n_hidden) + self.layers.append(dglnn.GINConv(linear, 'sum')) + linear = nn.Linear(n_hidden, n_classes) + self.layers.append(dglnn.GINConv(linear, 'mean')) + + self.dropout = nn.Dropout(dropout) + self.activation = activation + self.batch_size = batch_size + self.num_workers = num_workers + + def forward(self, blocks, x): + h = x + for l, (layer, block) in enumerate(zip(self.layers, blocks)): + h = layer(block, h) + if l != len(self.layers) - 1: + h = self.activation(h) + h = self.dropout(h) + return h + + def inference(self, g, x, device): + for l, layer in enumerate(self.layers): + y = th.zeros(g.number_of_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes) + + sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1) + dataloader = dgl.dataloading.DataLoader( + g, + th.arange(g.number_of_nodes()), + sampler, + batch_size=self.batch_size, + shuffle=True, + drop_last=False, + num_workers=self.num_workers) + + for input_nodes, output_nodes, blocks in dataloader: + block = blocks[0] + + block = block.int().to(device) + h = x[input_nodes].to(device) + h = layer(block, h) + if l != len(self.layers) - 1: + h = self.activation(h) + h = self.dropout(h) + + y[output_nodes] = h.cpu() + + x = y + return y + + def 
extract_embedding(self, g, x, device): + for l, layer in enumerate(self.layers): + y = th.zeros(g.number_of_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes) + + sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1) + dataloader = dgl.dataloading.DataLoader( + g, + th.arange(g.number_of_nodes()), + sampler, + batch_size=self.batch_size, + shuffle=True, + drop_last=False, + num_workers=self.num_workers) + + for input_nodes, output_nodes, blocks in dataloader: + block = blocks[0] + + block = block.int().to(device) + h = x[input_nodes].to(device) + h = layer(block, h) + if l != len(self.layers) - 1: + h = self.activation(h) + h = self.dropout(h) + + y[output_nodes] = h.cpu() + x = y + # return the embedding after the first layer; + break + return y \ No newline at end of file diff --git a/new_code/model/mlp.py b/new_code/model/mlp.py new file mode 100644 index 0000000..63426d1 --- /dev/null +++ b/new_code/model/mlp.py @@ -0,0 +1,80 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class MLP_Base(nn.Module): + def __init__(self, input_dims, layer_sizes, output_size): + super(MLP_Base, self).__init__() + self.input_dims = input_dims + self.layers = nn.ModuleList() + + # Create a list of fully connected layers for each input dimension + for i, dim in enumerate(input_dims): + self.layers.append(self.create_fc_layers(dim, layer_sizes[i])) + + self.fc_out = nn.Linear(sum(layer_sizes), output_size) # Output layer + + def create_fc_layers(self, input_dim, layer_sizes): + layers = [] + prev_size = input_dim + for size in layer_sizes: + layers.append(nn.Linear(prev_size, size)) + layers.append(nn.ReLU()) + prev_size = size + return nn.Sequential(*layers) + + def forward(self, *inputs): + assert len(inputs) == len(self.input_dims), "Number of inputs should match input dimensions" + + outputs = [] + for i, x in enumerate(inputs): + x = x.view(-1, self.input_dims[i]) + x = self.layers[i](x) + outputs.append(x) + + combined = torch.cat(outputs, dim=1) + out = self.fc_out(combined) + return out + + +class MLP_ATTACK(MLP_Base): + def __init__(self, dim_in): + # Use the base class with the given dimension + super(MLP_ATTACK, self).__init__(input_dims=[dim_in], layer_sizes=[[128, 32]], output_size=2) + + +class MLP_ATTACK_PLUS(MLP_Base): + def __init__(self, dim_in_1, dim_in_2): + # Use the base class with two input dimensions + super(MLP_ATTACK_PLUS, self).__init__(input_dims=[dim_in_1, dim_in_2], + layer_sizes=[[128, 64, 16], [64, 16]], + output_size=2) + + +class MLP_ATTACK_PLUS2(MLP_Base): + def __init__(self, dim_in_1, dim_in_2): + # Use the base class with two input dimensions + super(MLP_ATTACK_PLUS2, self).__init__(input_dims=[dim_in_1, dim_in_2], + layer_sizes=[[16, 4], [128, 64, 16]], + output_size=2) + + +class MLP_ATTACK_ALL(MLP_Base): + def __init__(self, dim_in_1, dim_in_2, dim_in_3): + # Use the base class with three input dimensions + super(MLP_ATTACK_ALL, self).__init__(input_dims=[dim_in_1, dim_in_2, dim_in_3], + layer_sizes=[[128, 64, 16], [128, 64, 16], [4]], + output_size=2) + + +class MLP_Target(nn.Module): + def __init__(self, dim_in, dim_out): + super(MLP_Target, self).__init__() + self.fc1 = nn.Linear(dim_in, 32) + self.fc2 = nn.Linear(32, dim_out) + + def forward(self, x): + x = x.view(-1, self.fc1.in_features) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + return x diff --git a/new_code/train.py b/new_code/train.py new file mode 100644 index 0000000..1d58e1f --- /dev/null +++ b/new_code/train.py @@ -0,0 +1,92 @@ +import 
torch as th +import argparse +import os + +from load_data.load_graph import load_dataset, split_target_shadow, split_target_shadow_by_prop, split_train_test +from attack.target import run_gnn + +th.set_num_threads(1) + + +def arg_parse(): + argparser = argparse.ArgumentParser("multi-gpu training") + argparser.add_argument('--gpu', type=int, default=0, + help="GPU device ID. Use -1 for CPU training") + argparser.add_argument('--dataset', type=str, default='Cora') + argparser.add_argument('--num_epochs', type=int, default=200) + argparser.add_argument('--n_hidden', type=int, default=128) + argparser.add_argument('--gnn_layers', type=int, default=2) + argparser.add_argument('--batch_size', type=int, default=1000) + argparser.add_argument('--lr', type=float, default=0.001) + argparser.add_argument('--dropout', type=float, default=0.5) + argparser.add_argument('--log-every', type=int, default=20) + argparser.add_argument('--eval-every', type=int, default=5) + argparser.add_argument('--model', type=str, default='graphsage') + argparser.add_argument('--mode', type=str, default='target') + argparser.add_argument('--fan-out', type=str, default='10,25') + argparser.add_argument('--num_workers', type=int, default=4, + help="Number of sampling processes. Use 0 for no extra process.") + argparser.add_argument('--model_save_path', type=str, default='../data/save_model/gnn/') + argparser.add_argument('--attack_model_save_path', type=str, default='../data/save_model/mlp/') + argparser.add_argument('--load_trained', type=str, default='no') + argparser.add_argument('--dp', action='store_true') + argparser.add_argument('--epsilon', type=int, default=8) + argparser.add_argument('--delta', type=float, default=1e-5) + argparser.add_argument('--noise_seed', type=int, default=42) + argparser.add_argument('--noise_type', type=str, default='laplace') + argparser.add_argument('--perturb_type', type=str, default='continuous') + argparser.add_argument('--prop', type=int, + help="use a specified propotion of the shadow dataset") + argparser.add_argument("--seed", type=int, default=0, help="seed",) + args = argparser.parse_args() + + # if args.gpu >= 0: + # args.device = th.device('cuda:%d' % args.gpu) + # else: + args.device = th.device('cpu') + + return args + + +if __name__ == '__main__': + + args = arg_parse() + args.model_save_path = f'./data/save_model/gnn/' + args.data_save_path = f'./data/' + log_dir = 'output/logs/' + os.makedirs(args.model_save_path, exist_ok=True) + os.makedirs(log_dir, exist_ok=True) + g, n_classes = load_dataset(args.dataset) + + in_feats = g.ndata['features'].shape[1] + args.in_feats = in_feats + args.n_classes = n_classes + args.setting = 'inductive' + + if args.prop: + target_g, shadow_g = split_target_shadow_by_prop(args, g) + else: + target_g, shadow_g = split_target_shadow(g) + + + if args.mode == 'target': + target_train_g, target_test_g = split_train_test(target_g) + + target_train_g.create_formats_() + target_test_g.create_formats_() + + run_data = target_train_g, target_test_g + + elif args.mode == 'shadow': + shadow_train_g, shadow_test_g = split_train_test(shadow_g) + + shadow_train_g.create_formats_() + shadow_test_g.create_formats_() + + run_data = shadow_train_g, shadow_test_g + + train_acc, test_acc = run_gnn(args, run_data) + prop = args.prop if args.prop else 100 + with open(os.path.join(log_dir, "target_preformance.txt"), "a") as wf: + wf.write("%s, %s, %s, %d, %.3f, %.3f, %.3d\n" % (args.dataset, args.model, args.mode, prop, train_acc, test_acc, args.seed)) + diff --git 
a/new_code/utils/__init__.py b/new_code/utils/__init__.py new file mode 100644 index 0000000..2fb483b --- /dev/null +++ b/new_code/utils/__init__.py @@ -0,0 +1,4 @@ +from .graph_features import * +from .load_model import * +from .model_evaluation import * +from .model_query import * \ No newline at end of file diff --git a/new_code/utils/graph_features.py b/new_code/utils/graph_features.py new file mode 100644 index 0000000..d3a3ad0 --- /dev/null +++ b/new_code/utils/graph_features.py @@ -0,0 +1,90 @@ +import networkx as nx +import dgl + + +def compute_jaccard_coefficient(nx_g, pairs): + """ + Computes the Jaccard coefficient for each pair of nodes in the graph. + + Args: + nx_g (networkx.Graph): The graph object. + pairs (list of tuple): List of node pairs to compute Jaccard coefficient. + + Returns: + dict: A dictionary where the keys are node pairs and the values are their Jaccard coefficients. + """ + jaccard_dict = {} + for u, v, p in nx.jaccard_coefficient(nx_g, pairs): + jaccard_dict[(u, v)] = round(p, 3) + return jaccard_dict + + +def compute_preferential_attachment(nx_g, pairs): + """ + Computes the preferential attachment for each pair of nodes in the graph. + + Args: + nx_g (networkx.Graph): The graph object. + pairs (list of tuple): List of node pairs to compute preferential attachment. + + Returns: + dict: A dictionary where the keys are node pairs and the values are their preferential attachment scores. + """ + attach_dict = {} + for u, v, p in nx.preferential_attachment(nx_g, pairs): + attach_dict[(u, v)] = round(p, 3) + return attach_dict + + +def compute_common_neighbors(nx_g, pairs): + """ + Computes the number of common neighbors for each pair of nodes in the graph. + + Args: + nx_g (networkx.Graph): The graph object. + pairs (list of tuple): List of node pairs to compute common neighbors. + + Returns: + dict: A dictionary where the keys are node pairs and the values are the number of common neighbors. + """ + neighbors_dict = {} + for start_id, end_id in pairs: + neighbors_dict[(start_id, end_id)] = len(list(nx.common_neighbors(nx_g, start_id, end_id))) + return neighbors_dict + + +def generate_graph_features(g, pairs, k=1, label=1): + """ + Generate graph-based features for a given set of node pairs. + + Args: + g (DGLGraph): The input DGL graph. + pairs (list of tuple): List of node pairs for which the features are to be generated. + k (int): The radius for the ego graph. + label (int): The label of the edge to be removed for the subgraph when label = 1. + + Returns: + tuple: A tuple containing three dictionaries for Jaccard, Preferential Attachment, and Common Neighbors. 
+ """ + nx_g = nx.Graph(dgl.to_networkx(g, node_attrs=["features"])) + + jaccard_dict = {} + attach_dict = {} + neighbors_dict = {} + + for pair in pairs: + start_subgraph_nodes = list(nx.ego.ego_graph(nx_g, n=pair[0], radius=k).nodes()) + end_subgraph_nodes = list(nx.ego.ego_graph(nx_g, n=pair[1], radius=k).nodes()) + subgraph_nodes = start_subgraph_nodes + end_subgraph_nodes + subgraph = nx_g.subgraph(subgraph_nodes).copy() + + if label == 1: + subgraph.remove_edge(pair[0], pair[1]) + + # Compute features for the subgraph + jaccard_dict[pair] = round(next(nx.jaccard_coefficient(subgraph, [(pair[0], pair[1])]))[2], 3) + attach_dict[pair] = round(next(nx.preferential_attachment(subgraph, [(pair[0], pair[1])]))[2], 3) + neighbors_dict[pair] = len(list(nx.common_neighbors(subgraph, pair[0], pair[1]))) + + print("Finished generating graph features.") + return jaccard_dict, attach_dict, neighbors_dict diff --git a/new_code/utils/load_model.py b/new_code/utils/load_model.py new file mode 100644 index 0000000..e79a7dc --- /dev/null +++ b/new_code/utils/load_model.py @@ -0,0 +1,54 @@ +import torch as th +import torch.nn.functional as F +from model.gnn import SAGE, GAT, GIN, GCN + +def get_gnn_model(config): + """ + Returns a GNN model based on the given configuration. + + Args: + config: The configuration object containing model parameters. + + Returns: + model: The GNN model (SAGE, GAT, GIN, or GCN). + """ + model_map = { + 'graphsage': SAGE, + 'gat': GAT, + 'gin': GIN, + 'gcn': GCN + } + + model_class = model_map.get(config.model.lower()) + + if model_class is None: + raise ValueError(f"Unsupported model type: {config.model}") + + return model_class( + config.in_feats, + config.n_hidden, + config.n_classes, + config.gnn_layers, + F.relu, + config.batch_size, + config.num_workers, + config.dropout + ) + + +def load_model(model, model_path, device): + """ + Loads a trained model from the given file path. + + Args: + model: The model to be loaded. + model_path (str): The path to the saved model file. + device: The device to load the model onto (CPU or GPU). + + Returns: + model: The model with loaded weights. + """ + print(f"Loading model from: {model_path}") + state_dict = th.load(model_path, map_location=device) + model.load_state_dict(state_dict) + return model diff --git a/new_code/utils/model_evaluation.py b/new_code/utils/model_evaluation.py new file mode 100644 index 0000000..c71bd25 --- /dev/null +++ b/new_code/utils/model_evaluation.py @@ -0,0 +1,45 @@ +import torch as th +from scipy.special import softmax + + +def compute_accuracy(predictions, labels): + """ + Compute the accuracy of predictions given the labels. + + Args: + predictions (torch.Tensor): The predicted values. + labels (torch.Tensor): The true labels. + + Returns: + float: The accuracy of the predictions. + """ + labels = labels.long() + correct_predictions = (th.argmax(predictions, dim=1) == labels).float() + accuracy = correct_predictions.sum() / len(predictions) + return accuracy + + +def evaluate_model(model, graph, inputs, labels, validation_node_ids, device): + """ + Evaluate the model on the validation set. + + Args: + model: The model to be evaluated. + graph (DGLGraph): The entire graph. + inputs (torch.Tensor): The features of all the nodes. + labels (torch.Tensor): The labels of all the nodes. + validation_node_ids (list): List of node IDs for validation. + device: The device to run the evaluation on (CPU or GPU). + + Returns: + tuple: The accuracy of the model and the softmax probabilities of the predictions. 
+ """ + model.eval() # Set the model to evaluation mode + with th.no_grad(): # Disable gradient computation + predictions = model.inference(graph, inputs, device) + + model.train() # Switch back to training mode + accuracy = compute_accuracy(predictions[validation_node_ids], labels[validation_node_ids]) + softmax_probs = softmax(predictions[validation_node_ids].detach().cpu().numpy(), axis=1) + + return accuracy, softmax_probs diff --git a/new_code/utils/model_query.py b/new_code/utils/model_query.py new file mode 100644 index 0000000..d585d87 --- /dev/null +++ b/new_code/utils/model_query.py @@ -0,0 +1,43 @@ +import os +import torch as th +from utils.load_model import load_model, get_gnn_model + +def query_trained_model(config, train_index, graph, mode): + """ + Query a trained model using the 0-hop training graph nodes. + + Args: + config: The configuration object containing model and dataset parameters. + train_index (list): List of training node indices. + graph (DGLGraph): The graph used for inference. + mode (str): The mode in which the model is used ('target' or 'shadow'). + + Returns: + dict: A dictionary with node indices as keys and model predictions as values. + """ + # Set the features and classes according to the mode ('target' or 'shadow') + if config.diff: + config.in_feats = config.target_in_feats if mode == 'target' else config.shadow_in_feats + config.n_classes = config.target_classes if mode == 'target' else config.shadow_classes + + # Set the model according to the mode + config.model = config.target_model if mode == 'target' else config.shadow_model + # config.device = th.device('cpu') # Set device to CPU explicitly + config.device = th.device('cuda' if th.cuda.is_available() else 'cpu') + + model = get_gnn_model(config).to(config.device) + + model_save_path = os.path.join(config.model_save_path, f'{config.setting}_{config.dataset}_{config.model}_{mode}.pth') + print(f"Loading {mode} model from: {model_save_path}") + model = load_model(model, model_save_path, config.device) + + # Query the trained model + model.eval() + with th.no_grad(): + predictions = model.inference(graph, graph.ndata['features'], config.device) + + # Store the predictions for the training nodes + result_dict = {train_index[i]: predictions[train_index[i]] for i in range(len(train_index))} + + print(f"Finished querying {mode} model!") + return result_dict diff --git a/pygip/protect/__init__.py b/pygip/protect/__init__.py index 733ba09..1846720 100644 --- a/pygip/protect/__init__.py +++ b/pygip/protect/__init__.py @@ -1 +1 @@ -from .gnn_mea import * +from .attack import * diff --git a/pygip/protect/attack.py b/pygip/protect/attack.py index 0abaead..e67f4f1 100644 --- a/pygip/protect/attack.py +++ b/pygip/protect/attack.py @@ -372,34 +372,170 @@ def attack(self): print(best_performance_metrics) -class ModelExtractionAttack1(ModelExtractionAttack): +# class ModelExtractionAttack1(ModelExtractionAttack): + +# def __init__(self, dataset, attack_node_fraction, selected_node_file, query_label_file, shadow_graph_file=None): +# super().__init__(dataset, attack_node_fraction) +# self.attack_node_number = 700 +# self.selected_node_file = selected_node_file +# self.query_label_file = query_label_file +# self.shadow_graph_file = shadow_graph_file + +# def attack(self): + +# # read the selected node file +# selected_node_file = open(self.selected_node_file, "r") +# lines1 = selected_node_file.readlines() +# attack_nodes = [] +# for line_1 in lines1: +# attack_nodes.append(int(line_1)) +# 
selected_node_file.close() + +# # find the testing node +# testing_nodes = [] +# for i in range(self.node_number): +# if i not in attack_nodes: +# testing_nodes.append(i) + +# attack_features = self.features[attack_nodes] + +# # mark the test/train split. +# for i in range(self.node_number): +# if i in attack_nodes: +# self.test_mask[i] = 0 +# self.train_mask[i] = 1 +# else: +# self.test_mask[i] = 1 +# self.train_mask[i] = 0 + +# sub_test_mask = self.test_mask + +# # get their labels +# query_label_file = open(self.query_label_file, "r") +# lines2 = query_label_file.readlines() +# all_query_labels = [] +# attack_query = [] +# for line_2 in lines2: +# all_query_labels.append(int(line_2.split()[1])) +# if int(line_2.split()[0]) in attack_nodes: +# attack_query.append(int(line_2.split()[1])) +# query_label_file.close() + +# attack_query = torch.LongTensor(attack_query) +# all_query_labels = torch.LongTensor(all_query_labels) + +# # build shadow graph +# shadow_graph_file = open(self.shadow_graph_file, "r") +# lines3 = shadow_graph_file.readlines() +# adj_matrix = np.zeros( +# (self.attack_node_number, self.attack_node_number)) +# for line_3 in lines3: +# list_line = line_3.split() +# adj_matrix[int(list_line[0])][int(list_line[1])] = 1 +# adj_matrix[int(list_line[1])][int(list_line[0])] = 1 +# shadow_graph_file.close() + +# g_shadow = np.asmatrix(adj_matrix) +# sub_g = nx.from_numpy_array(g_shadow) + +# # add self loop +# sub_g.remove_edges_from(nx.selfloop_edges(sub_g)) +# sub_g.add_edges_from(zip(sub_g.nodes(), sub_g.nodes())) +# sub_g = DGLGraph(sub_g) +# n_edges = sub_g.number_of_edges() + +# # normalization +# degs = sub_g.in_degrees().float() +# norm = torch.pow(degs, -0.5) +# norm[torch.isinf(norm)] = 0 +# sub_g.ndata['norm'] = norm.unsqueeze(1) + +# # build GCN + +# # todo check this +# # g = DGLGraph(data.graph) +# # g_numpy = nx.to_numpy_array(data.graph) +# sub_g_b = nx.from_numpy_array( +# np.asmatrix(self.graph.adjacency_matrix().to_dense())) + +# # graph preprocess and calculate normalization factor +# # sub_g_b = nx.from_numpy_array(sub_g_b) +# # add self loop + +# sub_g_b.remove_edges_from(nx.selfloop_edges(sub_g_b)) +# sub_g_b.add_edges_from(zip(sub_g_b.nodes(), sub_g_b.nodes())) + +# sub_g_b = DGLGraph(sub_g_b) +# n_edges = sub_g_b.number_of_edges() +# # normalization +# degs = sub_g_b.in_degrees().float() +# norm = torch.pow(degs, -0.5) +# norm[torch.isinf(norm)] = 0 + +# sub_g_b.ndata['norm'] = norm.unsqueeze(1) + +# # Train the DNN +# net = Net_shadow(self.feature_number, self.label_number) +# print(net) + +# # +# optimizer = torch.optim.Adam( +# net.parameters(), lr=1e-2, weight_decay=5e-4) + +# dur = [] + +# best_performance_metrics = GraphNeuralNetworkMetric() + +# print("===================Model Extracting================================") + +# for epoch in tqdm(range(200)): +# if epoch >= 3: +# t0 = time.time() + +# net.train() +# logits = net(sub_g, attack_features) +# logp = torch.nn.functional.log_softmax(logits, dim=1) +# loss = torch.nn.functional.nll_loss(logp, attack_query) + +# # weights = [1/num_0, 1/num_1, 1/num_2, 1/num_3, 1/num_4, 1/num_5, 1/num_6] +# # class_weights = th.FloatTensor(weights) +# # ============================================================================= +# # criterion = torch.nn.CrossEntropyLoss() +# # loss = criterion(logp, attack_query) +# # ============================================================================= + +# optimizer.zero_grad() +# loss.backward() +# optimizer.step() + +# if epoch >= 3: +# dur.append(time.time() - 
t0) + +# focus_gnn_metrics = GraphNeuralNetworkMetric( +# 0, 0, net, sub_g_b, self.features, self.test_mask, all_query_labels, self.labels) +# focus_gnn_metrics.evaluate() + +# best_performance_metrics.fidelity = max( +# best_performance_metrics.fidelity, focus_gnn_metrics.fidelity) +# best_performance_metrics.accuracy = max( +# best_performance_metrics.accuracy, focus_gnn_metrics.accuracy) + +# print(best_performance_metrics) - def __init__(self, dataset, attack_node_fraction, selected_node_file, query_label_file, shadow_graph_file=None): +class ModelExtractionAttack1(ModelExtractionAttack): + def __init__(self, dataset, attack_node_fraction, selected_nodes, query_labels, shadow_graph): super().__init__(dataset, attack_node_fraction) self.attack_node_number = 700 - self.selected_node_file = selected_node_file - self.query_label_file = query_label_file - self.shadow_graph_file = shadow_graph_file + self.selected_nodes = selected_nodes + self.query_labels = query_labels + self.shadow_graph = shadow_graph def attack(self): - - # read the selected node file - selected_node_file = open(self.selected_node_file, "r") - lines1 = selected_node_file.readlines() - attack_nodes = [] - for line_1 in lines1: - attack_nodes.append(int(line_1)) - selected_node_file.close() - - # find the testing node - testing_nodes = [] - for i in range(self.node_number): - if i not in attack_nodes: - testing_nodes.append(i) - + attack_nodes = self.selected_nodes + testing_nodes = [i for i in range(self.node_number) if i not in attack_nodes] attack_features = self.features[attack_nodes] - # mark the test/train split. + # mark the test/train split for i in range(self.node_number): if i in attack_nodes: self.test_mask[i] = 0 @@ -409,117 +545,48 @@ def attack(self): self.train_mask[i] = 0 sub_test_mask = self.test_mask - - # get their labels - query_label_file = open(self.query_label_file, "r") - lines2 = query_label_file.readlines() - all_query_labels = [] - attack_query = [] - for line_2 in lines2: - all_query_labels.append(int(line_2.split()[1])) - if int(line_2.split()[0]) in attack_nodes: - attack_query.append(int(line_2.split()[1])) - query_label_file.close() - - attack_query = torch.LongTensor(attack_query) - all_query_labels = torch.LongTensor(all_query_labels) + attack_query = torch.LongTensor([self.query_labels[node] for node in attack_nodes]) + all_query_labels = torch.LongTensor(self.query_labels) # build shadow graph - shadow_graph_file = open(self.shadow_graph_file, "r") - lines3 = shadow_graph_file.readlines() - adj_matrix = np.zeros( - (self.attack_node_number, self.attack_node_number)) - for line_3 in lines3: - list_line = line_3.split() - adj_matrix[int(list_line[0])][int(list_line[1])] = 1 - adj_matrix[int(list_line[1])][int(list_line[0])] = 1 - shadow_graph_file.close() + adj_matrix = np.zeros((self.attack_node_number, self.attack_node_number)) + for edge in self.shadow_graph: + adj_matrix[edge[0]][edge[1]] = 1 + adj_matrix[edge[1]][edge[0]] = 1 g_shadow = np.asmatrix(adj_matrix) sub_g = nx.from_numpy_array(g_shadow) - - # add self loop sub_g.remove_edges_from(nx.selfloop_edges(sub_g)) sub_g.add_edges_from(zip(sub_g.nodes(), sub_g.nodes())) sub_g = DGLGraph(sub_g) n_edges = sub_g.number_of_edges() - - # normalization degs = sub_g.in_degrees().float() norm = torch.pow(degs, -0.5) norm[torch.isinf(norm)] = 0 sub_g.ndata['norm'] = norm.unsqueeze(1) - # build GCN - - # todo check this - # g = DGLGraph(data.graph) - # g_numpy = nx.to_numpy_array(data.graph) - sub_g_b = nx.from_numpy_array( - 
np.asmatrix(self.graph.adjacency_matrix().to_dense()))
-
-        # graph preprocess and calculate normalization factor
-        # sub_g_b = nx.from_numpy_array(sub_g_b)
-        # add self loop
-
-        sub_g_b.remove_edges_from(nx.selfloop_edges(sub_g_b))
-        sub_g_b.add_edges_from(zip(sub_g_b.nodes(), sub_g_b.nodes()))
-
-        sub_g_b = DGLGraph(sub_g_b)
-        n_edges = sub_g_b.number_of_edges()
-        # normalization
-        degs = sub_g_b.in_degrees().float()
-        norm = torch.pow(degs, -0.5)
-        norm[torch.isinf(norm)] = 0
-
-        sub_g_b.ndata['norm'] = norm.unsqueeze(1)
-
+        # rebuild the original graph (self-loops + symmetric normalization) so sub_g_b
+        # is still defined for the fidelity/accuracy evaluation inside the training loop
+        sub_g_b = nx.from_numpy_array(np.asmatrix(self.graph.adjacency_matrix().to_dense()))
+        sub_g_b.remove_edges_from(nx.selfloop_edges(sub_g_b))
+        sub_g_b.add_edges_from(zip(sub_g_b.nodes(), sub_g_b.nodes()))
+        sub_g_b = DGLGraph(sub_g_b)
+        degs = sub_g_b.in_degrees().float()
+        norm = torch.pow(degs, -0.5)
+        norm[torch.isinf(norm)] = 0
+        sub_g_b.ndata['norm'] = norm.unsqueeze(1)
        # Train the DNN
        net = Net_shadow(self.feature_number, self.label_number)
-        print(net)
-
-        #
-        optimizer = torch.optim.Adam(
-            net.parameters(), lr=1e-2, weight_decay=5e-4)
-
+        optimizer = torch.optim.Adam(net.parameters(), lr=1e-2, weight_decay=5e-4)
        dur = []
-
        best_performance_metrics = GraphNeuralNetworkMetric()
-
        print("===================Model Extracting================================")
-
        for epoch in tqdm(range(200)):
            if epoch >= 3:
                t0 = time.time()
-
            net.train()
            logits = net(sub_g, attack_features)
            logp = torch.nn.functional.log_softmax(logits, dim=1)
            loss = torch.nn.functional.nll_loss(logp, attack_query)
-
-            # weights = [1/num_0, 1/num_1, 1/num_2, 1/num_3, 1/num_4, 1/num_5, 1/num_6]
-            # class_weights = th.FloatTensor(weights)
-            # =============================================================================
-            #         criterion = torch.nn.CrossEntropyLoss()
-            #         loss = criterion(logp, attack_query)
-            # =============================================================================
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
-
            if epoch >= 3:
                dur.append(time.time() - t0)
-
-            focus_gnn_metrics = GraphNeuralNetworkMetric(
-                0, 0, net, sub_g_b, self.features, self.test_mask, all_query_labels, self.labels)
+            focus_gnn_metrics = GraphNeuralNetworkMetric(0, 0, net, sub_g_b, self.features, self.test_mask, all_query_labels, self.labels)
            focus_gnn_metrics.evaluate()
-
-            best_performance_metrics.fidelity = max(
-                best_performance_metrics.fidelity, focus_gnn_metrics.fidelity)
-            best_performance_metrics.accuracy = max(
-                best_performance_metrics.accuracy, focus_gnn_metrics.accuracy)
-
+            best_performance_metrics.fidelity = max(best_performance_metrics.fidelity, focus_gnn_metrics.fidelity)
+            best_performance_metrics.accuracy = max(best_performance_metrics.accuracy, focus_gnn_metrics.accuracy)
        print(best_performance_metrics)
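The refactored ModelExtractionAttack1 above now expects in-memory Python objects (selected_nodes, query_labels, shadow_graph) instead of opening the three text files itself. The sketch below is one possible way a caller could adapt: it assumes the old file layouts visible in the commented-out code (one attacked node id per line, "node_id label" per line for every node, one undirected edge "u v" per line); the file names and the dataset argument are placeholders, not part of this change.

from pygip.protect import ModelExtractionAttack1


def load_attack_inputs(selected_node_path, query_label_path, shadow_graph_path):
    # one attacked node id per line (hypothetical file name supplied by the caller)
    with open(selected_node_path) as f:
        selected_nodes = [int(line) for line in f if line.strip()]

    # "node_id label" per line for every node; build a dense list indexed by node id
    with open(query_label_path) as f:
        label_by_node = {int(n): int(lbl) for n, lbl in (line.split() for line in f if line.strip())}
    query_labels = [label_by_node[i] for i in range(len(label_by_node))]

    # "u v" per line, undirected edges over the attacked/shadow node indices
    with open(shadow_graph_path) as f:
        shadow_graph = [tuple(map(int, line.split())) for line in f if line.strip()]

    return selected_nodes, query_labels, shadow_graph


def run_extraction(dataset):
    # `dataset` is whatever graph dataset object the surrounding code already constructs
    selected_nodes, query_labels, shadow_graph = load_attack_inputs(
        "selected_nodes.txt", "query_labels.txt", "shadow_graph_edges.txt")
    attack = ModelExtractionAttack1(dataset, 0.25, selected_nodes, query_labels, shadow_graph)
    attack.attack()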
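The three pairwise helpers added in new_code/utils/graph_features.py each return a {(u, v): score} dictionary computed over the full graph. A quick sanity check on a toy graph, assuming new_code/ is on PYTHONPATH so that `utils` resolves to these new modules:

import networkx as nx

from utils.graph_features import (
    compute_jaccard_coefficient,
    compute_preferential_attachment,
    compute_common_neighbors,
)

# toy graph: a triangle 0-1-2 with a pendant node 3 attached to node 2
toy = nx.Graph()
toy.add_edges_from([(0, 1), (1, 2), (2, 0), (2, 3)])
pairs = [(0, 3), (1, 3)]

print(compute_jaccard_coefficient(toy, pairs))      # {(0, 3): 0.5, (1, 3): 0.5}
print(compute_preferential_attachment(toy, pairs))  # {(0, 3): 2, (1, 3): 2}
print(compute_common_neighbors(toy, pairs))         # {(0, 3): 1, (1, 3): 1}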
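get_gnn_model() in new_code/utils/load_model.py only reads plain attributes off its config argument, so a lightweight namespace works as well as the argparse result. A minimal sketch with illustrative values (Cora's 1433 input features and 7 classes); whether instantiation succeeds depends on the SAGE/GAT/GIN/GCN constructors in model.gnn, which are not shown here, and the commented reload path simply mirrors the naming used in model_query.py:

from types import SimpleNamespace

import torch as th

from utils.load_model import get_gnn_model, load_model

config = SimpleNamespace(
    model='graphsage',   # one of: graphsage, gat, gin, gcn
    in_feats=1433,       # feature width, e.g. g.ndata['features'].shape[1]
    n_hidden=128,
    n_classes=7,
    gnn_layers=2,
    batch_size=1000,
    num_workers=0,
    dropout=0.5,
)

model = get_gnn_model(config)
print(model)

# to reuse weights saved by the target/shadow training script (path pattern from model_query.py):
# model = load_model(model, './data/save_model/gnn/inductive_Cora_graphsage_target.pth', th.device('cpu'))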