|
'''
Heterogeneous version (distinct from the paper's reference implementation) of
"How to Turn Your Knowledge Graph Embeddings into Generative Models"
(https://arxiv.org/pdf/2305.15944).
'''
| 5 | +import numpy as np |
| 6 | +import torch |
| 7 | +import torch_geometric as pyg |
| 8 | +import time |
| 9 | +import sys |
| 10 | + |
| 11 | + |
class ComplEx2(torch.nn.Module):
    '''
    Heterogeneous squared-ComplEx ("ComplEx^2") generative knowledge-graph
    embedding model.

    Each node type gets its own head/tail real+imaginary embedding tables, and
    each edge type (relation) gets one shared real+imaginary relation
    embedding.  The squared ComplEx score is treated as an unnormalized
    probability mass over (head, relation, tail) triples; `forward` returns
    per-triple log-probabilities normalized by an exactly computed partition
    function per edge type.

    Parameters
    ----------
    data : mapping with keys 'edge_reltype', 'num_nodes_dict' and
        'edge_index_dict' (looks like an OGB-style hetero-graph dict —
        TODO confirm against the caller).
    hidden_channels : int
        Embedding dimensionality of each real/imaginary part.
    scale_grad_by_freq : bool
        Forwarded to every torch.nn.Embedding.
    dtype : torch.dtype
        Dtype of all embedding tables.
    dropout : float
        Drop probability for the shared embedding-weight dropout masks
        (see set_dropout_masks); 0 disables them.
    '''

    def __init__(self, data, hidden_channels=512, scale_grad_by_freq=False, dtype=torch.float32, dropout=0.):

        super().__init__()

        self.data = data
        # Map integer relation id -> (head_type, relation_name, tail_type) key.
        # NOTE(review): torch.Tensor.item() takes no arguments, so `.item(0)`
        # only works if the values of data['edge_reltype'] are numpy arrays
        # (or expose numpy's item(index)) — confirm the data format.
        self.rel2type = {v.item(0):k for k,v in data['edge_reltype'].items()}
        self.relations = self.rel2type.keys()
        self.hidden_channels = hidden_channels
        self.dtype = dtype

        embed_kwargs = {'max_norm':None, # this causes an error when not None by in-place modifications
                        'scale_grad_by_freq':scale_grad_by_freq,
                        'dtype':self.dtype}

        # These now represent categorical logit embeddings: one (num_nodes x
        # hidden) table per node type for each of head/tail x real/imag roles.
        self.head_embedding_real_dict = torch.nn.ModuleDict({nodetype:torch.nn.Embedding(num_nodes, embedding_dim=hidden_channels, **embed_kwargs) for nodetype, num_nodes in self.data['num_nodes_dict'].items()})
        self.head_embedding_imag_dict = torch.nn.ModuleDict({nodetype:torch.nn.Embedding(num_nodes, embedding_dim=hidden_channels, **embed_kwargs) for nodetype, num_nodes in self.data['num_nodes_dict'].items()})
        self.tail_embedding_real_dict = torch.nn.ModuleDict({nodetype:torch.nn.Embedding(num_nodes, embedding_dim=hidden_channels, **embed_kwargs) for nodetype, num_nodes in self.data['num_nodes_dict'].items()})
        self.tail_embedding_imag_dict = torch.nn.ModuleDict({nodetype:torch.nn.Embedding(num_nodes, embedding_dim=hidden_channels, **embed_kwargs) for nodetype, num_nodes in self.data['num_nodes_dict'].items()})
        # One embedding row per edge type (relation), shared across node types.
        self.relation_real_embedding = torch.nn.Embedding(len(self.data['edge_index_dict']), embedding_dim=hidden_channels, **embed_kwargs)
        self.relation_imag_embedding = torch.nn.Embedding(len(self.data['edge_index_dict']), embedding_dim=hidden_channels, **embed_kwargs)

        # Integer id per node type, kept as 1-element long tensors.
        self.nodetype2int = {nodetype:torch.tensor([i],dtype=torch.long) for i,nodetype in enumerate(data['num_nodes_dict'].keys())}

        # Weight initialization: log-space init so initial outputs are
        # approximately log-normally distributed (see init_params).
        for key in self.head_embedding_real_dict:
            self.init_params(self.head_embedding_real_dict[key].weight.data)
            self.init_params(self.head_embedding_imag_dict[key].weight.data)
            self.init_params(self.tail_embedding_real_dict[key].weight.data)
            self.init_params(self.tail_embedding_imag_dict[key].weight.data)
        self.init_params(self.relation_real_embedding.weight.data)
        self.init_params(self.relation_imag_embedding.weight.data)

        # Trying to emulate https://github.com/april-tools/gekcs/blob/main/src/kbc/gekc_models.py ; line 579
        # Per edge type, eps is the uniform probability of a (head, tail) pair:
        # 1 / (num possible heads * num possible tails).  Used to smooth
        # phi before taking the log in forward(); the author notes this is
        # what justifies pairing it with log1p on the partition function.
        self.eps_dict = {(h,r,t):1/(self.data['num_nodes_dict'][h] * self.data['num_nodes_dict'][t]) for (h,r,t) in self.data['edge_index_dict'].keys()}

        self.dropout = dropout

    def init_params(self, tensor, init_loc=0, init_scale=10e-3):
        '''
        In-place log-normal initialization of an embedding weight tensor.

        https://github.com/april-tools/gekcs/blob/main/src/kbc/distributions.py ## line 60
        The initial outputs of ComplEx^2 will be approx. normally distributed
        and centered (in log-space).

        NOTE(review): the `init_loc` parameter is immediately overwritten by
        the dimension-dependent expression below, so the argument is dead.
        NOTE(review): `10e-3` is 0.01 — verify this wasn't meant to be 1e-3.
        '''
        # Shift chosen so products of hidden_channels such entries stay
        # centered in log-space.
        init_loc = np.log(tensor.shape[-1]) / 3.0 + 0.5 * (init_scale ** 2)
        t = torch.exp(torch.randn(*tensor.shape, dtype=self.dtype) * init_scale - init_loc)
        tensor.copy_(t.float())

    def partition_function(self, headtype:str, relint: torch.Tensor, tailtype:str) -> torch.Tensor:
        '''
        Exact squared-ComplEx partition function as described in "How to Turn
        Your KGE into generative models", summed over all (head, tail) pairs
        of the given node types, conditioned on one relation.

        Slightly different from the paper's version since we condition on the
        relation: faster and less memory.  Works on D x D Gram matrices
        instead of materializing all N_head x N_tail scores.

        `relint` is a 0-d/1-element long tensor indexing the relation
        embeddings (forward() passes an element of torch.unique(relation)).
        Returns a scalar tensor.
        '''
        # Subject (real) ~ Sr ; Subject (imag) ~ Si ; Predicate ~ Pr/Pi ;
        # Object ~ Or/Oi.
        Sr = self.head_embedding_real_dict[headtype].weight
        Si = self.head_embedding_imag_dict[headtype].weight
        Pr = self.relation_real_embedding(relint).view(1,-1)
        Pi = self.relation_imag_embedding(relint).view(1,-1)
        Or = self.tail_embedding_real_dict[tailtype].weight
        Oi = self.tail_embedding_imag_dict[tailtype].weight

        # Apply the same dropout masks used in score() so the partition
        # function matches the dropped-out scores during training.
        if self.do_node_real is not None:
            Sr = Sr*self.do_node_real[headtype]
            Si = Si*self.do_node_imag[headtype]
            Or = Or*self.do_node_real[tailtype]
            Oi = Oi*self.do_node_imag[tailtype]

        # D x D Gram matrices over the node dimension.
        SrSr = Sr.T@Sr ; OrOr = Or.T@Or
        SiSi = Si.T@Si ; OiOi = Oi.T@Oi
        SrSi = Sr.T@Si ; OrOi = Or.T@Oi
        SiSr = Si.T@Sr ; OiOr = Oi.T@Or
        # SrSi =/= SiSr (these cross-Grams are transposes, not equal)

        # Expansion of sum over (h,t) of (A + B + C - D)^2 where
        # A = <sr,pr,or>, B = <si,pr,oi>, C = <sr,pi,oi>, D = <si,pi,or>.
        A2 = (Pr @ (SrSr * OrOr) @ Pr.T).sum()
        B2 = (Pr @ (SiSi * OiOi) @ Pr.T).sum()
        C2 = (Pi @ (SrSr * OiOi) @ Pi.T).sum()
        D2 = (Pi @ (SiSi * OrOr) @ Pi.T).sum()
        AB = (Pr @ (SrSi * OrOi) @ Pr.T).sum() # AB == BA
        AC = (Pr @ (SrSr * OrOi) @ Pi.T).sum()
        AD = (Pr @ (SrSi * OrOr) @ Pi.T).sum()
        BC = (Pr @ (SiSr * OiOi) @ Pi.T).sum()
        BD = (Pr @ (SiSi * OiOr) @ Pi.T).sum()
        CD = (Pi @ (SrSi * OiOr) @ Pi.T).sum()

        # Signs follow from squaring (A + B + C - D).
        return A2 + B2 + C2 + D2 + 2*AB + 2*AC + 2*BC - 2*AD - 2*BD - 2*CD


    def score(self, head_idx, relation_idx, tail_idx, headtype, tailtype):
        '''
        Squared ComplEx score for a batch of triples of one edge type.

        All index arguments are 1-D long tensors of equal length; headtype /
        tailtype select the embedding tables.  Returns a 1-D tensor of
        non-negative scores (the ComplEx score squared).
        '''
        u_re = self.head_embedding_real_dict[headtype](head_idx)
        u_im = self.head_embedding_imag_dict[headtype](head_idx)
        v_re = self.tail_embedding_real_dict[tailtype](tail_idx)
        v_im = self.tail_embedding_imag_dict[tailtype](tail_idx)
        r_re = self.relation_real_embedding(relation_idx)
        r_im = self.relation_imag_embedding(relation_idx)

        # Same masks as in partition_function, gathered per row.
        if self.do_node_real is not None:
            u_re *= self.do_node_real[headtype][head_idx]
            u_im *= self.do_node_imag[headtype][head_idx]
            v_re *= self.do_node_real[tailtype][tail_idx]
            v_im *= self.do_node_imag[tailtype][tail_idx]

        # Standard ComplEx trilinear score Re(<u, r, conj(v)>), then squared.
        scores = (triple_dot(u_re, r_re, v_re) +
                  triple_dot(u_im, r_re, v_im) +
                  triple_dot(u_re, r_im, v_im) -
                  triple_dot(u_im, r_im, v_re))**2

        return scores

    def set_dropout_masks(self, device):
        '''
        (Re)draw Bernoulli dropout masks over the full embedding tables, or
        clear them (set to None) in eval mode / when dropout == 0.

        One real and one imag mask per node type, each shaped like that
        type's embedding table; the same mask is reused for both the head
        and tail tables of that node type.  Called once per forward pass so
        score() and partition_function() see consistent masks.
        NOTE(review): masks are not rescaled by 1/(1-p) (no inverted
        dropout), so train/eval magnitudes differ — confirm intended.
        '''
        if (self.training) and (self.dropout > 0):

            self.do_node_real = {}
            self.do_node_imag = {}
            for nodetype in self.head_embedding_imag_dict.keys():
                self.do_node_real[nodetype] = 1.*(torch.rand(size=self.head_embedding_imag_dict[nodetype].weight.size(), device=device) > self.dropout)
                self.do_node_imag[nodetype] = 1.*(torch.rand(size=self.head_embedding_imag_dict[nodetype].weight.size(), device=device) > self.dropout)
        else:
            self.do_node_real = None
            self.do_node_imag = None

    def forward(self, head, relation, tail):
        '''
        Per-triple log-probability for a batch of (head, relation, tail)
        index tensors (1-D, equal length; relation holds integer edge-type
        ids).  Triples are grouped by relation; for each group the smoothed
        score log(phi + eps) is normalized by log1p of that edge type's
        partition function.  Returns a 1-D tensor aligned with the input.
        '''
        log_prob = torch.zeros((head.size(0)), dtype=self.dtype, device=head.device)

        self.set_dropout_masks(device=head.device)

        for rel in torch.unique(relation):
            h,r,t = self.rel2type[rel.item()]
            rel_idx = torch.nonzero(relation == rel, as_tuple=True)[0]
            Z = self.partition_function(headtype=h, relint=rel, tailtype=t)
            phi = self.score(head_idx = head[rel_idx],
                             relation_idx = relation[rel_idx],
                             tail_idx = tail[rel_idx],
                             headtype = h,
                             tailtype = t)

            # log1p pairs with the eps smoothing above (see eps_dict note).
            log_prob[rel_idx] = torch.log(phi + self.eps_dict[(h,r,t)]) - torch.log1p(Z)

        return log_prob
| 152 | + |
def triple_dot(x, y, z):
    """Batched trilinear form: sum over the last dim of the elementwise product x*y*z."""
    prod = torch.mul(torch.mul(x, y), z)
    return prod.sum(dim=-1)
| 155 | + |
0 commit comments