Commit bbb8f10

Merge branch 'dev' into fix/read_data
2 parents 6761079 + 836af9e

72 files changed: +3138 -91 lines
.gitignore
Lines changed: 4 additions & 0 deletions

@@ -176,3 +176,7 @@ lightning_logs
 logs
 .isort.cfg
 /.vscode
+
+*.out
+*.err
+*.sh
README.md
Lines changed: 27 additions & 6 deletions

@@ -3,14 +3,30 @@
 ChEBai is a deep learning library designed for the integration of deep learning methods with chemical ontologies, particularly ChEBI.
 The library emphasizes the incorporation of the semantic qualities of the ontology into the learning process.
 
-## Installation
+## News
+
+We now support regression tasks!
+
+## Note for developers
 
-You can install ChEBai via pip:
+If you have used ChEBai before PR #39, the file structure in which your ChEBI data is saved has changed. This means that
+datasets will be freshly generated. The data itself, however, is the same. If you want to keep the old data (including the old
+splits), you can use a migration script. It copies the old data to the new location for a specific ChEBI class
+(including the ChEBI version and other parameters). The script can be called by specifying the data module from a config:
 ```
-pip install chebai
+python chebai/preprocessing/migration/chebi_data_migration.py migrate --datamodule=[path-to-data-config]
+```
+or by specifying the class name (e.g. `ChEBIOver50`) and arguments separately:
 ```
+python chebai/preprocessing/migration/chebi_data_migration.py migrate --class_name=[data-class] [--chebi_version=[version]]
+```
+The new dataset will, by default, generate random data splits (with a given seed).
+To reuse a fixed data split, you have to provide the path of the csv file generated during the migration:
+`--data.init_args.splits_file_path=[path-to-processed_data]/splits.csv`
 
-Alternatively, you can get the latest development version directly from GitHub:
+## Installation
+
+To install ChEBai, follow these steps:
 
 1. Clone the repository:
 ```
@@ -63,11 +79,16 @@ A command with additional options may look like this:
 python3 -m chebai fit --trainer=configs/training/default_trainer.yml --model=configs/model/electra.yml --model.train_metrics=configs/metrics/micro-macro-f1.yml --model.test_metrics=configs/metrics/micro-macro-f1.yml --model.val_metrics=configs/metrics/micro-macro-f1.yml --model.pretrained_checkpoint=electra_pretrained.ckpt --model.load_prefix=generator. --data=configs/data/chebi50.yml --model.criterion=configs/loss/bce.yml --data.init_args.batch_size=10 --trainer.logger.init_args.name=chebi50_bce_unweighted --data.init_args.num_workers=9 --model.pass_loss_kwargs=false --data.init_args.chebi_version=231 --data.init_args.data_limit=1000
 ```
 
-### Fine-tuning for Toxicity prediction
+### Fine-tuning for classification tasks, e.g. Toxicity prediction
 ```
 python -m chebai fit --config=[path-to-your-tox21-config] --trainer.callbacks=configs/training/default_callbacks.yml --model.pretrained_checkpoint=[path-to-pretrained-model]
 ```
 
+### Fine-tuning for regression tasks, e.g. solubility prediction
+```
+python -m chebai fit --config=[path-to-your-esol-config] --trainer.callbacks=configs/training/solCur_callbacks.yml --model.pretrained_checkpoint=[path-to-pretrained-model]
+```
+
 ### Predicting classes given SMILES strings
 ```
 python3 -m chebai predict_from_file --model=[path-to-model-config] --checkpoint_path=[path-to-model] --input_path=[path-to-file-containing-smiles] [--classes_path=[path-to-classes-file]] [--save_to=[path-to-output]]
@@ -81,7 +102,7 @@ The `classes_path` is the path to the dataset's `raw/classes.txt` file that cont
 You can evaluate a model trained on the ontology extension task in one of two ways:
 
 ### 1. Using the Jupyter Notebook
-An example notebook is provided at `tutorials/eval_model_basic.ipynb`.
+An example notebook is provided at `tutorials/eval_model_basic.ipynb`.
 - Load your finetuned model and run the evaluation cells to compute metrics on the test set.
 
 ### 2. Using the Lightning CLI
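For readers who prefer to wire the migrated split in code rather than via CLI flags, here is a minimal sketch. It assumes that `ChEBIOver50` (named in the README above) can be imported from `chebai.preprocessing.datasets.chebi` and that it accepts `chebi_version` and `splits_file_path` as constructor arguments, mirroring the `--data.init_args.*` flags; treat the import path and the example file path as illustrative, not authoritative.
```
# Hedged sketch: reuse a fixed split programmatically. The import path and the
# splits.csv location are assumptions based on the CLI flags shown in the README.
from chebai.preprocessing.datasets.chebi import ChEBIOver50

data_module = ChEBIOver50(
    chebi_version=231,  # same value as --data.init_args.chebi_version=231 above
    splits_file_path="data/chebi_v231/ChEBI50/processed/splits.csv",  # file written by the migration script
)
```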

chebai/cli.py
Lines changed: 30 additions & 1 deletion

@@ -60,15 +60,44 @@ def call_data_methods(data: Type[XYBaseDataModule]):
         )
 
         for kind in ("train", "val", "test"):
-            for average in ("micro-f1", "macro-f1", "balanced-accuracy"):
+            for average in (
+                "micro-f1",
+                "macro-f1",
+                "balanced-accuracy",
+                "roc-auc",
+                "f1",
+                "mse",
+                "rmse",
+                "r2",
+            ):
+                # When using lightning > 2.5.1 then need to uncomment all metrics that are not used
+                # for average in ("mse", "rmse", "r2"):  # for regression
+                # for average in ("f1", "roc-auc"):  # for binary classification
+                # for average in ("micro-f1", "macro-f1", "roc-auc"):  # for multilabel classification
+                # for average in ("micro-f1", "macro-f1", "balanced-accuracy", "roc-auc"):  # for multilabel classification using balanced-accuracy
                 parser.link_arguments(
                     "data.num_of_labels",
                     f"model.init_args.{kind}_metrics.init_args.metrics.{average}.init_args.num_labels",
                     apply_on="instantiate",
                 )
+
         parser.link_arguments(
             "data.num_of_labels", "trainer.callbacks.init_args.num_labels"
         )
+        # parser.link_arguments(
+        #     "model.init_args.out_dim", "trainer.callbacks.init_args.num_labels"
+        # )
+        # parser.link_arguments(
+        #     "data", "model.init_args.criterion.init_args.data_extractor"
+        # )
+        # parser.link_arguments(
+        #     "data.init_args.chebi_version",
+        #     "model.init_args.criterion.init_args.data_extractor.init_args.chebi_version",
+        # )
+
+        parser.link_arguments(
+            "data", "model.init_args.criterion.init_args.data_extractor"
+        )
 
     @staticmethod
     def subcommands() -> Dict[str, Set[str]]:
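The `parser.link_arguments` calls above follow the standard Lightning CLI pattern: a value computed by one component (here, the data module's `num_of_labels`) is copied into the config of another component after instantiation. A minimal, self-contained illustration of that pattern is sketched below; `MyCLI` and the linked target are placeholders, not ChEBai's actual CLI.
```
# Minimal sketch of the link_arguments pattern (placeholder CLI, not ChEBai's own).
from lightning.pytorch.cli import LightningCLI


class MyCLI(LightningCLI):
    def add_arguments_to_parser(self, parser):
        # After the data module is instantiated, copy its num_of_labels attribute
        # into the model's out_dim init argument.
        parser.link_arguments(
            "data.num_of_labels",
            "model.init_args.out_dim",
            apply_on="instantiate",
        )
```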

chebai/loss/bce_weighted.py
Lines changed: 23 additions & 13 deletions

@@ -10,7 +10,6 @@
 class BCEWeighted(torch.nn.BCEWithLogitsLoss):
     """
     BCEWithLogitsLoss with weights automatically computed according to the beta parameter.
-    If beta is None or data_extractor is None, the loss is unweighted.
 
     This class computes weights based on the formula from the paper by Cui et al. (2019):
     https://openaccess.thecvf.com/content_CVPR_2019/papers/Cui_Class-Balanced_Loss_Based_on_Effective_Number_of_Samples_CVPR_2019_paper.pdf
@@ -22,7 +21,7 @@ class BCEWeighted(torch.nn.BCEWithLogitsLoss):
 
     def __init__(
         self,
-        beta: Optional[float] = None,
+        beta: float = 0.99,
         data_extractor: Optional[XYBaseDataModule] = None,
         **kwargs,
     ):
@@ -32,11 +31,26 @@ def __init__(
         if isinstance(data_extractor, LabeledUnlabeledMixed):
             data_extractor = data_extractor.labeled
         self.data_extractor = data_extractor
+
+        assert (
+            isinstance(beta, float) and beta > 0.0
+        ), f"Beta parameter must be a float with value greater than 0.0, for loss class {self.__class__.__name__}."
+
+        assert (
+            self.data_extractor is not None
+        ), f"Data extractor must be provided if this loss class ({self.__class__.__name__}) is used."
+
+        assert all(
+            os.path.exists(os.path.join(self.data_extractor.processed_dir, file_name))
+            for file_name in self.data_extractor.processed_file_names
+        ), "Dataset files not found. Make sure the dataset is processed before using this loss."
+
         assert (
             isinstance(self.data_extractor, _ChEBIDataExtractor)
             or self.data_extractor is None
         )
         super().__init__(**kwargs)
+        self.pos_weight: Optional[torch.Tensor] = None
 
     def set_pos_weight(self, input: torch.Tensor) -> None:
         """
@@ -45,17 +59,7 @@ def set_pos_weight(self, input: torch.Tensor) -> None:
         Args:
             input (torch.Tensor): The input tensor for which to set the positive weights.
         """
-        if (
-            self.beta is not None
-            and self.data_extractor is not None
-            and all(
-                os.path.exists(
-                    os.path.join(self.data_extractor.processed_dir, file_name)
-                )
-                for file_name in self.data_extractor.processed_file_names
-            )
-            and self.pos_weight is None
-        ):
+        if self.pos_weight is None:
             print(
                 f"Computing loss-weights based on v{self.data_extractor.chebi_version} dataset (beta={self.beta})"
             )
@@ -96,3 +100,9 @@ def forward(
         """
         self.set_pos_weight(input)
         return super().forward(input, target)
+
+
+class UnWeightedBCEWithLogitsLoss(torch.nn.BCEWithLogitsLoss):
+    def forward(self, input, target, **kwargs):
+        # As the custom passed kwargs are not used in BCEWithLogitsLoss, we can ignore them
+        return super().forward(input, target)
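For orientation, the weighting scheme that `BCEWeighted` cites (Cui et al., 2019) gives each class a weight inversely proportional to its "effective number" of samples, E_n = (1 - beta^n) / (1 - beta). The sketch below illustrates that formula on a toy multi-label matrix; it is not the library's exact implementation, just the underlying idea.
```
# Toy illustration of the effective-number weighting from Cui et al. (2019);
# not ChEBai's implementation, just the idea BCEWeighted builds on.
import torch


def effective_number_weights(labels: torch.Tensor, beta: float = 0.99) -> torch.Tensor:
    """labels: binary multi-label matrix of shape (num_samples, num_classes)."""
    n_pos = labels.sum(dim=0)  # positive count per class
    effective_num = (1.0 - beta**n_pos) / (1.0 - beta)  # E_n = (1 - beta^n) / (1 - beta)
    weights = 1.0 / effective_num.clamp(min=1e-8)  # rarer classes get larger weights
    return weights * labels.shape[1] / weights.sum()  # normalize so the mean weight is ~1


labels = torch.randint(0, 2, (100, 5)).float()
print(effective_number_weights(labels))  # tensor of 5 per-class weights
```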

chebai/loss/focal_loss.py
Lines changed: 152 additions & 0 deletions

@@ -0,0 +1,152 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+# from https://github.com/itakurah/Focal-loss-PyTorch
+
+
+class FocalLoss(nn.Module):
+    def __init__(
+        self,
+        gamma=2,
+        alpha=None,
+        reduction="mean",
+        task_type="binary",
+        num_classes=None,
+    ):
+        """
+        Unified Focal Loss class for binary, multi-class, and multi-label classification tasks.
+        :param gamma: Focusing parameter, controls the strength of the modulating factor (1 - p_t)^gamma
+        :param alpha: Balancing factor, can be a scalar or a tensor for class-wise weights. If None, no class balancing is used.
+        :param reduction: Specifies the reduction method: 'none' | 'mean' | 'sum'
+        :param task_type: Specifies the type of task: 'binary', 'multi-class', or 'multi-label'
+        :param num_classes: Number of classes (only required for multi-class classification)
+        """
+        super(FocalLoss, self).__init__()
+        self.gamma = gamma
+        self.alpha = alpha
+        self.reduction = reduction
+        self.task_type = task_type
+        self.num_classes = num_classes
+
+        # Handle alpha for class balancing in multi-class tasks
+        if (
+            task_type == "multi-class"
+            and alpha is not None
+            and isinstance(alpha, (list, torch.Tensor))
+        ):
+            assert (
+                num_classes is not None
+            ), "num_classes must be specified for multi-class classification"
+            if isinstance(alpha, list):
+                self.alpha = torch.Tensor(alpha)
+            else:
+                self.alpha = alpha
+
+    def forward(self, inputs, targets):
+        """
+        Forward pass to compute the Focal Loss based on the specified task type.
+        :param inputs: Predictions (logits) from the model.
+            Shape:
+             - binary/multi-label: (batch_size, num_classes)
+             - multi-class: (batch_size, num_classes)
+        :param targets: Ground truth labels.
+            Shape:
+             - binary: (batch_size,)
+             - multi-label: (batch_size, num_classes)
+             - multi-class: (batch_size,)
+        """
+        if self.task_type == "binary":
+            return self.binary_focal_loss(inputs, targets)
+        elif self.task_type == "multi-class":
+            return self.multi_class_focal_loss(inputs, targets)
+        elif self.task_type == "multi-label":
+            return self.multi_label_focal_loss(inputs, targets)
+        else:
+            raise ValueError(
+                f"Unsupported task_type '{self.task_type}'. Use 'binary', 'multi-class', or 'multi-label'."
+            )
+
+    def binary_focal_loss(self, inputs, targets):
+        """Focal loss for binary classification."""
+        probs = torch.sigmoid(inputs)
+        targets = targets.float()
+
+        # Compute binary cross entropy
+        bce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
+
+        # Compute focal weight
+        p_t = probs * targets + (1 - probs) * (1 - targets)
+        focal_weight = (1 - p_t) ** self.gamma
+
+        # Apply alpha if provided
+        if self.alpha is not None:
+            alpha_t = self.alpha * targets + (1 - self.alpha) * (1 - targets)
+            bce_loss = alpha_t * bce_loss
+
+        # Apply focal loss weighting
+        loss = focal_weight * bce_loss
+
+        if self.reduction == "mean":
+            return loss.mean()
+        elif self.reduction == "sum":
+            return loss.sum()
+        return loss
+
+    def multi_class_focal_loss(self, inputs, targets):
+        """Focal loss for multi-class classification."""
+        if self.alpha is not None:
+            alpha = self.alpha.to(inputs.device)
+
+        # Convert logits to probabilities with softmax
+        probs = F.softmax(inputs, dim=1)
+
+        # One-hot encode the targets
+        targets_one_hot = F.one_hot(targets, num_classes=self.num_classes).float()
+
+        # Compute cross-entropy for each class
+        ce_loss = -targets_one_hot * torch.log(probs)
+
+        # Compute focal weight
+        p_t = torch.sum(probs * targets_one_hot, dim=1)  # p_t for each sample
+        focal_weight = (1 - p_t) ** self.gamma
+
+        # Apply alpha if provided (per-class weighting)
+        if self.alpha is not None:
+            alpha_t = alpha.gather(0, targets)
+            ce_loss = alpha_t.unsqueeze(1) * ce_loss
+
+        # Apply focal loss weight
+        loss = focal_weight.unsqueeze(1) * ce_loss
+
+        if self.reduction == "mean":
+            return loss.mean()
+        elif self.reduction == "sum":
+            return loss.sum()
+        return loss
+
+    def multi_label_focal_loss(self, inputs, targets):
+        """Focal loss for multi-label classification."""
+        probs = torch.sigmoid(inputs)
+
+        # Compute binary cross entropy
+        bce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
+
+        # Compute focal weight
+        p_t = probs * targets + (1 - probs) * (1 - targets)
+        focal_weight = (1 - p_t) ** self.gamma
+
+        # Apply alpha if provided
+        if self.alpha is not None:
+            alpha_t = self.alpha * targets + (1 - self.alpha) * (1 - targets)
+            bce_loss = alpha_t * bce_loss
+
+        # Apply focal loss weight
+        loss = focal_weight * bce_loss
+
+        if self.reduction == "mean":
+            return loss.mean()
+        elif self.reduction == "sum":
+            return loss.sum()
+        return loss
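Since `FocalLoss` is a new addition, a quick smoke test of its multi-label path may help; the shapes and the alpha value below are illustrative only.
```
# Quick usage check for the FocalLoss class added above (multi-label case).
import torch

from chebai.loss.focal_loss import FocalLoss

loss_fn = FocalLoss(gamma=2, alpha=0.25, task_type="multi-label")
logits = torch.randn(8, 10)  # (batch_size, num_classes)
targets = torch.randint(0, 2, (8, 10)).float()  # binary multi-label targets
print(loss_fn(logits, targets))  # scalar tensor, since reduction="mean"
```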

chebai/loss/semantic.py
Lines changed: 2 additions & 2 deletions

@@ -2,7 +2,7 @@
 import math
 import os
 import pickle
-from typing import TYPE_CHECKING, List, Literal, Union
+from typing import TYPE_CHECKING, List, Literal, Union, Tuple
 
 import torch
 
@@ -62,7 +62,7 @@ def __init__(
         pos_epsilon: float = 0.01,
         multiply_by_softmax: bool = False,
         use_sigmoidal_implication: bool = False,
-        weight_epoch_dependent: Union[bool | tuple[int, int]] = False,
+        weight_epoch_dependent: Union[bool, Tuple[int, int]] = False,
         start_at_epoch: int = 0,
         violations_per_cls_aggregator: Literal[
             "sum", "max", "mean", "log-sum", "log-max", "log-mean"

chebai/models/base.py
Lines changed: 6 additions & 2 deletions

@@ -42,14 +42,19 @@ def __init__(
         exclude_hyperparameter_logging: Optional[Iterable[str]] = None,
         **kwargs,
     ):
-        super().__init__()
+        super().__init__(**kwargs)
+        # super().__init__()
         if exclude_hyperparameter_logging is None:
             exclude_hyperparameter_logging = tuple()
         self.criterion = criterion
         assert out_dim is not None, "out_dim must be specified"
         assert input_dim is not None, "input_dim must be specified"
         self.out_dim = out_dim
         self.input_dim = input_dim
+        print(
+            f"Input dimension for the model: {self.input_dim}",
+            f"Output dimension for the model: {self.out_dim}",
+        )
 
         self.save_hyperparameters(
             ignore=[
@@ -273,7 +278,6 @@ def _execute(
         loss_kwargs = dict()
         if self.pass_loss_kwargs:
             loss_kwargs = loss_kwargs_candidates
-            loss_kwargs["current_epoch"] = self.trainer.current_epoch
         loss = self.criterion(loss_data, loss_labels, **loss_kwargs)
         if isinstance(loss, tuple):
             unnamed_loss_index = 1
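The switch from `super().__init__()` to `super().__init__(**kwargs)` forwards any extra keyword arguments to the parent class instead of silently dropping them. A toy example of that pattern (unrelated to ChEBai's actual class hierarchy) is sketched below.
```
# Toy illustration of forwarding **kwargs to the parent constructor.
class Base:
    def __init__(self, verbose: bool = False):
        self.verbose = verbose


class Child(Base):
    def __init__(self, out_dim: int, **kwargs):
        super().__init__(**kwargs)  # without this forwarding, verbose=True would be lost
        self.out_dim = out_dim


print(Child(out_dim=4, verbose=True).verbose)  # True
```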
