Merge pull request #6 from boschresearch/feature/5-anomalous-dataset

kgoebler · web-flow · commit ce066ded09e6 · 2024-03-25T11:29:34.000+01:00
Feature/5 anomalous dataset
diff --git a/README.md b/README.md
@@ -4,9 +4,9 @@
 [![Code style: ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/format.json)](https://github.com/astral-sh/ruff)
 
 This repo provides details regarding $\texttt{causalAssembly}$, a causal discovery benchmark data tool based on complex production data.
-Theoretical details and information regarding construction are presented in the paper:
+Theoretical details and information regarding construction are presented in the [paper](https://arxiv.org/abs/2306.10816):
 
-    Göbler, K., Windisch, T., Pychynski, T., Sonntag, S., Roth, M., & Drton, M. (2023). causalAssembly: Generating Realistic Production Data for Benchmarking Causal Discovery. arXiv preprint arXiv:2306.10816.
+    Göbler, K., Windisch, T., Pychynski, T., Sonntag, S., Roth, M., & Drton, M. causalAssembly: Generating Realistic Production Data for Benchmarking Causal Discovery, to appear in Proceedings of the 3rd Conference on Causal Learning and Reasoning (CLeaR), 2024.
 ## Authors
 * [Konstantin Goebler](mailto:konstantin.goebler@de.bosch.com)
 * [Steffen Sonntag](mailto:steffen.sonntag@de.bosch.com)
@@ -69,6 +69,55 @@ assembly_line.Station3.drf = fit_drf(assembly_line.Station3, data=assembly_line_
 station3_sample = assembly_line.Station3.sample_from_drf(size=n_select)
 
 ```
+### <a name="Interventional data">Interventional data</a>
+In case you want to create interventional data, we currently support hard and soft interventions.
+For soft interventions we use `sympy`'s `RandomSymbol` class. Essentially, soft interventions should
+be declared by choosing your preferred random variable with associated distribution from [here](https://docs.sympy.org/latest/modules/stats.html#continuous-types). Simple examples include:
+
+```python
+from sympy.stats import Beta, Normal, Uniform
+
+x = Beta("x", 1, 1)
+y = Normal("y", 0, 1)
+z = Uniform("z", 0, 1)
+
+```
+
+The following example is similar to the basic use example above where we now intervene on two nodes in the graph.
+
+```python
+from sympy.stats import Beta
+
+from causalAssembly.drf_fitting import fit_drf
+from causalAssembly.models_dag import ProductionLineGraph
+
+seed = 2023
+n_select = 500
+
+assembly_line_data = ProductionLineGraph.get_data()
+
+# take subsample for demonstration purposes
+assembly_line_data = assembly_line_data.sample(n_select, random_state=seed, replace=False)
+
+# load in ground truth
+assembly_line = ProductionLineGraph.get_ground_truth()
+
+# fit drf and sample for entire line
+assembly_line.drf = fit_drf(assembly_line, data=assembly_line_data)
+
+# intervene on two nodes in the assembly line
+assembly_line.intervene_on(
+    nodes_values={"Station3_mp_41": 2, "Station4_mp_58": Beta("noise", 1, 1)}
+)
+
+# sample from the corresponding interventional distribution
+my_int_df = assembly_line.sample_from_interventional_drf(size=5)
+
+print(my_int_df[["Station3_mp_41", "Station4_mp_58"]])
+
+```
+
+Note that intervening does not alter any of the functionalities introduced above. The intervened-upon DAGs are stored in `mutilated_dags`. When calling `sample_from_drf()` the ground truth DAG as described in the paper is used. To sample from the interventional distribution, you must use `sample_from_interventional_drf()`.
 
 ### <a name="how-to-semisynthesize">How to semisynthesize</a>
 
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-1.1.2
+1.1.3
diff --git a/causalAssembly/models_dag.py b/causalAssembly/models_dag.py
@@ -30,6 +30,8 @@
 from matplotlib.patches import BoxStyle, FancyBboxPatch
 from networkx.readwrite import json_graph
 from scipy.stats import gaussian_kde
+from sympy.stats import sample as sympy_sample
+from sympy.stats.rv import RandomSymbol
 
 from causalAssembly.dag_utils import _bootstrap_sample, tuples_from_cartesian_product
 from causalAssembly.pdag import PDAG, dag2cpdag
@@ -79,6 +81,78 @@ def _sample_from_drf(
     return new_df[prod_object.nodes]
 
 
+def _interventional_sample_from_drf(
+    prod_object: ProductionLineGraph,
+    which_intervention: int | str = 0,
+    size=10,
+    smoothed: bool = True,
+) -> pd.DataFrame:
+    """Sample all nodes of `prod_object` under one of its stored interventions.
+
+    `which_intervention` is a key of `prod_object.interventional_drf` or the
+    position of that key. Intervened nodes become a constant (hard) or a draw
+    from their sympy random variable (soft); all others are drawn from their
+    fitted KDE or forest. The returned columns follow `prod_object.nodes`.
+    Raises ValueError (nothing fitted/intervened) or NotImplementedError (bad value).
+    """
+    if not prod_object.drf:
+        raise ValueError("Nothing to sample from. Learn DRF first!")
+    if not prod_object.mutilated_dags:
+        raise ValueError("No mutilated DAGs available. Please intervene first.")
+    if not prod_object.interventional_drf:
+        raise ValueError(
+            "No intervention values available. "
+            "Please verify your hard/soft interventions."
+        )
+    if isinstance(which_intervention, int):
+        replacements = list(prod_object.interventional_drf.values())[which_intervention]
+    elif isinstance(which_intervention, str):
+        replacements = prod_object.interventional_drf[which_intervention]
+    else:
+        raise ValueError("Please specify which intervention you want to sample from.")

+    sample_dict = {}
+    # parents are looked up in sample_dict below, so causal_order must list them first
+    for node in prod_object.causal_order:
+        if node in replacements:
+            value = replacements[node]
+            # hard intervention: any real number, not only int, gives a constant column
+            if isinstance(value, (int, float, np.integer, np.floating)):
+                sample_dict[node] = np.repeat(a=value, repeats=size)
+            # soft intervention: draw from the user supplied sympy random variable
+            elif isinstance(value, RandomSymbol):
+                sample_dict[node] = sympy_sample(
+                    expr=value, size=size, seed=prod_object.random_state
+                )
+            else:
+                raise NotImplementedError(
+                    "Currently only hard and soft interventions are implemented"
+                )
+            continue

+        if isinstance(prod_object.drf[node], gaussian_kde):
+            # Node has no parents: resample its KDE (smoothed) or bootstrap its data
+            if smoothed:
+                sample_dict[node] = prod_object.drf[node].resample(
+                    size=size, seed=prod_object.random_state
+                )[0]
+            else:
+                sample_dict[node] = _bootstrap_sample(
+                    rng=prod_object.random_state,
+                    data=prod_object.drf[node].dataset[0],
+                    size=size,
+                )
+        else:
+            parents = prod_object.parents(of_node=node)
+            new_data = pd.DataFrame({col: sample_dict[col] for col in parents})
+            forest = prod_object.drf[node]
+            sample_dict[node] = forest.produce_sample(
+                newdata=new_data, random_state=prod_object.random_state
+            )
+    new_df = pd.DataFrame(sample_dict)
+    return new_df[prod_object.nodes]
+
+
 class ProcessCell:
     """
     Representation of a single Production Line Cell
@@ -288,6 +362,19 @@ def sample_from_drf(self, size=10, smoothed: bool = True) -> pd.DataFrame:
         """
         return _sample_from_drf(prod_object=self, size=size, smoothed=smoothed)
 
+    def interventional_sample_from_drf(self, size=10, smoothed: bool = True) -> pd.DataFrame:
+        """Draw from the trained DRF under the first stored intervention.
+
+        Args:
+            size (int, optional): Number of samples to be drawn. Defaults to 10.
+            smoothed (bool, optional): If set to true, marginal distributions will
+                be sampled from smoothed bootstraps. Defaults to True.
+
+        Returns:
+            pd.DataFrame: Data frame that follows the interventional distribution.
+        """
+        return _interventional_sample_from_drf(prod_object=self, size=size, smoothed=smoothed)
+
     def _generate_random_dag(self, n_nodes: int = 5, p: float = 0.1) -> nx.DiGraph:
         """
         Creates a random DAG by
@@ -711,6 +798,8 @@ def __init__(self):
         self.cell_connector_edges = list()
         self.cell_order = list()
         self.drf: dict = dict()
+        self.interventional_drf: dict = dict()
+        self.__init_mutilated_dag()
 
     @property
     def random_state(self):
@@ -722,6 +811,9 @@ def random_state(self, r: np.random.Generator):
             raise AssertionError("Specify numpy random number generator object!")
         self._random_state = r
 
+    def __init_mutilated_dag(self):
+        self.mutilated_dags = {}  # filled by intervene_on: "do([...])" label -> mutilated graph
+
     @property
     def graph(self) -> nx.DiGraph:
         """
@@ -1002,6 +1094,78 @@ def connect_across_cells_manually(self, edges: list[tuple]):
         """
         self.cell_connector_edges.extend(edges)
 
+    def intervene_on(self, nodes_values: dict[str, RandomSymbol | float]):
+        """Register a hard or soft intervention on one or more nodes.
+
+        Every key of `nodes_values` is a node to intervene upon; its value is
+        either a real number (hard intervention) or a `sympy.stats` random
+        variable (soft intervention). Incoming edges of these nodes are removed
+        from a copy of the graph that is stored in `mutilated_dags`, the values
+        themselves are stored in `interventional_drf` under the same key.
+
+        Example:
+            `assembly_line.intervene_on(nodes_values={"Station3_mp_41": 2})`
+
+        Args:
+            nodes_values (dict[str, RandomSymbol | float]): Mapping from node name
+                to the number or random variable the node is set to.
+
+        Raises:
+            AssertionError: If no drf was trained or node(s) are not in the graph.
+        """
+        if not self.drf:
+            raise AssertionError("You need to train a drf first.")

+        targets = list(nodes_values.keys())
+        if not set(targets) <= set(self.nodes):
+            raise AssertionError(
+                "One or more nodes you want to intervene upon are not in the graph."
+            )

+        mutilated_dag = self.graph.copy()
+        for node in targets:
+            # cut every edge pointing into the intervened node
+            incoming = [(parent, node) for parent in self.parents(of_node=node)]
+            mutilated_dag.remove_edges_from(incoming)

+        label = f"do({targets})"  # specifying the same nodes twice will override
+        self.mutilated_dags[label] = mutilated_dag
+        self.interventional_drf[label] = dict(nodes_values.items())
+
+    @property
+    def interventions(self) -> list:
+        """List every intervention registered so far.
+
+        Returns:
+            list: keys of `mutilated_dags`, i.e. intervened nodes in do(x) notation.
+        """
+        return [*self.mutilated_dags]
+
+    def interventional_amat(self, which_intervention: int | str) -> pd.DataFrame:
+        """Returns the adjacency matrix of a chosen mutilated DAG.
+
+        Args:
+            which_intervention (int | str): Position of your chosen intervention in
+                `interventions` (negative values count from the end) or literal string.
+
+        Raises:
+            ValueError: If the intervention name or index does not exist.
+
+        Returns:
+            pd.DataFrame: Adjacency matrix.
+        """
+        if isinstance(which_intervention, str) and which_intervention not in self.interventions:
+            raise ValueError("The intervention you provide does not exist.")
+        if isinstance(which_intervention, int):
+            # valid list positions are -n..n-1; reject the rest here, not via IndexError
+            n_interventions = len(self.interventions)
+            if not -n_interventions <= which_intervention < n_interventions:
+                raise ValueError("The intervention you index does not exist.")
+            which_intervention = self.interventions[which_intervention]

+        mutilated_dag = self.mutilated_dags[which_intervention].copy()
+        return nx.to_pandas_adjacency(mutilated_dag, weight=None)
+
     @classmethod
     def get_ground_truth(cls) -> ProductionLineGraph:
         """Loads in the ground_truth as described in the paper:
@@ -1142,6 +1306,27 @@ def sample_from_drf(self, size=10, smoothed: bool = True) -> pd.DataFrame:
         """
         return _sample_from_drf(prod_object=self, size=size, smoothed=smoothed)
 
+    def sample_from_interventional_drf(
+        self, which_intervention: str | int = 0, size=10, smoothed: bool = True
+    ) -> pd.DataFrame:
+        """Sample from the interventional distribution of the fitted DRF.
+
+        Args:
+            which_intervention (str | int): Intervention to sample under, given
+                either by its literal name (see the property `interventions`) or
+                by its index. Defaults to the first intervention.
+            size (int, optional): Number of samples to be drawn. Defaults to 10.
+            smoothed (bool, optional): If set to true, marginal distributions will
+                be sampled from smoothed bootstraps. Defaults to True.
+
+        Returns:
+            pd.DataFrame: Data frame that follows the interventional distribution
+                implied by the ground truth.
+        """
+        # all checks (trained drf, stored interventions) happen in the module level helper
+        options = {"which_intervention": which_intervention, "size": size, "smoothed": smoothed}
+        return _interventional_sample_from_drf(prod_object=self, **options)
+
     def hidden_nodes(self) -> list:
         """Returns list of nodes marked as hidden
 
@@ -1230,7 +1415,7 @@ def show(self, meta_description: list | None = None, fig_size: tuple = (15, 8)):
         Raises:
             AssertionError: Meta list entry needs to exist for each cell!
         """
-        fig, ax = plt.subplots(figsize=fig_size)
+        _, ax = plt.subplots(figsize=fig_size)
 
         pos = {}
 
diff --git a/tests/test_models_dag.py b/tests/test_models_dag.py
@@ -20,6 +20,7 @@
 import numpy as np
 import pandas as pd
 import pytest
+from sympy.stats import Beta
 
 from causalAssembly.models_dag import NodeAttributes, ProcessCell, ProductionLineGraph
 
@@ -603,3 +604,8 @@ def test_between_edges_adjacency_matrix(self):
         )
         assert between_amat.loc[pline.cell1.nodes, pline.cell1.nodes].sum().sum() == 0
         assert between_amat.loc[pline.cell2.nodes, pline.cell2.nodes].sum().sum() == 0
+
+    def test_interventional_drf_error(self):
+        """Sampling interventional data from a line without a trained drf must fail."""
+        untrained_line = ProductionLineGraph()
+        pytest.raises(ValueError, untrained_line.sample_from_interventional_drf)