diff --git a/README.md b/README.md
index 30e976f..e556c8c 100644
--- a/README.md
+++ b/README.md
@@ -4,10 +4,12 @@
[](https://github.com/astral-sh/ruff)
This repo provides details regarding $\texttt{causalAssembly}$, a causal discovery benchmark data tool based on complex production data.
-Theoretical details and information regarding construction are presented in the [paper](https://arxiv.org/abs/2306.10816):
+Theoretical details and information regarding construction are presented in the [paper](https://proceedings.mlr.press/v236/gobler24a.html):
+
+ Göbler, K., Windisch, T., Drton, M., Pychynski, T., Roth, M., & Sonntag, S. (2024). causalAssembly: Generating Realistic Production Data for Benchmarking Causal Discovery. In Proceedings of the Third Conference on Causal Learning and Reasoning (pp. 609–642). PMLR.
- Göbler, K., Windisch, T., Pychynski, T., Sonntag, S., Roth, M., & Drton, M. causalAssembly: Generating Realistic Production Data for Benchmarking Causal Discovery, to appear in Proceedings of the 3rd Conference on Causal Learning and Reasoning (CLeaR), 2024,
## Authors
+
* [Konstantin Goebler](mailto:konstantin.goebler@de.bosch.com)
* [Steffen Sonntag](mailto:steffen.sonntag@de.bosch.com)
@@ -26,14 +28,13 @@ The package can be installed as follows
pip install causalAssembly
-[comment]: <> (git+https://github.com/boschresearch/causalAssembly.git)
## How to use
This is how $\texttt{causalAssembly}$'s functionality may be used. Be sure to read the [documentation](https://boschresearch.github.io/causalAssembly/) for more in-depth details regarding available functions and classes.
In case you want to train a distributional random forests yourself (see [how to semisynthetsize](#how-to-semisynthesize)),
-you need an R installation as well as the corresponding [drf](https://cran.r-project.org/web/packages/drf/index.html) R package.
+you need an R installation.
Sampling has first been proposed in [[2]](#2).
*Note*: For Windows users the python package [rpy2](https://github.com/rpy2/rpy2) might cause issues.
@@ -69,7 +70,9 @@ assembly_line.Station3.drf = fit_drf(assembly_line.Station3, data=assembly_line_
station3_sample = assembly_line.Station3.sample_from_drf(size=n_select)
```
+
### Interventional data
+
In case you want to create interventional data, we currently support hard and soft interventions.
For soft interventions we use `sympy`'s `RandomSymbol` class. Essentially, soft interventions should
be declared by choosing your preferred random variable with associated distribution from [here](https://docs.sympy.org/latest/modules/stats.html#continuous-types). Simple examples include:
@@ -158,7 +161,6 @@ if nx.is_directed_acyclic_graph(s_graph):
### How to generate random production DAGs
-
The `ProductionLineGraph` class can further be used to generate completely random DAGs that follow an assembly line logic. Consider the following example:
```python
@@ -183,12 +185,11 @@ example_line.connect_cells(forward_probs= [.1])
example_line.show()
```
-### How to generate FCMs
+### How to generate FCMs
$\texttt{causalAssembly}$ also allows creating structural causal models (SCM) or synonymously functional causal models (FCM). In particular, we employ symbolic programming to allow for a seamless interplay between readability and performance. The `FCM` class is completely general and inherits no production data logic. See the example below for construction and usage.
-
```python
import numpy as np
@@ -279,6 +280,7 @@ Please feel free to contact one of the authors in case you wish to contribute.
| [matplotlib](https://github.com/matplotlib/matplotlib) | [Other](https://github.com/matplotlib/matplotlib/tree/main/LICENSE) | Dependency |
| [sympy](https://github.com/sympy/sympy) | [BSD-3-Clause License](https://github.com/sympy/sympy/blob/master/LICENSE) | Dependency |
| [rpy2](https://github.com/rpy2/rpy2) | [GNU General Public License v2.0](https://github.com/rpy2/rpy2/blob/master/LICENSE) | Dependency |
+
### Development dependency
| Name | License | Type |
diff --git a/VERSION b/VERSION
index 26aaba0..6085e94 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.2.0
+1.2.1
diff --git a/causalAssembly/drf_fitting.py b/causalAssembly/drf_fitting.py
index c57e2e7..ee4a6c0 100644
--- a/causalAssembly/drf_fitting.py
+++ b/causalAssembly/drf_fitting.py
@@ -19,17 +19,26 @@
import numpy as np
import pandas as pd
import rpy2.robjects as ro
-import rpy2.robjects.numpy2ri
-from rpy2.robjects import pandas2ri
+import rpy2.robjects.packages as rpackages
+from rpy2.robjects import numpy2ri, pandas2ri
+from rpy2.robjects.conversion import localconverter
from rpy2.robjects.packages import importr
from scipy.stats import gaussian_kde
from causalAssembly.dag import DAG
from causalAssembly.models_dag import ProcessCell, ProductionLineGraph
-rpy2.robjects.numpy2ri.activate()
-pandas2ri.activate()
+# Converter for numpy + pandas instead of using deprecated activate()
+R_CONVERTER = ro.default_converter + numpy2ri.converter + pandas2ri.converter
+
base_r_package = importr("base")
+utils = importr("utils")
+
+if not rpackages.isinstalled("drf"):
+ # select a mirror for R packages
+ utils.chooseCRANmirror(graphics=False)
+ utils.install_packages("drf", repos="https://cloud.r-project.org/")
+
drf_r_package = importr("drf")
@@ -41,22 +50,24 @@ class DRF:
"""
def __init__(self, **fit_params):
- """Initialize DRF object."""
+ """Initialize the DRF object with fit parameters."""
self.fit_params = fit_params
def fit(self, X: pd.DataFrame, Y: pd.DataFrame | pd.Series):
"""Fit DRF in order to estimate conditional distribution P(Y|X=x).
Args:
- X (pd.DataFrame): Conditioning set.
- Y (pd.DataFrame): Variable of interest (can be vector-valued).
+ X (pd.DataFrame): Predictor variables.
+ Y (pd.DataFrame | pd.Series): Response variable(s).
"""
self.X_train = X
self.Y_train = Y
- X_r = ro.conversion.py2rpy(X)
- Y_r = ro.conversion.py2rpy(Y)
- self.r_fit_object = drf_r_package.drf(X_r, Y_r, **self.fit_params)
+ # Use localconverter
+ with localconverter(R_CONVERTER):
+ X_r = ro.conversion.py2rpy(X)
+ Y_r = ro.conversion.py2rpy(Y)
+ self.r_fit_object = drf_r_package.drf(X_r, Y_r, **self.fit_params)
def produce_sample(
self,
@@ -72,15 +83,18 @@ def produce_sample(
n (int, optional): Number of n-samples to draw. Defaults to 1.
Returns:
- np.ndarray: New predicted samlpe of Y.
+ np.ndarray: New predicted sample of Y.
"""
- newdata_r = ro.conversion.py2rpy(newdata)
- r_output = drf_r_package.predict_drf(self.r_fit_object, newdata_r)
+ with localconverter(R_CONVERTER):
+ newdata_r = ro.conversion.py2rpy(newdata)
+ r_output = drf_r_package.predict_drf(self.r_fit_object, newdata_r)
- weights = base_r_package.as_matrix(r_output[0])
+ # Convert back to Python
+ weights = ro.conversion.rpy2py(base_r_package.as_matrix(r_output[0]))
+ Y = ro.conversion.rpy2py(base_r_package.as_matrix(r_output[1]))
- Y = pd.DataFrame(base_r_package.as_matrix(r_output[1]))
- Y = Y.apply(pd.Series)
+ if not isinstance(Y, pd.DataFrame):
+ Y = pd.DataFrame(Y)
sample = np.zeros((newdata.shape[0], Y.shape[1], n))
for i in range(newdata.shape[0]):
@@ -98,17 +112,14 @@ def fit_drf(graph: ProductionLineGraph | ProcessCell | DAG, data: pd.DataFrame):
graph (ProductionLineGraph | ProcessCell | DAG): Graph to fit the DRF to.
data (pd.DataFrame): Columns of dataframe need to match name and order of the graph
- Raises:
- ValueError: Raises error if columns don't meet this requirement
+ Raises: ValueError: Raises error if columns don't meet this requirement
- Returns:
- (dict): dict of fitted DRFs.
+ Returns: (dict): dict of fitted DRFs.
"""
tempdata = data.copy()
if set(graph.nodes).issubset(tempdata.columns):
tempdata = tempdata[graph.nodes]
-
else:
raise ValueError("Data columns don't match node names.")
@@ -118,9 +129,8 @@ def fit_drf(graph: ProductionLineGraph | ProcessCell | DAG, data: pd.DataFrame):
if not parents:
drf_dict[node] = gaussian_kde(tempdata[node].to_numpy())
elif parents:
- drf_object = DRF(
- min_node_size=15, num_trees=2000, splitting_rule="FourierMMD"
- ) # default setting as suggested in the paper
+ # default setting as suggested in the paper
+ drf_object = DRF(min_node_size=15, num_trees=2000, splitting_rule="FourierMMD")
X = tempdata[parents]
Y = tempdata[node]
drf_object.fit(X, Y)