From c19fb9814af963d64f38afdb90d5e386afaba852 Mon Sep 17 00:00:00 2001 From: Chuck Garcia Date: Mon, 8 Jul 2024 10:05:16 -0400 Subject: [PATCH 01/11] Fix virify function --- cutqc/graph_contraction.py | 2 +- cutqc/main.py | 2 ++ example.py | 15 ++++++++++++--- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/cutqc/graph_contraction.py b/cutqc/graph_contraction.py index 0c7fb80..cb2aafb 100644 --- a/cutqc/graph_contraction.py +++ b/cutqc/graph_contraction.py @@ -94,4 +94,4 @@ def compute(self): 1 / 2**self.num_cuts, reconstructed_prob ).numpy() self.times["compute"] = perf_counter() - compute_begin - return reconstructed_prob + return reconstructed_prob \ No newline at end of file diff --git a/cutqc/main.py b/cutqc/main.py index b0d8cde..b63577e 100644 --- a/cutqc/main.py +++ b/cutqc/main.py @@ -140,6 +140,8 @@ def verify(self): subcircuits=self.subcircuits, dd_bins=self.approximation_bins, ) + + print (f"Approximate Error: {self.approximation_error}") print("verify took %.3f" % (perf_counter() - verify_begin)) def clean_data(self): diff --git a/example.py b/example.py index fdb8ccb..fc4cef3 100644 --- a/example.py +++ b/example.py @@ -33,15 +33,24 @@ "max_cuts": 10, "num_subcircuits": [2, 3], }, - verbose=True, + verbose=False, ) + + print ("-- Cut -- ") cutqc.cut() if not cutqc.has_solution: raise Exception("The input circuit and constraints have no viable cuts") + print ("-- Done Cutting -- \n") - # add comment + print ("-- Evaluate --") cutqc.evaluate(eval_mode="sv", num_shots_fn=None) + print ("-- Done Evaluating -- \n") + + print ("-- Build --") cutqc.build(mem_limit=32, recursion_depth=1) + print ("-- Done Building -- \n") + + cutqc.verify () print("Cut: %d recursions." 
% (cutqc.num_recursions)) - print(cutqc.approximation_bins) + # print(cutqc.approximation_bins) cutqc.clean_data() From be3c0097c0f990f8962042f662eec1191efaf711 Mon Sep 17 00:00:00 2001 From: Chuck Garcia Date: Mon, 8 Jul 2024 13:55:59 -0400 Subject: [PATCH 02/11] Fix segfault bug in graph_contraction.py --- cutqc/dynamic_definition.py | 5 ++ cutqc/graph_contraction.py | 86 ++++++++++++++++-------------- cutqc_runtime/graph_contraction.py | 5 +- example.py | 4 +- 4 files changed, 55 insertions(+), 45 deletions(-) diff --git a/cutqc/dynamic_definition.py b/cutqc/dynamic_definition.py index ac90425..d3a4c7c 100644 --- a/cutqc/dynamic_definition.py +++ b/cutqc/dynamic_definition.py @@ -326,9 +326,14 @@ def full_verify(full_circuit, complete_path_map, subcircuits, dd_bins): real_probability = quasi_to_real( quasiprobability=reconstructed_prob, mode="nearest" ) + print (f"MSE: {MSE(target=ground_truth, obs=real_probability)}") approximation_error = ( MSE(target=ground_truth, obs=real_probability) * 2**full_circuit.num_qubits / np.linalg.norm(ground_truth) ** 2 ) + + print (f"Reconstructed Error: {reconstructed_prob}") + print (f"Real Error: {real_probability}") + return reconstructed_prob, approximation_error diff --git a/cutqc/graph_contraction.py b/cutqc/graph_contraction.py index cb2aafb..a964c58 100644 --- a/cutqc/graph_contraction.py +++ b/cutqc/graph_contraction.py @@ -18,8 +18,7 @@ def compute_summation_term(*argv): tf.tensordot(summation_term, subcircuit_entry_prob, axes=0), [-1] ) return summation_term - - + class GraphContractor(object): def __init__(self, compute_graph, subcircuit_entry_probs, num_cuts) -> None: super().__init__() @@ -44,54 +43,59 @@ def __init__(self, compute_graph, subcircuit_entry_probs, num_cuts) -> None: ) self.overhead = {"additions": 0, "multiplications": 0} self.reconstructed_prob = self.compute() - + def compute(self): edges = self.compute_graph.get_edges(from_node=None, to_node=None) - make_dataset_begin = perf_counter() - dataset = 
None + partial_compute_begin = perf_counter() + reconstructed_prob = None + counter = 0 for edge_bases in itertools.product(["I", "X", "Y", "Z"], repeat=len(edges)): self.compute_graph.assign_bases_to_edges(edge_bases=edge_bases, edges=edges) - summation_term = [] - cumulative_len = 1 + summation_term = None for subcircuit_idx in self.smart_order: - subcircuit_entry_init_meas = self.compute_graph.get_init_meas( - subcircuit_idx=subcircuit_idx - ) - subcircuit_entry_prob = self.subcircuit_entry_probs[subcircuit_idx][ - subcircuit_entry_init_meas - ] - summation_term.append(subcircuit_entry_prob) - cumulative_len *= len(subcircuit_entry_prob) - self.overhead["multiplications"] += cumulative_len - self.overhead["multiplications"] -= len(summation_term[0]) - dataset_elem = tf.data.Dataset.from_tensors(tuple(summation_term)) - if dataset is None: - dataset = dataset_elem + subcircuit_entry_prob = get_subcircuit_entry_prob (self, subcircuit_idx) + if summation_term is None: + summation_term = subcircuit_entry_prob + else: + summation_term = tf.reshape( + tf.tensordot(summation_term, subcircuit_entry_prob, axes=0), + [-1], + ) + self.overhead["multiplications"] += len(summation_term) + if reconstructed_prob is None: + reconstructed_prob = summation_term else: - dataset = dataset.concatenate(dataset_elem) + reconstructed_prob += summation_term + self.overhead["additions"] += len(summation_term) + counter += 1 + + self.compute_graph.remove_bases_from_edges(edges=self.compute_graph.edges) - dataset = dataset.batch( - batch_size=1, num_parallel_calls=tf.data.AUTOTUNE, deterministic=False - ) - self.times["make_dataset"] = perf_counter() - make_dataset_begin - - compute_begin = perf_counter() - dataset = dataset.map( - compute_summation_term, - num_parallel_calls=tf.data.AUTOTUNE, - deterministic=False, - ) + partial_compute_time = perf_counter() - partial_compute_begin - reconstructed_prob = None - for x in dataset: - if reconstructed_prob is None: - reconstructed_prob = x - 
else: - self.overhead["additions"] += len(reconstructed_prob) - reconstructed_prob += x + scale_begin = perf_counter() reconstructed_prob = tf.math.scalar_mul( 1 / 2**self.num_cuts, reconstructed_prob ).numpy() - self.times["compute"] = perf_counter() - compute_begin - return reconstructed_prob \ No newline at end of file + scale_time = perf_counter() - scale_begin + + self.times["compute"] = ( + partial_compute_time / counter * 4 ** len(edges) + scale_time + ) + self.overhead["additions"] = int( + self.overhead["additions"] / counter * 4 ** len(edges) + ) + self.overhead["multiplications"] = int( + self.overhead["multiplications"] / counter * 4 ** len(edges) + ) + return reconstructed_prob + + +def get_subcircuit_entry_prob (gc : GraphContractor, subcircuit_idx : int): + ''' + Returns The subcircuit Entry Probability for the subcircuit at index + 'SUBCIRCUIT_IDX' of the graph contractor object 'GC'. + ''' + subcircuit_entry_init_meas = gc.compute_graph.get_init_meas(subcircuit_idx) + return gc.subcircuit_entry_probs[subcircuit_idx][subcircuit_entry_init_meas] diff --git a/cutqc_runtime/graph_contraction.py b/cutqc_runtime/graph_contraction.py index f3ce8fc..9779315 100644 --- a/cutqc_runtime/graph_contraction.py +++ b/cutqc_runtime/graph_contraction.py @@ -60,9 +60,8 @@ def compute(self): self.compute_graph.assign_bases_to_edges(edge_bases=edge_bases, edges=edges) summation_term = None for subcircuit_idx in self.smart_order: - subcircuit_entry_prob = self.pseudo_subcircuit_entry_probs[ - subcircuit_idx - ] + subcircuit_entry_prob = self.pseudo_subcircuit_entry_probs[subcircuit_idx] + if summation_term is None: summation_term = subcircuit_entry_prob else: diff --git a/example.py b/example.py index fc4cef3..dc1fda4 100644 --- a/example.py +++ b/example.py @@ -9,6 +9,7 @@ # from cutqc_runtime.main import CutQC # Use this just to benchmark the runtime from cutqc.main import CutQC # Use this for exact computation +# from cutqc_runtime.main import CutQC # Use this for 
exact computation from helper_functions.benchmarks import generate_circ @@ -27,7 +28,8 @@ name="%s_%d" % (circuit_type, circuit_size), circuit=circuit, cutter_constraints={ - "max_subcircuit_width": math.ceil(circuit.num_qubits / 4 * 3), + "max_subcircuit_width": 10, + # "max_subcircuit_width": math.ceil(circuit.num_qubits / 4 * 3), "max_subcircuit_cuts": 10, "subcircuit_size_imbalance": 2, "max_cuts": 10, From a5c265a71b47b1176f6c5599ab94d6df7ea405a9 Mon Sep 17 00:00:00 2001 From: Chuck Garcia Date: Sun, 29 Sep 2024 04:43:45 -0500 Subject: [PATCH 03/11] Add Distributed Reconstruction Implmentation --- README.md | 18 +- __init__.py | 0 cutqc/abstract_graph_contractor.py | 58 ++++++ cutqc/cutter.py | 10 +- cutqc/distributed_graph_contraction.py | 217 ++++++++++++++++++++++ cutqc/dynamic_definition.py | 67 ++++--- cutqc/graph_contraction.py | 110 +++++------ cutqc/main.py | 169 ++++++++++++++--- cutqc/post_process_helper.py | 24 +-- cutqc_runtime/dynamic_definition.py | 6 +- cutqc_runtime/graph_contraction.py | 1 + data_collection_scripts/collect_data.py | 48 +++++ data_collection_scripts/example.py | 66 +++++++ distributed_example/Getting_Started.ipynb | 174 +++++++++++++++++ distributed_example/README.md | 50 +++++ distributed_example/cut_and_eval.py | 56 ++++++ distributed_example/cut_and_eval.slurm | 28 +++ distributed_example/dist_driver.py | 38 ++++ distributed_example/dist_driver.slurm | 31 ++++ example.py | 36 ++-- helper_functions/non_ibmq_functions.py | 2 +- helper_functions/schedule.py | 12 +- qcg/QAOA/hw_efficient_ansatz.py | 8 +- qcg/VQE/uccsd_ansatz.py | 4 +- qcg/utils/testhwea.py | 1 + requirements.txt | 14 ++ 26 files changed, 1068 insertions(+), 180 deletions(-) create mode 100644 __init__.py create mode 100644 cutqc/abstract_graph_contractor.py create mode 100644 cutqc/distributed_graph_contraction.py create mode 100644 data_collection_scripts/collect_data.py create mode 100644 data_collection_scripts/example.py create mode 100644 
distributed_example/Getting_Started.ipynb create mode 100644 distributed_example/README.md create mode 100644 distributed_example/cut_and_eval.py create mode 100644 distributed_example/cut_and_eval.slurm create mode 100644 distributed_example/dist_driver.py create mode 100644 distributed_example/dist_driver.slurm create mode 100644 requirements.txt diff --git a/README.md b/README.md index 1687fa5..d78ca95 100644 --- a/README.md +++ b/README.md @@ -21,18 +21,14 @@ Use this mode if you are just interested in the runtime performance of CutQC. ## Installation 1. Make a Python virtual environment: ``` -conda create -n cutqc python=3.8 -conda deactivate && conda activate cutqc -``` -2. CutQC uses the [Gurobi](https://www.gurobi.com) solver. Install Gurobi and obtain a license. -To install Gurobi for Python, follow the [instructions](https://www.gurobi.com/documentation/9.1/quickstart_linux/cs_python_installation_opt.html). Here we copy paste the up-to-date command as of 05/10/2021 for convenience. -``` -conda config --add channels https://conda.anaconda.org/gurobi -conda install gurobi +conda create cutqc python=3.12 +conda activate cutqc ``` +2. CutQC uses the [Gurobi](https://www.gurobi.com) solver. Obtain and install a Gurobi license. +Follow the [instructions](https://support.gurobi.com/hc/en-us/articles/14799677517585-Getting-Started-with-Gurobi-Optimizer). 3. Install required packages: ``` -pip install numpy qiskit matplotlib pydot scipy tqdm pylatexenc scikit-learn tensorflow networkx +pip install -r requirements.txt ``` ## Example Code @@ -49,8 +45,8 @@ If you use CutQC in your work, we would appreciate it if you cite our paper: Tang, Wei, Teague Tomesh, Martin Suchara, Jeffrey Larson, and Margaret Martonosi. "CutQC: using small quantum computers for large quantum circuit evaluations." In Proceedings of the 26th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, pp. 473-486. 2021. 
-## Questions -Please reach out to Wei Tang (weit@princeton.edu) for any questions and clarifications. +## Contact Us +Please open an issue here. Please reach out to [Wei Tang](https://www.linkedin.com/in/weitang39/). ## TODO - [ ] Qubit reorder function \ No newline at end of file diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/cutqc/abstract_graph_contractor.py b/cutqc/abstract_graph_contractor.py new file mode 100644 index 0000000..1bdea4d --- /dev/null +++ b/cutqc/abstract_graph_contractor.py @@ -0,0 +1,58 @@ +from abc import ABC, abstractmethod +from time import perf_counter +import numpy as np +from cutqc.post_process_helper import ComputeGraph + +class AbstractGraphContractor(ABC): + + @abstractmethod + def _compute(self): + pass + + def reconstruct(self, compute_graph: ComputeGraph, subcircuit_entry_probs: dict, num_cuts: int) -> None: + self.compute_graph = compute_graph + self.subcircuit_entry_probs = subcircuit_entry_probs + self.overhead = {"additions": 0, "multiplications": 0} + self.num_cuts = num_cuts + self._set_smart_order() + + start_time = perf_counter() + res = self._compute() + end_time = perf_counter() - start_time + self.times['compute'] = end_time + + return res + + def _set_smart_order(self) -> None: + """ + Sets the order in which Kronecker products are computed (greedy subcircuit order). 
+ """ + subcircuit_entry_lengths = {} + for subcircuit_idx in self.subcircuit_entry_probs: + first_entry_init_meas = list(self.subcircuit_entry_probs[subcircuit_idx].keys())[0] + length = len(self.subcircuit_entry_probs[subcircuit_idx][first_entry_init_meas]) + subcircuit_entry_lengths[subcircuit_idx] = length + + # Sort according to subcircuit lengths (greedy-subcircuit-order) + self.smart_order = sorted( + subcircuit_entry_lengths.keys(), + key=lambda subcircuit_idx: subcircuit_entry_lengths[subcircuit_idx], + ) + + self.subcircuit_entry_lengths = [subcircuit_entry_lengths[i] for i in self.smart_order] + print(f"subcircuit_entry_length: {self.subcircuit_entry_lengths}", flush=True) + self.result_size = np.prod(self.subcircuit_entry_lengths) + + + def _get_subcircuit_entry_prob(self, subcircuit_idx: int): + """ + Returns The subcircuit Entry Probability for the subcircuit at index + 'SUBCIRCUIT_IDX' + """ + + subcircuit_entry_init_meas = self.compute_graph.get_init_meas(subcircuit_idx) + return self.subcircuit_entry_probs[subcircuit_idx][subcircuit_entry_init_meas] + + @abstractmethod + def _get_paulibase_probability(self, edge_bases: tuple, edges: list): + pass \ No newline at end of file diff --git a/cutqc/cutter.py b/cutqc/cutter.py index 4ea63fb..20ff198 100644 --- a/cutqc/cutter.py +++ b/cutqc/cutter.py @@ -320,9 +320,9 @@ def read_circ(circuit): for vertex in dag.topological_op_nodes(): if len(vertex.qargs) != 2: raise Exception("vertex does not have 2 qargs!") - + arg0, arg1 = vertex.qargs - + vertex_name = "%s[%d]%d %s[%d]%d" % ( arg0._register.name, arg0._index, @@ -383,9 +383,9 @@ def cuts_parser(cuts, circ): wire = None for qubit in circ.qubits: - if qubit._register.name == qubit_cut[0].split("[")[0] and qubit._index == int( - qubit_cut[0].split("[")[1].split("]")[0] - ): + if qubit._register.name == qubit_cut[0].split("[")[ + 0 + ] and qubit._index == int(qubit_cut[0].split("[")[1].split("]")[0]): wire = qubit tmp = 0 all_Q_gate_idx = None diff --git 
a/cutqc/distributed_graph_contraction.py b/cutqc/distributed_graph_contraction.py new file mode 100644 index 0000000..eacb701 --- /dev/null +++ b/cutqc/distributed_graph_contraction.py @@ -0,0 +1,217 @@ +""" +File: distributed_graph_contraction.py +Original Author: Wei Tang (tangwei13579@gmail.com) +Current Version Author: Charles "Chuck" Garcia (chuckgarcian@utexas.edu) +Description: Distributed implementation of Wei Tang's original TensorFlow CutQC implementation. +""" + +import itertools +from time import perf_counter +from typing import List, Optional +import numpy as np +import torch +import torch.distributed as dist +from cutqc.abstract_graph_contractor import AbstractGraphContractor +from cutqc.post_process_helper import ComputeGraph + + +__host_machine__ = 0 + + +class DistributedGraphContractor(AbstractGraphContractor): + """ + Distributed Graph Contractor Implementation + + Args: + local_rank (int): Node identifier value + compute_backend (str): Device used for compute (Default is GPU) + + """ + def __init__(self, local_rank: Optional[int] = None, compute_backend: str = 'gpu') -> None: + self.local_rank = local_rank + + # Set up compute devices based on backend + self.mp_backend = torch.device(f"cuda:{local_rank}" if dist.get_backend() == 'nccl' else "cpu") # Deviced used MP + self.compute_device = torch.device(f"cuda:{local_rank}") if compute_backend == 'gpu' else self.mp_backend + self.is_gpu = compute_backend == 'gpu' + + print ("Worker {}, compute_device: {}".format (dist.get_rank(), self.compute_device), flush=True) + + if dist.get_rank() != __host_machine__: + self._initiate_worker_loop() + + self.times = {'compute': 0} + self.compute_graph = None + self.subcircuit_entry_probs = None + self.reconstructed_prob = None + + + def terminate_distributed_process(self): + """ + Sends signal to workers to finish their execution. 
+ """ + termination_signal = torch.tensor([-1], dtype=torch.int64).to(self.mp_backend) + for rank in range(1, dist.get_world_size()): + dist.send(termination_signal, dst=rank) + + print(f"DESTROYING NOW! {self.times['compute']}", flush=True) + dist.destroy_process_group() + + def _get_paulibase_probability (self, edge_bases: tuple, edges: list): + """ + Returns probability contribution for the basis 'edge_bases' in the circuit + cutting decomposition. + """ + with torch.no_grad(): + self.compute_graph.assign_bases_to_edges(edge_bases=edge_bases, edges=edges) + + # Create list of kronecker product terms + flat_size = np.sum(self.subcircuit_entry_lengths) + flat = torch.empty(flat_size) + idx = 0 + + # Store all probability tensors into single flattened tensor + for size, subcircuit_idx in zip(self.subcircuit_entry_lengths, self.smart_order): + subcircuit_entry_prob = self._get_subcircuit_entry_prob(subcircuit_idx) + flat[idx:idx+size] = torch.tensor(subcircuit_entry_prob, dtype=torch.float32) + idx += size + + return flat + + def _send_distributed(self, dataset: List[torch.Tensor], num_batches: int) -> torch.Tensor: + """ + Decomposes `dataset` list into 'num_batches' number of batches and distributes + to worker processes. 
+ """ + torch.set_default_device(self.mp_backend) + + with torch.no_grad(): + print ("LEN(DATASET): {}".format (len(dataset)), flush=True) + print ("NUMBER BATCHES: {}".format (num_batches), flush=True) + if len(dataset) < num_batches: + raise ValueError("Error 2000: Invalid number of requested batches -- Too many nodes allocated, for dataset length {} and {} number of batches".format (len(dataset), num_batches)) + + batches = torch.stack(dataset).tensor_split(num_batches) + tensor_sizes = torch.tensor(self.subcircuit_entry_lengths, dtype=torch.int64) + tensor_sizes_shape = torch.tensor(tensor_sizes.shape, dtype=torch.int64) + + if dist.get_backend() == 'gloo': + op_list = [] + # List of sending objects + for dst, batch in enumerate(batches, start=1): + op_list.extend([ + dist.P2POp(dist.isend, tensor_sizes_shape, dst), + dist.P2POp(dist.isend, tensor_sizes, dst), + dist.P2POp(dist.isend, torch.tensor(batch.shape, dtype=torch.int64), dst), + dist.P2POp(dist.isend, batch, dst), + ]) + handles = dist.batch_isend_irecv(op_list) + else: + # NCCL backend + for dst_rank, batch in enumerate(batches, start=1): + # Non-Blocking send on NCCL + dist.isend(tensor_sizes_shape, dst=dst_rank) + dist.isend(tensor_sizes, dst=dst_rank) + dist.isend(torch.tensor(batch.shape), dst=dst_rank) + dist.isend(batch.to(self.compute_device), dst=dst_rank) + + # Receive Results + output_buff = torch.zeros(self.result_size, dtype=torch.float32) + dist.reduce(output_buff, dst=0, op=dist.ReduceOp.SUM) + + return torch.mul(output_buff, (1/2**self.num_cuts)) + + def _compute(self) -> np.ndarray: + """ + Performs distributed graph contraction. Returns the reconstructed probability. 
+ """ + edges = self.compute_graph.get_edges(from_node=None, to_node=None) + summation_terms_sequence = [] + + # Assemble sequence of uncomputed kronecker products + for edge_bases in itertools.product(["I", "X", "Y", "Z"], repeat=len(edges)): + summation_terms = self._get_paulibase_probability(edge_bases, edges) + summation_terms_sequence.append(summation_terms) + + self.compute_graph.remove_bases_from_edges(edges=self.compute_graph.edges) + + # Distribute and Execute reconstruction on nodes + num_batches = dist.get_world_size() - 1 # No batch for host + reconstructed_prob = self._send_distributed(summation_terms_sequence, num_batches) + + return reconstructed_prob.cpu().numpy() + + + def _receive_from_host(self): + """ + Receives tensors sent by host. Returns batch and unpadded sizes. + """ + torch.set_default_device(self.mp_backend) + torch.cuda.device(self.compute_device) + if (self.is_gpu): torch.cuda.device(self.compute_device) + + with torch.no_grad(): + tensor_sizes_shape = torch.empty([1], dtype=torch.int64) + dist.recv(tensor=tensor_sizes_shape, src=0) + + # Check for termination signal + if tensor_sizes_shape.item() == -1: + print(f"WORKER {dist.get_rank()} DYING", flush=True) + dist.destroy_process_group() + exit() + + # Used to unflatten + tensor_sizes = torch.empty(tensor_sizes_shape, dtype=torch.int64) + dist.recv(tensor=tensor_sizes, src=0) + + # Get shape of the batch we are receiving + batch_shape = torch.empty([2], dtype=torch.int64) + dist.recv(tensor=batch_shape, src=0) + + # Create an empty batch tensor and receive its data + batch = torch.empty(tuple(batch_shape), dtype=torch.float32) + dist.recv(tensor=batch, src=0) + + return batch_shape[0], batch, tensor_sizes + + def _initiate_worker_loop(self): + """ + Primary worker loop. + + Each worker receives a portion of the workload from the host/master node. + Once done with computation, all nodes perform a collective reduction + operation back to the host. 
Synchronization among nodes is provided via + barriers and blocked message passing. + """ + from pprint import pprint + + while True: + torch.cuda.device(self.compute_device) + num_batches, batch, tensor_sizes = self._receive_from_host() + + # Ensure Enough Size + gpu_free = torch.cuda.mem_get_info()[0] + batch_mem_size = batch.element_size() * torch.prod(tensor_sizes) * num_batches + assert (batch_mem_size < gpu_free), ValueError ("Error 2006: Batch of size {}, to large for GPU device of size {}".format (batch_mem_size, gpu_free)) + + # Execute kronecker products in parallel (vectorization) + torch.cuda.memory._record_memory_history() + lambda_fn = lambda x: compute_kronecker_product(x, tensor_sizes) + vec_fn = torch.func.vmap(lambda_fn) + res = vec_fn(batch) + torch.cuda.memory._dump_snapshot("compute_snap.pickle") + + del (batch) + res = res.sum(dim=0) + + # Send Back to host + dist.reduce(res.to(self.mp_backend), dst=__host_machine__, op=dist.ReduceOp.SUM) + + +from functools import reduce +def compute_kronecker_product(flattened: torch.Tensor, sizes: torch.Tensor) -> torch.Tensor: + """ + Computes sequence of Kronecker products, where operands are tensors in 'components'. 
+ """ + tensors = torch.split(flattened, tuple(sizes)) + return reduce(torch.kron, tensors) diff --git a/cutqc/dynamic_definition.py b/cutqc/dynamic_definition.py index d3a4c7c..2f19cde 100644 --- a/cutqc/dynamic_definition.py +++ b/cutqc/dynamic_definition.py @@ -1,20 +1,23 @@ import itertools, copy, pickle, subprocess from time import perf_counter import numpy as np +import torch from helper_functions.non_ibmq_functions import evaluate_circ from helper_functions.conversions import quasi_to_real from helper_functions.metrics import MSE from cutqc.evaluator import get_num_workers -from cutqc.graph_contraction import GraphContractor +# from cutqc.graph_contraction import GraphContractor +from cutqc.distributed_graph_contraction import DistributedGraphContractor from cutqc.helper_fun import add_times from cutqc.post_process_helper import get_reconstruction_qubit_order +import torch.distributed as dist class DynamicDefinition(object): def __init__( - self, compute_graph, data_folder, num_cuts, mem_limit, recursion_depth + self, compute_graph, data_folder, num_cuts, mem_limit, recursion_depth, parallel_reconstruction=False, local_rank=None, compute_backend='gpu' ) -> None: super().__init__() self.compute_graph = compute_graph @@ -23,8 +26,11 @@ def __init__( self.mem_limit = mem_limit self.recursion_depth = recursion_depth self.dd_bins = {} - self.overhead = {"additions": 0, "multiplications": 0} + self.local_rank = local_rank + self.graph_contractor = DistributedGraphContractor (local_rank=self.local_rank, compute_backend=compute_backend) if (parallel_reconstruction) else GraphContractor() + self.parallel_reconstruction = parallel_reconstruction + self.overhead = {"additions": 0, "multiplications": 0} self.times = {"get_dd_schedule": 0, "merge_states_into_bins": 0, "sort": 0} def build(self): @@ -42,6 +48,7 @@ def build(self): ) largest_bins = [] # [{recursion_layer, bin_id}] recursion_layer = 0 + while recursion_layer < self.recursion_depth: # print('-'*10,'Recursion 
Layer %d'%(recursion_layer),'-'*10) """Get qubit states""" @@ -56,25 +63,25 @@ def build(self): recursion_layer=bin_to_expand["recursion_layer"], bin_id=bin_to_expand["bin_id"], ) - pickle.dump( + pickle.dump ( dd_schedule, open("%s/dd_schedule.pckl" % self.data_folder, "wb") ) self.times["get_dd_schedule"] += perf_counter() - get_dd_schedule_begin - merged_subcircuit_entry_probs = self.merge_states_into_bins() """ Build from the merged subcircuit entries """ - graph_contractor = GraphContractor( + reconstructed_prob = self.graph_contractor.reconstruct ( compute_graph=self.compute_graph, subcircuit_entry_probs=merged_subcircuit_entry_probs, - num_cuts=self.num_cuts, - ) - reconstructed_prob = graph_contractor.reconstructed_prob - smart_order = graph_contractor.smart_order - recursion_overhead = graph_contractor.overhead + num_cuts=self.num_cuts + ) + + + smart_order = self.graph_contractor.smart_order + recursion_overhead = self.graph_contractor.overhead self.overhead["additions"] += recursion_overhead["additions"] self.overhead["multiplications"] += recursion_overhead["multiplications"] - self.times = add_times(times_a=self.times, times_b=graph_contractor.times) + self.times = add_times(times_a=self.times, times_b=self.graph_contractor.times) self.dd_bins[recursion_layer] = dd_schedule self.dd_bins[recursion_layer]["smart_order"] = smart_order @@ -107,6 +114,12 @@ def build(self): )[: self.recursion_depth] self.times["sort"] += perf_counter() - sort_begin recursion_layer += 1 + + # Terminate the parallized process + print("Compute Time: {}".format (self.graph_contractor.times["compute"])) + # if (self.parallel_reconstruction): + # self.graph_contractor.terminate_distributed_process() + def initialize_dynamic_definition_schedule(self): schedule = {} @@ -155,9 +168,9 @@ def next_dynamic_definition_schedule(self, recursion_layer, bin_id): next_dd_schedule["subcircuit_state"][subcircuit_idx] ): if qubit_state == "active": - 
next_dd_schedule["subcircuit_state"][subcircuit_idx][ - qubit_ctr - ] = int(binary_bin_idx[binary_state_idx_ptr]) + next_dd_schedule["subcircuit_state"][subcircuit_idx][qubit_ctr] = ( + int(binary_bin_idx[binary_state_idx_ptr]) + ) binary_state_idx_ptr += 1 next_dd_schedule["upper_bin"] = (recursion_layer, bin_id) @@ -237,9 +250,9 @@ def merge_states_into_bins(self): ) for subcircuit_idx in rank_merged_subcircuit_entry_probs: if subcircuit_idx not in merged_subcircuit_entry_probs: - merged_subcircuit_entry_probs[ - subcircuit_idx - ] = rank_merged_subcircuit_entry_probs[subcircuit_idx] + merged_subcircuit_entry_probs[subcircuit_idx] = ( + rank_merged_subcircuit_entry_probs[subcircuit_idx] + ) else: merged_subcircuit_entry_probs[subcircuit_idx].update( rank_merged_subcircuit_entry_probs[subcircuit_idx] @@ -302,9 +315,9 @@ def read_dd_bins(subcircuit_out_qubits, dd_bins): ["0", "1"], repeat=num_merged ): for merged_qubit_ctr in range(num_merged): - binary_full_state[ - merged_qubit_indices[merged_qubit_ctr] - ] = binary_merged_state[merged_qubit_ctr] + binary_full_state[merged_qubit_indices[merged_qubit_ctr]] = ( + binary_merged_state[merged_qubit_ctr] + ) full_state = "".join(binary_full_state)[::-1] full_state_idx = int(full_state, 2) reconstructed_prob[full_state_idx] = average_state_prob @@ -326,14 +339,20 @@ def full_verify(full_circuit, complete_path_map, subcircuits, dd_bins): real_probability = quasi_to_real( quasiprobability=reconstructed_prob, mode="nearest" ) - print (f"MSE: {MSE(target=ground_truth, obs=real_probability)}") + # print (f"MSE: {MSE(target=ground_truth, obs=real_probability)}") + # print ("real_probability: {}".format (real_probability)) + # print ("real_probability.shape: {}".format (real_probability.shape)) + # print ("ground_truth: {}".format (ground_truth)) + # print ("ground_truth.shape: {}".format (ground_truth.shape)) + approximation_error = ( MSE(target=ground_truth, obs=real_probability) * 2**full_circuit.num_qubits / 
np.linalg.norm(ground_truth) ** 2 ) - print (f"Reconstructed Error: {reconstructed_prob}") - print (f"Real Error: {real_probability}") + + # print (f"Reconstructed Error: {reconstructed_prob}") + # print (f"Real Error: {real_probability}") return reconstructed_prob, approximation_error diff --git a/cutqc/graph_contraction.py b/cutqc/graph_contraction.py index a964c58..f04b1c8 100644 --- a/cutqc/graph_contraction.py +++ b/cutqc/graph_contraction.py @@ -2,75 +2,66 @@ from time import perf_counter import numpy as np import logging, os +from cutqc.post_process_helper import ComputeGraph +from cutqc.abstract_graph_contractor import AbstractGraphContractor +import tensorflow as tf + logging.disable(logging.WARNING) os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" -import tensorflow as tf -def compute_summation_term(*argv): - summation_term = None - for subcircuit_entry_prob in argv: - if summation_term is None: - summation_term = subcircuit_entry_prob - else: - summation_term = tf.reshape( - tf.tensordot(summation_term, subcircuit_entry_prob, axes=0), [-1] - ) - return summation_term - -class GraphContractor(object): - def __init__(self, compute_graph, subcircuit_entry_probs, num_cuts) -> None: +class GraphContractor(AbstractGraphContractor): + def __init__(self) -> None: super().__init__() - self.times = {} - self.compute_graph = compute_graph - self.subcircuit_entry_probs = subcircuit_entry_probs - self.num_cuts = num_cuts - self.subcircuit_entry_lengths = {} - for subcircuit_idx in subcircuit_entry_probs: - first_entry_init_meas = list(subcircuit_entry_probs[subcircuit_idx].keys())[ - 0 - ] - length = len(subcircuit_entry_probs[subcircuit_idx][first_entry_init_meas]) - self.subcircuit_entry_lengths[subcircuit_idx] = length - self.num_qubits = 0 - for subcircuit_idx in compute_graph.nodes: - self.num_qubits += compute_graph.nodes[subcircuit_idx]["effective"] - - self.smart_order = sorted( - self.subcircuit_entry_lengths.keys(), - key=lambda subcircuit_idx: 
self.subcircuit_entry_lengths[subcircuit_idx], - ) - self.overhead = {"additions": 0, "multiplications": 0} - self.reconstructed_prob = self.compute() + self.times = {} + self.reconstructed_prob = None - def compute(self): + # Used to compute + self.compute_graph = None + self.subcircuit_entry_probs = None + self.num_cuts = None + + + def _get_paulibase_probability(self, edge_bases: tuple, edges: list): + """ + Returns the probability contribution for the basis 'EDGE_BASES' in the circuit + cutting decomposition. + """ + + summation_term = None + self.compute_graph.assign_bases_to_edges(edge_bases=edge_bases, edges=edges) + + for subcircuit_idx in self.smart_order: + subcircuit_entry_prob = self._get_subcircuit_entry_prob(subcircuit_idx) + if summation_term is None: + summation_term = subcircuit_entry_prob + else: + summation_term = tf.reshape( + tf.tensordot(summation_term, subcircuit_entry_prob, axes=0), + [-1], + ) + self.overhead["multiplications"] += len(summation_term) + + return summation_term + + def _compute(self): + ''' + Internal function that actualy does the reconstruct + ''' edges = self.compute_graph.get_edges(from_node=None, to_node=None) partial_compute_begin = perf_counter() - reconstructed_prob = None + reconstructed_prob = tf.zeros_like(self._get_paulibase_probability(["I"] * len(edges), edges)) counter = 0 + + # Compute Kronecker sums over the different basis for edge_bases in itertools.product(["I", "X", "Y", "Z"], repeat=len(edges)): - self.compute_graph.assign_bases_to_edges(edge_bases=edge_bases, edges=edges) - summation_term = None - for subcircuit_idx in self.smart_order: - subcircuit_entry_prob = get_subcircuit_entry_prob (self, subcircuit_idx) - if summation_term is None: - summation_term = subcircuit_entry_prob - else: - summation_term = tf.reshape( - tf.tensordot(summation_term, subcircuit_entry_prob, axes=0), - [-1], - ) - self.overhead["multiplications"] += len(summation_term) - if reconstructed_prob is None: - reconstructed_prob = 
summation_term - else: - reconstructed_prob += summation_term - self.overhead["additions"] += len(summation_term) + summation_term = self._get_paulibase_probability(edge_bases, edges) + reconstructed_prob = tf.add(reconstructed_prob, summation_term) + self.overhead["additions"] += len(summation_term) counter += 1 - self.compute_graph.remove_bases_from_edges(edges=self.compute_graph.edges) partial_compute_time = perf_counter() - partial_compute_begin @@ -90,12 +81,3 @@ def compute(self): self.overhead["multiplications"] / counter * 4 ** len(edges) ) return reconstructed_prob - - -def get_subcircuit_entry_prob (gc : GraphContractor, subcircuit_idx : int): - ''' - Returns The subcircuit Entry Probability for the subcircuit at index - 'SUBCIRCUIT_IDX' of the graph contractor object 'GC'. - ''' - subcircuit_entry_init_meas = gc.compute_graph.get_init_meas(subcircuit_idx) - return gc.subcircuit_entry_probs[subcircuit_idx][subcircuit_entry_init_meas] diff --git a/cutqc/main.py b/cutqc/main.py index b63577e..1d657ad 100644 --- a/cutqc/main.py +++ b/cutqc/main.py @@ -1,4 +1,5 @@ import subprocess, os +import pickle from time import perf_counter from cutqc.helper_fun import check_valid, add_times @@ -10,34 +11,132 @@ ) from cutqc.dynamic_definition import DynamicDefinition, full_verify +from datetime import timedelta +import torch.distributed as dist + +__host_machine__ = 0 class CutQC: """ The main module for CutQC cut --> evaluate results --> verify (optional) """ - - def __init__(self, name, circuit, cutter_constraints, verbose): + + def __init__(self, + name=None, + circuit=None, + cutter_constraints=None, + verbose=False, + parallel_reconstruction=False, + reconstruct_only=False, + load_data=None, + compute_backend='gpu', + comm_backend = 'nccl', + gpus_per_node = None, + world_rank = None, + world_size = None, + ): """ Args: - name : name of the input quantum circuit - circuit : the input quantum circuit - cutter_constraints : cutting constraints to satisfy + name: 
name of the input quantum circuit + circuit: the input quantum circuit + cutter_constraints: cutting constraints to satisfy verbose: setting verbose to True to turn on logging information. - Useful to visualize what happens, - but may produce very long outputs for complicated circuits. + Useful to visualize what happens, + but may produce very long outputs for complicated circuits. + + --- Distributed Reconstruction Related Arguments --- + + parallel_reconstruction (Optional): When set to 'True', reconstruction + is executed distributed. Default FALSE + + reconstruct_only (Optional): When enabled, cutqc performs only reconstructions. + Distrubuted reconstruction requires that this be 'TRUE'. + Default FALSE + + load_data (Optional): String of file name to load subcircuit outputs + from a previous CutQC instance. Default NONE. + + compute_backend (Optional): Compute processing device used if + parallel_reconstruction is set to 'TRUE'. + 'cpu' for cpu and 'gpu' for gpu. Default GPU + + comm_backend (Optional): message passing backend internally used by pytorch for + sending data between nodes. Default NCCL. + gpus_per_node (Optional): Number of GPUs per node in the case they are + used as the compute backend. + world_rank (Optional): Global Identifier. Default NONE. 
+ world_size (Optional): Total number of nodes + """ - check_valid(circuit=circuit) self.name = name self.circuit = circuit - self.cutter_constraints = cutter_constraints + self.cutter_constraints = cutter_constraints self.verbose = verbose self.times = {} + + self.compute_graph = None + self.tmp_data_folder = None + self.num_cuts = None + self.complete_path_map = None + self.subcircuits = None + + if reconstruct_only: + # Multi node - Pytorch Version + if parallel_reconstruction: + self.compute_backend = compute_backend + self._setup_for_dist_reconstruction (load_data, comm_backend, world_rank, world_size, gpus_per_node) + + # Single node - Tensorflow Version + else: + self._load_data(load_data) + + elif not reconstruct_only: + # Cutting, evaluation and reconstruction are occuring all at once. + self._initialize_for_serial_reconstruction(circuit) + + def _setup_for_dist_reconstruction (self, load_data, comm_backend: str, world_rank: int, world_size: int, gpus_per_node: int): + """ + Sets up to call the distributed kernel. Worker nodes + + Args: + comm_backend: message passing backend internally used by pytorch for + sending data between nodes + world_rank: Global Identifier + world_size: Total number of nodes + timeout: Max amount of time pytorch will let any one node wait on + a message before killing it. 
+ """ + + self.local_rank = world_rank - gpus_per_node * (world_rank // gpus_per_node) + self.parallel_reconstruction = True + timelimit = timedelta(hours=1) # Bounded wait time to prevent deadlock + + dist.init_process_group(comm_backend, rank=world_rank, world_size=world_size, timeout=timelimit) + + # Only host should load subcircuit data + if dist.get_rank() == __host_machine__: + # Todo: I think ideally the workers should on start load their own data + self._load_data(load_data) + + def _load_data(self, load_data): + with open(load_data, 'rb') as inp: + loaded_cutqc = pickle.load(inp) + self.__dict__.update(vars(loaded_cutqc)) + + def _initialize_for_serial_reconstruction(self, circuit): + check_valid(circuit=circuit) self.tmp_data_folder = "cutqc/tmp_data" + self._setup_tmp_folder() + + def _setup_tmp_folder(self): if os.path.exists(self.tmp_data_folder): subprocess.run(["rm", "-r", self.tmp_data_folder]) os.makedirs(self.tmp_data_folder) + + def destroy_distributed (self): + self.dd.graph_contractor.terminate_distributed_process() def cut(self): """ @@ -106,32 +205,45 @@ def build(self, mem_limit, recursion_depth): print("--> Build %s" % (self.name)) # Keep these times and discard the rest - self.times = { - "cutter": self.times["cutter"], - "evaluate": self.times["evaluate"], - } - - build_begin = perf_counter() - dd = DynamicDefinition( + # self.times = { + # "cutter": self.times["cutter"], + # "evaluate": self.times["evaluate"], + # } + + + print ("self.parallel_reconstruction: {}".format (self.parallel_reconstruction)) + self.dd = DynamicDefinition( compute_graph=self.compute_graph, data_folder=self.tmp_data_folder, num_cuts=self.num_cuts, mem_limit=mem_limit, recursion_depth=recursion_depth, + parallel_reconstruction=self.parallel_reconstruction, + local_rank=self.local_rank, + compute_backend=self.compute_backend ) - dd.build() + self.dd.build () - self.times = add_times(times_a=self.times, times_b=dd.times) - self.approximation_bins = dd.dd_bins + 
self.times = add_times(times_a=self.times, times_b=self.dd.times) + self.approximation_bins = self.dd.dd_bins self.num_recursions = len(self.approximation_bins) - self.overhead = dd.overhead - self.times["build"] = perf_counter() - build_begin - self.times["build"] += self.times["cutter"] - self.times["build"] -= self.times["merge_states_into_bins"] + self.overhead = self.dd.overhead + # self.times["build"] = perf_counter() - build_begin + # self.times["build"] += self.times["cutter"] + # self.times["build"] -= self.times["merge_states_into_bins"] if self.verbose: print("Overhead = {}".format(self.overhead)) + return self.dd.graph_contractor.times["compute"] + + def save_eval_data (self, foldername: str) -> None: + ''' + Saves subcircuit evaluation data which can be used in a future + instance of `cutqc` for reconstruction. + ''' + subprocess.run(["cp", "-r", self.tmp_data_folder, foldername]) + def verify(self): verify_begin = perf_counter() reconstructed_prob, self.approximation_error = full_verify( @@ -143,7 +255,15 @@ def verify(self): print (f"Approximate Error: {self.approximation_error}") print("verify took %.3f" % (perf_counter() - verify_begin)) + return self.approximation_error + def save_cutqc_obj (self, filename : str) -> None: + ''' + Saves CutQC instance as the pickle file 'FILENAME' + ''' + with open (filename, 'wb') as outp: + pickle.dump(self, outp, pickle.HIGHEST_PROTOCOL) + def clean_data(self): subprocess.run(["rm", "-r", self.tmp_data_folder]) @@ -176,7 +296,8 @@ def _run_subcircuits(self): if os.path.exists(self.tmp_data_folder): subprocess.run(["rm", "-r", self.tmp_data_folder]) os.makedirs(self.tmp_data_folder) - run_subcircuit_instances( + + run_subcircuit_instances ( subcircuits=self.subcircuits, subcircuit_instances=self.subcircuit_instances, eval_mode=self.eval_mode, diff --git a/cutqc/post_process_helper.py b/cutqc/post_process_helper.py index 004b886..b4546a4 100644 --- a/cutqc/post_process_helper.py +++ b/cutqc/post_process_helper.py 
@@ -62,18 +62,18 @@ def get_init_meas(self, subcircuit_idx): for edge in edges_to_node: _, v_for_edge, edge_attributes = edge assert v_for_edge == subcircuit_idx - entry_init[ - bare_subcircuit.qubits.index(edge_attributes["rho_qubit"]) - ] = edge_attributes["basis"] + entry_init[bare_subcircuit.qubits.index(edge_attributes["rho_qubit"])] = ( + edge_attributes["basis"] + ) entry_meas = ["comp"] * bare_subcircuit.num_qubits edges_from_node = self.get_edges(from_node=subcircuit_idx, to_node=None) for edge in edges_from_node: u_for_edge, _, edge_attributes = edge assert u_for_edge == subcircuit_idx - entry_meas[ - bare_subcircuit.qubits.index(edge_attributes["O_qubit"]) - ] = edge_attributes["basis"] + entry_meas[bare_subcircuit.qubits.index(edge_attributes["O_qubit"])] = ( + edge_attributes["basis"] + ) return (tuple(entry_init), tuple(entry_meas)) def get_contraction_edges( @@ -293,14 +293,14 @@ def generate_subcircuit_entries(compute_graph): ) = edge if subcircuit_idx == upstream_subcircuit_idx: O_qubit = edge_attributes["O_qubit"] - subcircuit_entry_meas[ - bare_subcircuit.qubits.index(O_qubit) - ] = edge_basis + subcircuit_entry_meas[bare_subcircuit.qubits.index(O_qubit)] = ( + edge_basis + ) elif subcircuit_idx == downstream_subcircuit_idx: rho_qubit = edge_attributes["rho_qubit"] - subcircuit_entry_init[ - bare_subcircuit.qubits.index(rho_qubit) - ] = edge_basis + subcircuit_entry_init[bare_subcircuit.qubits.index(rho_qubit)] = ( + edge_basis + ) else: raise IndexError( "Generating entries for a subcircuit. 
subcircuit_idx should be either upstream or downstream" diff --git a/cutqc_runtime/dynamic_definition.py b/cutqc_runtime/dynamic_definition.py index 336c094..58f56ea 100644 --- a/cutqc_runtime/dynamic_definition.py +++ b/cutqc_runtime/dynamic_definition.py @@ -144,9 +144,9 @@ def next_dynamic_definition_schedule(self, recursion_layer, bin_id): next_dd_schedule["subcircuit_state"][subcircuit_idx] ): if qubit_state == "active": - next_dd_schedule["subcircuit_state"][subcircuit_idx][ - qubit_ctr - ] = int(binary_bin_idx[binary_state_idx_ptr]) + next_dd_schedule["subcircuit_state"][subcircuit_idx][qubit_ctr] = ( + int(binary_bin_idx[binary_state_idx_ptr]) + ) binary_state_idx_ptr += 1 next_dd_schedule["upper_bin"] = (recursion_layer, bin_id) diff --git a/cutqc_runtime/graph_contraction.py b/cutqc_runtime/graph_contraction.py index 9779315..66730b0 100644 --- a/cutqc_runtime/graph_contraction.py +++ b/cutqc_runtime/graph_contraction.py @@ -70,6 +70,7 @@ def compute(self): [-1], ) self.overhead["multiplications"] += len(summation_term) + if reconstructed_prob is None: reconstructed_prob = summation_term else: diff --git a/data_collection_scripts/collect_data.py b/data_collection_scripts/collect_data.py new file mode 100644 index 0000000..443ccb8 --- /dev/null +++ b/data_collection_scripts/collect_data.py @@ -0,0 +1,48 @@ +# Author: Ellie Vogel + +import os +import subprocess + +# Define the sets of variables +variable_sets = [ + {'circuit_size': 22, 'max_subcircuit_width': 20, 'circuit_type': 'adder'}, + {'circuit_size': 24, 'max_subcircuit_width': 20, 'circuit_type': 'adder'}, + {'circuit_size': 26, 'max_subcircuit_width': 20, 'circuit_type': 'adder'}, + {'circuit_size': 28, 'max_subcircuit_width': 20, 'circuit_type': 'adder'}, + {'circuit_size': 30, 'max_subcircuit_width': 20, 'circuit_type': 'adder'} +] + +# Read the SLURM script template +with open('run.slurm', 'r') as file: + slurm_template = file.read() + +# Directory to store generated SLURM scripts 
+slurm_scripts_dir = 'generated_slurm_scripts' +os.makedirs(slurm_scripts_dir, exist_ok=True) + +previous_job_id = None + +# Generate and submit SLURM scripts for each set of variables +for i, variables in enumerate(variable_sets): + slurm_script_content = slurm_template.format(**variables) + slurm_script_path = os.path.join(slurm_scripts_dir, f'slurm_script_{i}.slurm') + + # Write the generated SLURM script to a file + with open(slurm_script_path, 'w') as slurm_script_file: + slurm_script_file.write(slurm_script_content) + + # Construct the sbatch command + sbatch_command = ['sbatch'] + if previous_job_id: + sbatch_command.extend(['--dependency=afterok:' + previous_job_id]) + sbatch_command.append(slurm_script_path) + + # Submit the SLURM script using sbatch and capture the job ID + result = subprocess.run(sbatch_command, capture_output=True, text=True) + output = result.stdout.strip() + + # Extract job ID from sbatch output + job_id = output.split()[-1] + previous_job_id = job_id + +print('All SLURM scripts have been submitted.') \ No newline at end of file diff --git a/data_collection_scripts/example.py b/data_collection_scripts/example.py new file mode 100644 index 0000000..2330feb --- /dev/null +++ b/data_collection_scripts/example.py @@ -0,0 +1,66 @@ +import os +import math +import logging +import argparse + +logging.disable(logging.WARNING) +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1" +# Comment this line if using GPU +os.environ["CUDA_VISIBLE_DEVICES"] = "-1" + +# from cutqc_runtime.main import CutQC # Use this just to benchmark the runtime +from cutqc.main import CutQC # Use this for exact computation +# from cutqc_runtime.main import CutQC # Use this for exact computation +from helper_functions.benchmarks import generate_circ + +def main(circuit_size, max_subcircuit_width, circuit_type): + circuit_type = circuit_type + circuit = generate_circ( + num_qubits=circuit_size, + depth=1, + circuit_type=circuit_type, + reg_name="q", + connected_only=True, + 
seed=None, + ) + cutqc = CutQC( + name="%s_%d_%d" % (circuit_type, max_subcircuit_width, circuit_size), + circuit=circuit, + cutter_constraints={ + "max_subcircuit_width": max_subcircuit_width, + # "max_subcircuit_width": math.ceil(circuit.num_qubits / 4 * 3), + "max_subcircuit_cuts": 10, + "subcircuit_size_imbalance": 2, + "max_cuts": 10, + "num_subcircuits": [2, 3, 4, 5, 6, 7, 8], + }, + verbose=True, + ) + + print("-- Cut --") + cutqc.cut() + if not cutqc.has_solution: + raise Exception("The input circuit and constraints have no viable cuts") + print("-- Done Cutting -- \n") + + print("-- Evaluate --") + cutqc.evaluate(eval_mode="sv", num_shots_fn=None) + print("-- Done Evaluating -- \n") + + print("-- Build --") + cutqc.build(mem_limit=128, recursion_depth=1) + print("-- Done Building -- \n") + + # cutqc.verify() + # print("Cut: %d recursions." % (cutqc.num_recursions)) + # print(cutqc.approximation_bins) + cutqc.clean_data() + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run CutQC with given parameters") + parser.add_argument('--circuit_size', type=int, required=True, help='Size of the circuit') + parser.add_argument('--max_subcircuit_width', type=int, required=True, help='Max width of subcircuit') + parser.add_argument('--circuit_type', type=str, required=True, help='Circuit Type') + args = parser.parse_args() + + main(args.circuit_size, args.max_subcircuit_width, args.circuit_type) \ No newline at end of file diff --git a/distributed_example/Getting_Started.ipynb b/distributed_example/Getting_Started.ipynb new file mode 100644 index 0000000..a399b29 --- /dev/null +++ b/distributed_example/Getting_Started.ipynb @@ -0,0 +1,174 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Index\n", + "\n", + "1. [Quick Intro](#1.-Quick-Intro)\n", + "2. [Setup](#2.-Setup)\n", + "3. [Cutting and Evaluating](#3.-Cut-and-Evaluation)\n", + "4. 
[Initializing CutQC object for Reconstruction](#4.-Initializing-CutQC-object-for-Reconstruction)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. Quick Intro\n", + "\n", + "CutQC is a 'full stack' circuit circuit cutting framework. CutQC has three components: 1. Cut 2. Subcircuit Evaluation 3. Reconstruction. The last of which has a distrubuted implmenteation that can run on multi-node compute clusters. This document will discuss how to setup and run the distrubuted reconstruction component. All slurm and pyton excerpts shown in this notebook can be found in the examples directory." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "# 2. Setup\n", + "\n", + "This section shall briefly cover the environment variables and message passing backend required to run cutqc distrbuted reconstruction. \n", + "\n", + "At its core, a 'distributed application' is the execution of a program on a set of interconnected -- but independent -- machines that compute in parallel. Independent in this case means each machine executes a separate instance of the program with seperate execution contexts.\n", + "\n", + "As a result, different compute nodes require a way to talk to each other. In many cases 'communication' largely can be broken down into two parts: \n", + " 1. Initializatoin: Compute nodes need to 'find eachother, i.e a handshake. \n", + " 2. Post-initialization: The manner in which data is actually being sent. This depends partially on available hardware. \n", + "\n", + "What all of this really means is when using CutQC, you must supply/set some extra information to faccilitate these two components. 
\n", + "\n", + "Note, Appart from that everything else is transparent.\n", + "\n", + "During the execution of distrubted cutqc, this commucation is handled by the [commucation backend](https://pytorch.org/docs/stable/distributed.html), " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "### Initialization\n", + "\n", + "When running distributed reconstruction, you must start a separate instance of `cutqc` on each node. And as stated above, Initially, these instances will be unaware of each other's existence. However, `cutqc` relies on PyTorch to exchange connection information (handshake). Each node must have access to the following information for the handshake:\n", + "\n", + "- `MASTER_PORT`: Port on the host machine\n", + "- `MASTER_ADDR`: IP address of the host machine\n", + "- `WORLD_SIZE`: Total number of nodes involved\n", + "- `RANK`: Unique process identifier \n", + "\n", + "At the moment, this is done by setting this done by setting these values as environment variables on all machines involved and also passing it to cutqc when. \n", + "\n", + "More information about environment intialization can be [found here.](https://pytorch.org/docs/stable/distributed.html#environment-variable-initialization)\n", + "\n", + "\n", + "Thankfully a compute cluster scheduler like [slurm](https://slurm.schedmd.com/documentation.html) will automatically set the RANK WORLD_SIZE environement variables. Slurm won't automatically set the master_port and mastter_addr envrionment variables, however it does make it easy to do so. 
\n", + "\n", + "Bellow is an excerpt from the slurm script used in example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "#SBATCH --output=_output/%x.%j.out\n", + "#SBATCH --nodes=1 # node count\n", + "#SBATCH --ntasks-per-node=1 # total number of tasks across all nodes\n", + "#SBATCH --cpus-per-task=13 # cpu-cores per task (>1 if multi-threaded tasks)\n", + "\n", + "export MASTER_PORT=$(get_free_port) \n", + "export MASTER_ADDR=$master_addr=$(scontrol show hostnames \"$SLURM_JOB_NODELIST\" | head -n 1) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If done correctly, each node should be able to access port, address, world size information in the driver python by reading their environment variables. \n", + "\n", + "Bellow is an excerpt from the example driver that does this: \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Title: 'dist_run.py'\n", + "import os\n", + "\n", + "# Environment variables set by slurm script\n", + "gpus_per_node = int(os.environ[\"SLURM_GPUS_ON_NODE\"])\n", + "WORLD_RANK = int(os.environ[\"SLURM_PROCID\"])\n", + "WORLD_SIZE = int(os.environ[\"WORLD_SIZE\"])\n", + "MASTER_RANK = 0" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The world_rank and World_size values are passed as arguments to the CutQC constructor. This will be shown bellow in the last section." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If done correctly, each node should be able to access port, address and world size information in the driver python by reading their environment variables. 
\n", + "\n", + "Bellow is an excerpt from the example driver that does this: \n", + "\n", + "\n", + "Then, in the driver python file, each node\n", + "\n", + "Currently, CutQC requires \n", + "\n", + "The distributed reconstructor requires that the subcircuits be evaluated prior to its execution." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Message Passing Backend\n", + "\n", + "The steps shown above is strictly for initial coordination of nodes; A message passing backend is used to facitlate the " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 3. Initializing CutQC object for Reconstruction" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 4. Cut and Evaluation" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/distributed_example/README.md b/distributed_example/README.md new file mode 100644 index 0000000..430e02a --- /dev/null +++ b/distributed_example/README.md @@ -0,0 +1,50 @@ +## How to Use CutQC to Efficiently Perform Subcircuit Reconstruction + +This directory contains a simple example of subcircuit reconstruction on an adder circuit, along with a notebook that provides more details on how the environment should be set up. + + +## Cutting and Evaluation + +Before reconstruction can be performed, any given circuit must first be cut and evaluated, and the results saved to a `.pkl` file. This can be done using `cutqc.save_cutqc_obj(filename)` as seen in the `cut_and_eval.py` file. + +Run `cut_and_eval.py` using the corresponding Slurm script. 
In other words: + + sbatch cut_and_eval.slurm + +Running the slurm script should produce an output file containing: + + --- Cut --- + Set parameter GURO_PAR_SPECIAL + Set parameter TokenServer to value "license.rc.princeton.edu" + --- Evaluate --- + --- Dumping CutQC Object into adder_example.pkl --- + Completed + +## Reconstruction + +Once the circuit is cut and the results are computed, you can run parallel reconstruction by calling the slurm script: + + sbatch dist_driver.slurm + +Running the slurm script should produce an output file containing: + + MASTER_ADDR=adroit-h11g3 + MASTER_PORT=31179 + WORLD_SIZE=2 + --- Running adder_example.pkl --- + self.parallel_reconstruction: True + Worker 1, compute_device: cuda:1 + --- Running adder_example.pkl --- + self.parallel_reconstruction: True + Worker 0, compute_device: cuda:0 + subcircuit_entry_length: [32, 32] + LEN(DATASET): 16 + NUMBER BATCHES: 1 + Compute Time: 1.5637431228533387 + Approximate Error: 1.2621774483536279e-29 + verify took 0.011 + --- Reconstruction Complete --- + Total Reconstruction Time: 1.5637431228533387 + Approxamate Error: 1.2621774483536279e-29 + DESTROYING NOW! 
1.5637431228533387 + WORKER 1 DYING \ No newline at end of file diff --git a/distributed_example/cut_and_eval.py b/distributed_example/cut_and_eval.py new file mode 100644 index 0000000..3e3f808 --- /dev/null +++ b/distributed_example/cut_and_eval.py @@ -0,0 +1,56 @@ +# Title: cut_and_eval.py +# Description: Example of how to cut and evaluate for the purposes of +# distributed reconstruction + +import os, logging + +logging.disable(logging.WARNING) +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1" +os.environ["CUDA_VISIBLE_DEVICES"] = "-1" + +from cutqc.main import CutQC +from helper_functions.benchmarks import generate_circ + + +if __name__ == "__main__": + filename = "adder_example.pkl" + circ_type= 'adder' + circ_size=10 + max_width=10 + + # Generate Example Circuit and Initialize CutQC + circuit = generate_circ( + num_qubits=circ_size, + depth=1, + circuit_type=circ_type, + reg_name="q", + connected_only=True, + seed=None, + ) + + cutqc = CutQC( + name="%s_%d" % (circ_type, circ_size), + circuit=circuit, + cutter_constraints={ + "max_subcircuit_width": max_width, + "max_subcircuit_cuts": 10, + "subcircuit_size_imbalance": 2, + "max_cuts": 10, + "num_subcircuits": [2, 3, 4, 5, 6, 8], + }, + ) + + print ("--- Cut --- ") + cutqc.cut() + + if not cutqc.has_solution: + raise Exception("The input circuit and constraints have no viable cuts") + + print ("--- Evaluate ---") + cutqc.evaluate(eval_mode="sv", num_shots_fn=None) + + print ("--- Dumping CutQC Object into {} ---".format (filename)) + cutqc.save_cutqc_obj (filename) + + print ("Completed") + diff --git a/distributed_example/cut_and_eval.slurm b/distributed_example/cut_and_eval.slurm new file mode 100644 index 0000000..ae16bb8 --- /dev/null +++ b/distributed_example/cut_and_eval.slurm @@ -0,0 +1,28 @@ +#!/bin/bash +#SBATCH --output='example_cut_and_eval.out' +#SBATCH --nodes=1 # node count +#SBATCH --ntasks-per-node=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=12 # cpu-cores per task (>1 if 
multi-threaded tasks) +#SBATCH --mem=40G # memory per cpu-core (4G is default) +#SBATCH --time=00:01:55 # total run time limit (HH:MM:SS) + +# Setup for Multi-node Workload +export MASTER_PORT=$(get_free_port) # Get a free Port +export WORLD_SIZE=$(($SLURM_NNODES * $SLURM_NTASKS_PER_NODE)) +master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) +export MASTER_ADDR=$master_addr + +# Sanity Print +echo "MASTER_ADDR="$MASTER_ADDR +echo "MASTER_PORT="$MASTER_PORT +echo "WORLD_SIZE="$WORLD_SIZE + +# Load Modules +module purge +module load anaconda3/2024.2 +conda activate cutqc +module load gurobi/10.0.1 +export PYTHONPATH=/usr/licensed/gurobi/10.0.1/linux64/lib/python3.8_utf32 + +python3 cut_and_eval.py + diff --git a/distributed_example/dist_driver.py b/distributed_example/dist_driver.py new file mode 100644 index 0000000..839ed94 --- /dev/null +++ b/distributed_example/dist_driver.py @@ -0,0 +1,38 @@ +# Title: dist_driver.py +# Description: Example of how CutQC can be used to effececiently reconstruct +# subcircuits + +import os +from cutqc.main import CutQC + +# Environment variables set by slurm script +GPUS_PER_NODE = int(os.environ["SLURM_GPUS_ON_NODE"]) +WORLD_RANK = int(os.environ["SLURM_PROCID"]) +WORLD_SIZE = int(os.environ["WORLD_SIZE"]) + +if __name__ == "__main__": + full_path = 'adder_example.pkl' + compute_backend = 'gpu' + comm_backend = 'nccl' + + # Load CutQC Instance from Pickle + print(f'--- Running {full_path} ---') + cutqc = CutQC ( + parallel_reconstruction = True, + reconstruct_only = True, + load_data = full_path, + compute_backend = compute_backend, + comm_backend = comm_backend, + gpus_per_node = GPUS_PER_NODE, + world_rank = WORLD_RANK, + world_size = WORLD_SIZE + ) + + # Initiate Reconstruct + compute_time = cutqc.build(mem_limit=32, recursion_depth=1) + approximation_error = cutqc.verify() + + print('--- Reconstruction Complete ---') + print ("Total Reconstruction Time:\t{}".format(compute_time)) + print ("Approxamate 
Error:\t {}".format (approximation_error)) + cutqc.destroy_distributed() \ No newline at end of file diff --git a/distributed_example/dist_driver.slurm b/distributed_example/dist_driver.slurm new file mode 100644 index 0000000..c906fdc --- /dev/null +++ b/distributed_example/dist_driver.slurm @@ -0,0 +1,31 @@ +#!/bin/bash +#SBATCH --output=example_dist_driver.out +#SBATCH --nodes=1 # node count +#SBATCH --ntasks-per-node=2 # total number of tasks across all nodes +#SBATCH --cpus-per-task=12 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --mem=40G # memory per cpu-core (4G is default) +#SBATCH --gres=gpu:2 +#SBATCH --time=00:01:55 # total run time limit (HH:MM:SS) +#SBATCH --mail-type=begin # send email when job begins +#SBATCH --mail-type=end # send email when job ends +#SBATCH --mail-type=fail # send mail if job fails + +# Setup for Multi-node Workload +export MASTER_PORT=$(get_free_port) # Get a free Port +export WORLD_SIZE=$(($SLURM_NNODES * $SLURM_NTASKS_PER_NODE)) +master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) +export MASTER_ADDR=$master_addr + +# Sanity Print +echo "MASTER_ADDR="$MASTER_ADDR +echo "MASTER_PORT="$MASTER_PORT +echo "WORLD_SIZE="$WORLD_SIZE + +# Load Modules +module purge +module load anaconda3/2024.2 +conda activate cutqc +module load gurobi/10.0.1 +export PYTHONPATH=/usr/licensed/gurobi/10.0.1/linux64/lib/python3.8_utf32 + +srun python dist_driver.py diff --git a/example.py b/example.py index dc1fda4..2ff1b5b 100644 --- a/example.py +++ b/example.py @@ -1,18 +1,16 @@ -import os, math -import os, logging - -logging.disable(logging.WARNING) -os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1" -# Comment this line if using GPU -os.environ["CUDA_VISIBLE_DEVICES"] = "-1" +import os, math, logging # from cutqc_runtime.main import CutQC # Use this just to benchmark the runtime -from cutqc.main import CutQC # Use this for exact computation -# from cutqc_runtime.main import CutQC # Use this for exact computation +from 
cutqc.main import CutQC # Use this for exact computation from helper_functions.benchmarks import generate_circ +logging.disable(logging.WARNING) +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1" +# Comment this line if using GPU +os.environ["CUDA_VISIBLE_DEVICES"] = "-1" + if __name__ == "__main__": circuit_type = "supremacy" circuit_size = 16 @@ -28,31 +26,21 @@ name="%s_%d" % (circuit_type, circuit_size), circuit=circuit, cutter_constraints={ - "max_subcircuit_width": 10, - # "max_subcircuit_width": math.ceil(circuit.num_qubits / 4 * 3), + "max_subcircuit_width": math.ceil(circuit.num_qubits / 4 * 3), "max_subcircuit_cuts": 10, "subcircuit_size_imbalance": 2, "max_cuts": 10, "num_subcircuits": [2, 3], }, - verbose=False, + verbose=True, ) - - print ("-- Cut -- ") cutqc.cut() if not cutqc.has_solution: raise Exception("The input circuit and constraints have no viable cuts") - print ("-- Done Cutting -- \n") - - print ("-- Evaluate --") - cutqc.evaluate(eval_mode="sv", num_shots_fn=None) - print ("-- Done Evaluating -- \n") - print ("-- Build --") + # add comment + cutqc.evaluate(eval_mode="sv", num_shots_fn=None) cutqc.build(mem_limit=32, recursion_depth=1) - print ("-- Done Building -- \n") - - cutqc.verify () print("Cut: %d recursions." 
% (cutqc.num_recursions)) - # print(cutqc.approximation_bins) + print(cutqc.approximation_bins) cutqc.clean_data() diff --git a/helper_functions/non_ibmq_functions.py b/helper_functions/non_ibmq_functions.py index 9155e1e..a26bc0a 100644 --- a/helper_functions/non_ibmq_functions.py +++ b/helper_functions/non_ibmq_functions.py @@ -23,7 +23,7 @@ def read_dict(filename): while 1: try: file_content.update(pickle.load(f)) - except (EOFError): + except EOFError: break f.close() else: diff --git a/helper_functions/schedule.py b/helper_functions/schedule.py index bcfdbeb..3994570 100644 --- a/helper_functions/schedule.py +++ b/helper_functions/schedule.py @@ -284,16 +284,18 @@ def run_simulation_jobs(self, device_name): self.circ_dict[key]["mapped_circuit"] = mapped_circuit backend = Aer.get_backend("qasm_simulator") - new_circuit = transpile (value["mapped_circuit"], backend) - - simulation_result = backend.run (new_circuit, shots=value["shots"], noise_model=noise_model).result() + new_circuit = transpile(value["mapped_circuit"], backend) + + simulation_result = backend.run( + new_circuit, shots=value["shots"], noise_model=noise_model + ).result() # simulation_result = execute( # value["mapped_circuit"], - + # noise_model=noise_model, # shots=value["shots"], # ).result() - + counts = simulation_result.get_counts(0) counts = dict_to_array(distribution_dict=counts, force_prob=True) self.circ_dict[key]["%s|sim" % device_name] = counts diff --git a/qcg/QAOA/hw_efficient_ansatz.py b/qcg/QAOA/hw_efficient_ansatz.py index 2136f09..9a553d9 100644 --- a/qcg/QAOA/hw_efficient_ansatz.py +++ b/qcg/QAOA/hw_efficient_ansatz.py @@ -146,12 +146,12 @@ def gen_circuit(self): # print(len(theta)) p_idx = 0 for i in range(self.nq): - self.circ.u3(theta[i + p_idx], 0, 0, self.qr[i]) + self.circ.u(theta[i + p_idx], 0, 0, self.qr[i]) p_idx += self.nq # layer 2 for i in range(self.nq): - self.circ.u3(0, 0, theta[i + p_idx], self.qr[i]) + self.circ.u(0, 0, theta[i + p_idx], self.qr[i]) p_idx += 
self.nq if self.barriers: @@ -171,12 +171,12 @@ def gen_circuit(self): # PARAMETERIZER # layer 1 for i in range(self.nq): - self.circ.u3(theta[i + p_idx], 0, 0, self.qr[i]) + self.circ.u(theta[i + p_idx], 0, 0, self.qr[i]) p_idx += self.nq # layer 2 for i in range(self.nq): - self.circ.u3(0, 0, theta[i + p_idx], self.qr[i]) + self.circ.u(0, 0, theta[i + p_idx], self.qr[i]) p_idx += self.nq # place measurements on the end of the circuit diff --git a/qcg/VQE/uccsd_ansatz.py b/qcg/VQE/uccsd_ansatz.py index 37979c8..c0e7d45 100644 --- a/qcg/VQE/uccsd_ansatz.py +++ b/qcg/VQE/uccsd_ansatz.py @@ -248,9 +248,7 @@ def gen_circuit(self): # every call to the single or double operator will take param[p_i] as its # parameter and then increment the value of p_i - num_dbl = ( - self.nq**4 - 6 * self.nq**3 + 11 * self.nq**2 - 6 * self.nq - ) / 24 + num_dbl = (self.nq**4 - 6 * self.nq**3 + 11 * self.nq**2 - 6 * self.nq) / 24 num_sgl = (self.nq**2 - self.nq) / 2 numparam = int(num_dbl + num_sgl) diff --git a/qcg/utils/testhwea.py b/qcg/utils/testhwea.py index e2873bb..82c323f 100644 --- a/qcg/utils/testhwea.py +++ b/qcg/utils/testhwea.py @@ -14,6 +14,7 @@ sv = result.get_statevector(circ) print(sv) + # entanglement measure def sgn_star(n, i): if n == 2: diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..f3776b5 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,14 @@ +numpy +qiskit +matplotlib +pydot +scipy +tqdm +pylatexenc +scikit-learn +tensorflow +networkx +gurobipy +psutil +black +qiskit-aer \ No newline at end of file From 93a4f6618ed2da9b5583b4cbf4a2764020e4c1b4 Mon Sep 17 00:00:00 2001 From: Chuck Garcia Date: Fri, 8 Nov 2024 13:14:24 -0600 Subject: [PATCH 04/11] Update main --- cutqc/distributed_graph_contraction.py | 4 +- cutqc/dynamic_definition.py | 9 +++-- cutqc/main.py | 52 +++++++++++++------------- example.py | 1 + requirements.txt | 25 ++++++------- 5 files changed, 44 insertions(+), 47 deletions(-) diff --git 
a/cutqc/distributed_graph_contraction.py b/cutqc/distributed_graph_contraction.py index eacb701..d49bbe5 100644 --- a/cutqc/distributed_graph_contraction.py +++ b/cutqc/distributed_graph_contraction.py @@ -14,10 +14,8 @@ from cutqc.abstract_graph_contractor import AbstractGraphContractor from cutqc.post_process_helper import ComputeGraph - __host_machine__ = 0 - class DistributedGraphContractor(AbstractGraphContractor): """ Distributed Graph Contractor Implementation @@ -91,7 +89,7 @@ def _send_distributed(self, dataset: List[torch.Tensor], num_batches: int) -> to if len(dataset) < num_batches: raise ValueError("Error 2000: Invalid number of requested batches -- Too many nodes allocated, for dataset length {} and {} number of batches".format (len(dataset), num_batches)) - batches = torch.stack(dataset).tensor_split(num_batches) + batches = torch.stack(dataset).tensor_split(num_batches) tensor_sizes = torch.tensor(self.subcircuit_entry_lengths, dtype=torch.int64) tensor_sizes_shape = torch.tensor(tensor_sizes.shape, dtype=torch.int64) diff --git a/cutqc/dynamic_definition.py b/cutqc/dynamic_definition.py index 2f19cde..b1f3d52 100644 --- a/cutqc/dynamic_definition.py +++ b/cutqc/dynamic_definition.py @@ -12,12 +12,13 @@ from cutqc.distributed_graph_contraction import DistributedGraphContractor from cutqc.helper_fun import add_times from cutqc.post_process_helper import get_reconstruction_qubit_order +from cutqc.graph_contraction import GraphContractor import torch.distributed as dist class DynamicDefinition(object): def __init__( - self, compute_graph, data_folder, num_cuts, mem_limit, recursion_depth, parallel_reconstruction=False, local_rank=None, compute_backend='gpu' + self, compute_graph, data_folder, num_cuts, mem_limit, recursion_depth, pytorch_distributed=False, local_rank=None, compute_backend='gpu' ) -> None: super().__init__() self.compute_graph = compute_graph @@ -27,8 +28,8 @@ def __init__( self.recursion_depth = recursion_depth self.dd_bins = {} 
self.local_rank = local_rank - self.graph_contractor = DistributedGraphContractor (local_rank=self.local_rank, compute_backend=compute_backend) if (parallel_reconstruction) else GraphContractor() - self.parallel_reconstruction = parallel_reconstruction + self.graph_contractor = DistributedGraphContractor (local_rank=self.local_rank, compute_backend=compute_backend) if (pytorch_distributed) else GraphContractor() + self.pytorch_distributed = pytorch_distributed self.overhead = {"additions": 0, "multiplications": 0} self.times = {"get_dd_schedule": 0, "merge_states_into_bins": 0, "sort": 0} @@ -117,7 +118,7 @@ def build(self): # Terminate the parallized process print("Compute Time: {}".format (self.graph_contractor.times["compute"])) - # if (self.parallel_reconstruction): + # if (self.pytorch_distributed): # self.graph_contractor.terminate_distributed_process() diff --git a/cutqc/main.py b/cutqc/main.py index 1d657ad..ee98c67 100644 --- a/cutqc/main.py +++ b/cutqc/main.py @@ -21,13 +21,13 @@ class CutQC: The main module for CutQC cut --> evaluate results --> verify (optional) """ - + def __init__(self, name=None, circuit=None, cutter_constraints=None, verbose=False, - parallel_reconstruction=False, + pytorch_distributed=False, reconstruct_only=False, load_data=None, compute_backend='gpu', @@ -48,43 +48,48 @@ def __init__(self, --- Distributed Reconstruction Related Arguments --- - parallel_reconstruction (Optional): When set to 'True', reconstruction + pytorch_distributed (Optional): When set to 'True', reconstruction is executed distributed. Default FALSE reconstruct_only (Optional): When enabled, cutqc performs only reconstructions. - Distrubuted reconstruction requires that this be 'TRUE'. + Executing with Pytorch requires that this be 'TRUE'. Default FALSE - load_data (Optional): String of file name to load subcircuit outputs - from a previous CutQC instance. Default NONE. 
+ load_data (Optional): String of file name to load subcircuits outputs + from a previous CutQC instance. Default None. compute_backend (Optional): Compute processing device used if - parallel_reconstruction is set to 'TRUE'. + pytorch_distributed is set to 'TRUE'. 'cpu' for cpu and 'gpu' for gpu. Default GPU comm_backend (Optional): message passing backend internally used by pytorch for sending data between nodes. Default NCCL. gpus_per_node (Optional): Number of GPUs per node in the case they are used as the compute backend. - world_rank (Optional): Global Identifier. Default NONE. + world_rank (Optional): Global Identifier. Default None. world_size (Optional): Total number of nodes """ + assert not (pytorch_distributed is False and reconstruct_only is True), "Using pytorch is not available for cutting, since worker nodes are being concurrently initialized." + self.name = name self.circuit = circuit self.cutter_constraints = cutter_constraints self.verbose = verbose self.times = {} - + self.compute_graph = None self.tmp_data_folder = None self.num_cuts = None self.complete_path_map = None self.subcircuits = None + self.local_rank = None + self.compute_backend = compute_backend + self.pytorch_distributed = pytorch_distributed if reconstruct_only: # Multi node - Pytorch Version - if parallel_reconstruction: + if pytorch_distributed: self.compute_backend = compute_backend self._setup_for_dist_reconstruction (load_data, comm_backend, world_rank, world_size, gpus_per_node) @@ -93,10 +98,10 @@ def __init__(self, self._load_data(load_data) elif not reconstruct_only: - # Cutting, evaluation and reconstruction are occuring all at once. + # Cutting, evaluation and reconstruction are occurring all at once. 
self._initialize_for_serial_reconstruction(circuit) - def _setup_for_dist_reconstruction (self, load_data, comm_backend: str, world_rank: int, world_size: int, gpus_per_node: int): + def _setup_for_dist_reconstruction (self, load_data, comm_backend: str, world_rank: int, world_size: int, gpus_per_node: int, timeout=1): """ Sets up to call the distributed kernel. Worker nodes @@ -110,12 +115,12 @@ def _setup_for_dist_reconstruction (self, load_data, comm_backend: str, world_ra """ self.local_rank = world_rank - gpus_per_node * (world_rank // gpus_per_node) - self.parallel_reconstruction = True - timelimit = timedelta(hours=1) # Bounded wait time to prevent deadlock + self.pytorch_distributed = True + timelimit = timedelta(hours=timeout) # Bounded wait time to prevent deadlock - dist.init_process_group(comm_backend, rank=world_rank, world_size=world_size, timeout=timelimit) + dist.init_process_group(comm_backend, rank=world_rank, world_size=world_size, timeout=timelimit) # - # Only host should load subcircuit data + # Only host should load subcircuits data if dist.get_rank() == __host_machine__: # Todo: I think ideally the workers should on start load their own data self._load_data(load_data) @@ -172,9 +177,10 @@ def cut(self): ) for field in cut_solution: self.__setattr__(field, cut_solution[field]) + if "complete_path_map" in cut_solution: self.has_solution = True - self._generate_metadata() + self._generate_metadata () else: self.has_solution = False self.times["cutter"] = perf_counter() - cutter_begin @@ -203,22 +209,16 @@ def build(self, mem_limit, recursion_depth): """ if self.verbose: print("--> Build %s" % (self.name)) - - # Keep these times and discard the rest - # self.times = { - # "cutter": self.times["cutter"], - # "evaluate": self.times["evaluate"], - # } - - print ("self.parallel_reconstruction: {}".format (self.parallel_reconstruction)) + # print ("self.pytorch_distributed': {}".format (self.pytorch_distributed)) + self.dd = DynamicDefinition( 
compute_graph=self.compute_graph, data_folder=self.tmp_data_folder, num_cuts=self.num_cuts, mem_limit=mem_limit, recursion_depth=recursion_depth, - parallel_reconstruction=self.parallel_reconstruction, + pytorch_distributed=self.pytorch_distributed, local_rank=self.local_rank, compute_backend=self.compute_backend ) diff --git a/example.py b/example.py index 2ff1b5b..3f398f9 100644 --- a/example.py +++ b/example.py @@ -8,6 +8,7 @@ logging.disable(logging.WARNING) os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1" + # Comment this line if using GPU os.environ["CUDA_VISIBLE_DEVICES"] = "-1" diff --git a/requirements.txt b/requirements.txt index f3776b5..a58fb90 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,14 +1,11 @@ -numpy -qiskit -matplotlib -pydot -scipy -tqdm -pylatexenc -scikit-learn -tensorflow -networkx -gurobipy -psutil -black -qiskit-aer \ No newline at end of file +gurobipy==11.0.2 +networkx==3.1 +numpy==1.24.3 +psutil==6.0.0 +pytz==2024.2 +qiskit==1.1.1 +qiskit_aer==0.14.2 +scikit_learn==1.3.2 +tensorflow==2.13.1 +tensorflow_cpu==2.13.1 +torch==2.3.1 From 28b8e1bc5533a63e8757a17cee99f6ee407298ef Mon Sep 17 00:00:00 2001 From: Chuck Garcia Date: Fri, 8 Nov 2024 13:32:19 -0600 Subject: [PATCH 05/11] Add requirements --- distributed_example/dist_driver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/distributed_example/dist_driver.py b/distributed_example/dist_driver.py index 839ed94..357ffb7 100644 --- a/distributed_example/dist_driver.py +++ b/distributed_example/dist_driver.py @@ -34,5 +34,5 @@ print('--- Reconstruction Complete ---') print ("Total Reconstruction Time:\t{}".format(compute_time)) - print ("Approxamate Error:\t {}".format (approximation_error)) + print ("Approximate Error:\t {}".format (approximation_error)) cutqc.destroy_distributed() \ No newline at end of file From 007eb8fe67575a9442aba72791b709609bd7a79e Mon Sep 17 00:00:00 2001 From: chuckgarcian Date: Fri, 8 Nov 2024 18:51:43 -0500 Subject: [PATCH 06/11] Update 
file structure and fix bug in cutqc.main --- .../cut_and_eval.py => cut_and_eval.py | 0 .../cut_and_eval.slurm => cut_and_eval.slurm | 4 +- cutqc/main.py | 2 +- .../dist_driver.py => dist_driver.py | 6 +- .../dist_driver.slurm => dist_driver.slurm | 4 +- distributed_example/Getting_Started.ipynb | 174 ------------------ .../README.md => getting_started.md | 26 +++ README.md => readme.md | 0 requirements.txt | 25 +-- 9 files changed, 48 insertions(+), 193 deletions(-) rename distributed_example/cut_and_eval.py => cut_and_eval.py (100%) rename distributed_example/cut_and_eval.slurm => cut_and_eval.slurm (91%) rename distributed_example/dist_driver.py => dist_driver.py (91%) rename distributed_example/dist_driver.slurm => dist_driver.slurm (96%) delete mode 100644 distributed_example/Getting_Started.ipynb rename distributed_example/README.md => getting_started.md (63%) rename README.md => readme.md (100%) diff --git a/distributed_example/cut_and_eval.py b/cut_and_eval.py similarity index 100% rename from distributed_example/cut_and_eval.py rename to cut_and_eval.py diff --git a/distributed_example/cut_and_eval.slurm b/cut_and_eval.slurm similarity index 91% rename from distributed_example/cut_and_eval.slurm rename to cut_and_eval.slurm index ae16bb8..487fcde 100644 --- a/distributed_example/cut_and_eval.slurm +++ b/cut_and_eval.slurm @@ -4,7 +4,7 @@ #SBATCH --ntasks-per-node=1 # total number of tasks across all nodes #SBATCH --cpus-per-task=12 # cpu-cores per task (>1 if multi-threaded tasks) #SBATCH --mem=40G # memory per cpu-core (4G is default) -#SBATCH --time=00:01:55 # total run time limit (HH:MM:SS) +#SBATCH --time=00:00:55 # total run time limit (HH:MM:SS) # Setup for Multi-node Workload export MASTER_PORT=$(get_free_port) # Get a free Port @@ -20,7 +20,7 @@ echo "WORLD_SIZE="$WORLD_SIZE # Load Modules module purge module load anaconda3/2024.2 -conda activate cutqc +conda activate cutqc3 module load gurobi/10.0.1 export 
PYTHONPATH=/usr/licensed/gurobi/10.0.1/linux64/lib/python3.8_utf32 diff --git a/cutqc/main.py b/cutqc/main.py index ee98c67..59c5a16 100644 --- a/cutqc/main.py +++ b/cutqc/main.py @@ -71,7 +71,7 @@ def __init__(self, """ assert not (pytorch_distributed is False and reconstruct_only is True), "Using pytorch is not available for cutting, since worker nodes are being concurrently initialized." - + self.name = name self.circuit = circuit self.cutter_constraints = cutter_constraints diff --git a/distributed_example/dist_driver.py b/dist_driver.py similarity index 91% rename from distributed_example/dist_driver.py rename to dist_driver.py index 357ffb7..2499795 100644 --- a/distributed_example/dist_driver.py +++ b/dist_driver.py @@ -12,19 +12,19 @@ if __name__ == "__main__": full_path = 'adder_example.pkl' - compute_backend = 'gpu' + compute_backend = 'GPU' comm_backend = 'nccl' # Load CutQC Instance from Pickle print(f'--- Running {full_path} ---') cutqc = CutQC ( - parallel_reconstruction = True, + pytorch_distributed = True, reconstruct_only = True, load_data = full_path, compute_backend = compute_backend, comm_backend = comm_backend, gpus_per_node = GPUS_PER_NODE, - world_rank = WORLD_RANK, + world_rank = WORLD_RANK, world_size = WORLD_SIZE ) diff --git a/distributed_example/dist_driver.slurm b/dist_driver.slurm similarity index 96% rename from distributed_example/dist_driver.slurm rename to dist_driver.slurm index c906fdc..0aeb1e0 100644 --- a/distributed_example/dist_driver.slurm +++ b/dist_driver.slurm @@ -4,7 +4,7 @@ #SBATCH --ntasks-per-node=2 # total number of tasks across all nodes #SBATCH --cpus-per-task=12 # cpu-cores per task (>1 if multi-threaded tasks) #SBATCH --mem=40G # memory per cpu-core (4G is default) -#SBATCH --gres=gpu:2 +#SBATCH --gres=gpu:4 #SBATCH --time=00:01:55 # total run time limit (HH:MM:SS) #SBATCH --mail-type=begin # send email when job begins #SBATCH --mail-type=end # send email when job ends @@ -24,7 +24,7 @@ echo 
"WORLD_SIZE="$WORLD_SIZE # Load Modules module purge module load anaconda3/2024.2 -conda activate cutqc +conda activate cutqc4 module load gurobi/10.0.1 export PYTHONPATH=/usr/licensed/gurobi/10.0.1/linux64/lib/python3.8_utf32 diff --git a/distributed_example/Getting_Started.ipynb b/distributed_example/Getting_Started.ipynb deleted file mode 100644 index a399b29..0000000 --- a/distributed_example/Getting_Started.ipynb +++ /dev/null @@ -1,174 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Index\n", - "\n", - "1. [Quick Intro](#1.-Quick-Intro)\n", - "2. [Setup](#2.-Setup)\n", - "3. [Cutting and Evaluating](#3.-Cut-and-Evaluation)\n", - "4. [Initializing CutQC object for Reconstruction](#4.-Initializing-CutQC-object-for-Reconstruction)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 1. Quick Intro\n", - "\n", - "CutQC is a 'full stack' circuit circuit cutting framework. CutQC has three components: 1. Cut 2. Subcircuit Evaluation 3. Reconstruction. The last of which has a distrubuted implmenteation that can run on multi-node compute clusters. This document will discuss how to setup and run the distrubuted reconstruction component. All slurm and pyton excerpts shown in this notebook can be found in the examples directory." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, - "source": [ - "# 2. Setup\n", - "\n", - "This section shall briefly cover the environment variables and message passing backend required to run cutqc distrbuted reconstruction. \n", - "\n", - "At its core, a 'distributed application' is the execution of a program on a set of interconnected -- but independent -- machines that compute in parallel. Independent in this case means each machine executes a separate instance of the program with seperate execution contexts.\n", - "\n", - "As a result, different compute nodes require a way to talk to each other. 
In many cases 'communication' largely can be broken down into two parts: \n", - " 1. Initializatoin: Compute nodes need to 'find eachother, i.e a handshake. \n", - " 2. Post-initialization: The manner in which data is actually being sent. This depends partially on available hardware. \n", - "\n", - "What all of this really means is when using CutQC, you must supply/set some extra information to faccilitate these two components. \n", - "\n", - "Note, Appart from that everything else is transparent.\n", - "\n", - "During the execution of distrubted cutqc, this commucation is handled by the [commucation backend](https://pytorch.org/docs/stable/distributed.html), " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, - "source": [ - "### Initialization\n", - "\n", - "When running distributed reconstruction, you must start a separate instance of `cutqc` on each node. And as stated above, Initially, these instances will be unaware of each other's existence. However, `cutqc` relies on PyTorch to exchange connection information (handshake). Each node must have access to the following information for the handshake:\n", - "\n", - "- `MASTER_PORT`: Port on the host machine\n", - "- `MASTER_ADDR`: IP address of the host machine\n", - "- `WORLD_SIZE`: Total number of nodes involved\n", - "- `RANK`: Unique process identifier \n", - "\n", - "At the moment, this is done by setting this done by setting these values as environment variables on all machines involved and also passing it to cutqc when. \n", - "\n", - "More information about environment intialization can be [found here.](https://pytorch.org/docs/stable/distributed.html#environment-variable-initialization)\n", - "\n", - "\n", - "Thankfully a compute cluster scheduler like [slurm](https://slurm.schedmd.com/documentation.html) will automatically set the RANK WORLD_SIZE environement variables. 
Slurm won't automatically set the master_port and mastter_addr envrionment variables, however it does make it easy to do so. \n", - "\n", - "Bellow is an excerpt from the slurm script used in example:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "vscode": { - "languageId": "shellscript" - } - }, - "outputs": [], - "source": [ - "#SBATCH --output=_output/%x.%j.out\n", - "#SBATCH --nodes=1 # node count\n", - "#SBATCH --ntasks-per-node=1 # total number of tasks across all nodes\n", - "#SBATCH --cpus-per-task=13 # cpu-cores per task (>1 if multi-threaded tasks)\n", - "\n", - "export MASTER_PORT=$(get_free_port) \n", - "export MASTER_ADDR=$master_addr=$(scontrol show hostnames \"$SLURM_JOB_NODELIST\" | head -n 1) " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If done correctly, each node should be able to access port, address, world size information in the driver python by reading their environment variables. \n", - "\n", - "Bellow is an excerpt from the example driver that does this: \n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Title: 'dist_run.py'\n", - "import os\n", - "\n", - "# Environment variables set by slurm script\n", - "gpus_per_node = int(os.environ[\"SLURM_GPUS_ON_NODE\"])\n", - "WORLD_RANK = int(os.environ[\"SLURM_PROCID\"])\n", - "WORLD_SIZE = int(os.environ[\"WORLD_SIZE\"])\n", - "MASTER_RANK = 0" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The world_rank and World_size values are passed as arguments to the CutQC constructor. This will be shown bellow in the last section." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If done correctly, each node should be able to access port, address and world size information in the driver python by reading their environment variables. 
\n", - "\n", - "Bellow is an excerpt from the example driver that does this: \n", - "\n", - "\n", - "Then, in the driver python file, each node\n", - "\n", - "Currently, CutQC requires \n", - "\n", - "The distributed reconstructor requires that the subcircuits be evaluated prior to its execution." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Message Passing Backend\n", - "\n", - "The steps shown above is strictly for initial coordination of nodes; A message passing backend is used to facitlate the " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 3. Initializing CutQC object for Reconstruction" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 4. Cut and Evaluation" - ] - } - ], - "metadata": { - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/distributed_example/README.md b/getting_started.md similarity index 63% rename from distributed_example/README.md rename to getting_started.md index 430e02a..2e93ee5 100644 --- a/distributed_example/README.md +++ b/getting_started.md @@ -2,6 +2,32 @@ This directory contains a simple example of subcircuit reconstruction on an adder circuit, along with a notebook that provides more details on how the environment should be set up. +## Setting up the environment + +First we need to setup a conda environement with the following + + conda create --name cutqc python=3.12 + conda activate cutqc + pip install -r requirements.txt +conda config --add channels https://conda.anaconda.org/gurobi +conda install gurobi + +pip install numpy qiskit matplotlib pydot scipy tqdm pylatexenc scikit-learn tensorflow networkx torch qiskit-aer psutil +## Running with slurm + +When initializing distributed, the worker nodes must have a way to initialize communication with the host node. The following must be set in the slurm file for running the distributed reconstruction to ensure this can happen. 
+ + # Setup for Multi-node Workload + export MASTER_PORT=$(get_free_port) # Get a free Port + export WORLD_SIZE=$(($SLURM_NNODES * $SLURM_NTASKS_PER_NODE)) + master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) + export MASTER_ADDR=$master_addr + + # Sanity Print + echo "MASTER_ADDR="$MASTER_ADDR + echo "MASTER_PORT="$MASTER_PORT + echo "WORLD_SIZE="$WORLD_SIZE + ## Cutting and Evaluation diff --git a/README.md b/readme.md similarity index 100% rename from README.md rename to readme.md diff --git a/requirements.txt b/requirements.txt index a58fb90..f3776b5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,14 @@ -gurobipy==11.0.2 -networkx==3.1 -numpy==1.24.3 -psutil==6.0.0 -pytz==2024.2 -qiskit==1.1.1 -qiskit_aer==0.14.2 -scikit_learn==1.3.2 -tensorflow==2.13.1 -tensorflow_cpu==2.13.1 -torch==2.3.1 +numpy +qiskit +matplotlib +pydot +scipy +tqdm +pylatexenc +scikit-learn +tensorflow +networkx +gurobipy +psutil +black +qiskit-aer \ No newline at end of file From 3a8ff96057956206ae6eb9fd1016e69728708a1d Mon Sep 17 00:00:00 2001 From: chuckgarcian Date: Fri, 8 Nov 2024 18:53:37 -0500 Subject: [PATCH 07/11] Update file structure and fix bug in cutqc.main --- getting_started.md | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/getting_started.md b/getting_started.md index 2e93ee5..3c70480 100644 --- a/getting_started.md +++ b/getting_started.md @@ -1,18 +1,14 @@ -## How to Use CutQC to Efficiently Perform Subcircuit Reconstruction - -This directory contains a simple example of subcircuit reconstruction on an adder circuit, along with a notebook that provides more details on how the environment should be set up. 
- ## Setting up the environment First we need to setup a conda environement with the following conda create --name cutqc python=3.12 conda activate cutqc - pip install -r requirements.txt -conda config --add channels https://conda.anaconda.org/gurobi -conda install gurobi - -pip install numpy qiskit matplotlib pydot scipy tqdm pylatexenc scikit-learn tensorflow networkx torch qiskit-aer psutil + conda config --add channels https://conda.anaconda.org/gurobi + conda install gurobi + pip install numpy qiskit matplotlib pydot scipy tqdm pylatexenc scikit-learn +tensorflow networkx torch qiskit-aer psutil + ## Running with slurm When initializing distributed, the worker nodes must have a way to initialize communication with the host node. The following must be set in the slurm file for running the distributed reconstruction to ensure this can happen. From 4a13cb893a81ee1b7287ad1e0d4de27782081cc0 Mon Sep 17 00:00:00 2001 From: Chuck Garcia Date: Wed, 15 Jan 2025 13:23:17 -0500 Subject: [PATCH 08/11] Upate example scripts and add explanatory guide --- cut_and_eval.slurm | 28 --- cut_and_eval.py => cut_and_eval_example.py | 8 +- cutqc/main.py | 25 +-- dist_driver.slurm | 31 --- example.py | 47 ----- explaining_example.md | 205 ++++++++++++++++++++ getting_started.md | 72 ------- readme.md | 7 +- dist_driver.py => reconstruction_example.py | 7 +- 9 files changed, 234 insertions(+), 196 deletions(-) delete mode 100644 cut_and_eval.slurm rename cut_and_eval.py => cut_and_eval_example.py (91%) delete mode 100644 dist_driver.slurm delete mode 100644 example.py create mode 100644 explaining_example.md delete mode 100644 getting_started.md rename dist_driver.py => reconstruction_example.py (90%) diff --git a/cut_and_eval.slurm b/cut_and_eval.slurm deleted file mode 100644 index 487fcde..0000000 --- a/cut_and_eval.slurm +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -#SBATCH --output='example_cut_and_eval.out' -#SBATCH --nodes=1 # node count -#SBATCH --ntasks-per-node=1 # total 
number of tasks across all nodes -#SBATCH --cpus-per-task=12 # cpu-cores per task (>1 if multi-threaded tasks) -#SBATCH --mem=40G # memory per cpu-core (4G is default) -#SBATCH --time=00:00:55 # total run time limit (HH:MM:SS) - -# Setup for Multi-node Workload -export MASTER_PORT=$(get_free_port) # Get a free Port -export WORLD_SIZE=$(($SLURM_NNODES * $SLURM_NTASKS_PER_NODE)) -master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_ADDR=$master_addr - -# Sanity Print -echo "MASTER_ADDR="$MASTER_ADDR -echo "MASTER_PORT="$MASTER_PORT -echo "WORLD_SIZE="$WORLD_SIZE - -# Load Modules -module purge -module load anaconda3/2024.2 -conda activate cutqc3 -module load gurobi/10.0.1 -export PYTHONPATH=/usr/licensed/gurobi/10.0.1/linux64/lib/python3.8_utf32 - -python3 cut_and_eval.py - diff --git a/cut_and_eval.py b/cut_and_eval_example.py similarity index 91% rename from cut_and_eval.py rename to cut_and_eval_example.py index 3e3f808..ebdc66f 100644 --- a/cut_and_eval.py +++ b/cut_and_eval_example.py @@ -1,6 +1,8 @@ -# Title: cut_and_eval.py -# Description: Example of how to cut and evaluate for the purposes of -# distributed reconstruction +''' +Title: cut_and_eval.py +Description: Example of how to cut and evaluate for the purposes of +distributed reconstruction +''' import os, logging diff --git a/cutqc/main.py b/cutqc/main.py index 59c5a16..6f7052b 100644 --- a/cutqc/main.py +++ b/cutqc/main.py @@ -32,6 +32,7 @@ def __init__(self, load_data=None, compute_backend='gpu', comm_backend = 'nccl', + timeout=1, gpus_per_node = None, world_rank = None, world_size = None, @@ -49,7 +50,8 @@ def __init__(self, --- Distributed Reconstruction Related Arguments --- pytorch_distributed (Optional): When set to 'True', reconstruction - is executed distributed. Default FALSE + is executed distributed using pytorch. Otherwise when 'False', + framework used in Tensorflow, single node. Default FALSE. 
reconstruct_only (Optional): When enabled, cutqc performs only reconstructions. Executing with Pytorch requires that this be 'TRUE'. @@ -61,6 +63,7 @@ def __init__(self, compute_backend (Optional): Compute processing device used if pytorch_distributed is set to 'TRUE'. 'cpu' for cpu and 'gpu' for gpu. Default GPU + timeout (Optional): Integer bounded wait time to prevent deadlock between nodes. comm_backend (Optional): message passing backend internally used by pytorch for sending data between nodes. Default NCCL. @@ -70,7 +73,7 @@ def __init__(self, world_size (Optional): Total number of nodes """ - assert not (pytorch_distributed is False and reconstruct_only is True), "Using pytorch is not available for cutting, since worker nodes are being concurrently initialized." + assert not (pytorch_distributed is False and reconstruct_only is True), "Executing with pytorch requires 'reconstruct_only' be true." self.name = name self.circuit = circuit @@ -89,19 +92,19 @@ def __init__(self, if reconstruct_only: # Multi node - Pytorch Version - if pytorch_distributed: - self.compute_backend = compute_backend - self._setup_for_dist_reconstruction (load_data, comm_backend, world_rank, world_size, gpus_per_node) - - # Single node - Tensorflow Version - else: - self._load_data(load_data) + if pytorch_distributed: + self.compute_backend = compute_backend + self._setup_for_dist_reconstruction (load_data, comm_backend, world_rank, world_size, gpus_per_node, timeout) + + # Single node - Tensorflow Version + else: + self._load_data(load_data) elif not reconstruct_only: # Cutting, evaluation and reconstruction are occurring all at once. 
self._initialize_for_serial_reconstruction(circuit) - def _setup_for_dist_reconstruction (self, load_data, comm_backend: str, world_rank: int, world_size: int, gpus_per_node: int, timeout=1): + def _setup_for_dist_reconstruction (self, load_data, comm_backend: str, world_rank: int, world_size: int, gpus_per_node: int, timeout: int): """ Sets up to call the distributed kernel. Worker nodes @@ -113,7 +116,7 @@ def _setup_for_dist_reconstruction (self, load_data, comm_backend: str, world_ra timeout: Max amount of time pytorch will let any one node wait on a message before killing it. """ - + # GPU identifier on local compute cluster self.local_rank = world_rank - gpus_per_node * (world_rank // gpus_per_node) self.pytorch_distributed = True timelimit = timedelta(hours=timeout) # Bounded wait time to prevent deadlock diff --git a/dist_driver.slurm b/dist_driver.slurm deleted file mode 100644 index 0aeb1e0..0000000 --- a/dist_driver.slurm +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash -#SBATCH --output=example_dist_driver.out -#SBATCH --nodes=1 # node count -#SBATCH --ntasks-per-node=2 # total number of tasks across all nodes -#SBATCH --cpus-per-task=12 # cpu-cores per task (>1 if multi-threaded tasks) -#SBATCH --mem=40G # memory per cpu-core (4G is default) -#SBATCH --gres=gpu:4 -#SBATCH --time=00:01:55 # total run time limit (HH:MM:SS) -#SBATCH --mail-type=begin # send email when job begins -#SBATCH --mail-type=end # send email when job ends -#SBATCH --mail-type=fail # send mail if job fails - -# Setup for Multi-node Workload -export MASTER_PORT=$(get_free_port) # Get a free Port -export WORLD_SIZE=$(($SLURM_NNODES * $SLURM_NTASKS_PER_NODE)) -master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_ADDR=$master_addr - -# Sanity Print -echo "MASTER_ADDR="$MASTER_ADDR -echo "MASTER_PORT="$MASTER_PORT -echo "WORLD_SIZE="$WORLD_SIZE - -# Load Modules -module purge -module load anaconda3/2024.2 -conda activate cutqc4 -module load gurobi/10.0.1
-export PYTHONPATH=/usr/licensed/gurobi/10.0.1/linux64/lib/python3.8_utf32 - -srun python dist_driver.py diff --git a/example.py b/example.py deleted file mode 100644 index 3f398f9..0000000 --- a/example.py +++ /dev/null @@ -1,47 +0,0 @@ -import os, math, logging - -# from cutqc_runtime.main import CutQC # Use this just to benchmark the runtime - -from cutqc.main import CutQC # Use this for exact computation - -from helper_functions.benchmarks import generate_circ - -logging.disable(logging.WARNING) -os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1" - -# Comment this line if using GPU -os.environ["CUDA_VISIBLE_DEVICES"] = "-1" - -if __name__ == "__main__": - circuit_type = "supremacy" - circuit_size = 16 - circuit = generate_circ( - num_qubits=circuit_size, - depth=1, - circuit_type=circuit_type, - reg_name="q", - connected_only=True, - seed=None, - ) - cutqc = CutQC( - name="%s_%d" % (circuit_type, circuit_size), - circuit=circuit, - cutter_constraints={ - "max_subcircuit_width": math.ceil(circuit.num_qubits / 4 * 3), - "max_subcircuit_cuts": 10, - "subcircuit_size_imbalance": 2, - "max_cuts": 10, - "num_subcircuits": [2, 3], - }, - verbose=True, - ) - cutqc.cut() - if not cutqc.has_solution: - raise Exception("The input circuit and constraints have no viable cuts") - - # add comment - cutqc.evaluate(eval_mode="sv", num_shots_fn=None) - cutqc.build(mem_limit=32, recursion_depth=1) - print("Cut: %d recursions." % (cutqc.num_recursions)) - print(cutqc.approximation_bins) - cutqc.clean_data() diff --git a/explaining_example.md b/explaining_example.md new file mode 100644 index 0000000..86a2e7c --- /dev/null +++ b/explaining_example.md @@ -0,0 +1,205 @@ +# Explaining the Reconstruction Example Script + +This document explains the basic environment setup for distributed reconstruction, along with how to run both `cut_and_eval_example.py` and `reconstruction_example.py`. 
+ +This guide assumes SLURM is used to execute Python scripts; however, running without SLURM is not much different, as certain environment variables will need to be manually set. More details on this are shown below. + +## 1 - Setup + +#### Required Environment Variables + +Prior to distributed reconstruction, the following [local environment variables](https://pytorch.org/tutorials//intermediate/dist_tuto.html?highlight=init_process_group#:~:text=MASTER_PORT%3A%20A%20free,or%20a%20worker.) must be set: + +- MASTER_PORT: A free port on the machine that will host the process with rank 0 +- MASTER_ADDR: IP address of the machine that will host the process with rank 0 +- WORLD_SIZE: The total number of processes, so that the master knows how many workers to wait for +- RANK: Rank of each process, so they will know whether it is the master or a worker + + +#### SLURM + +In the given example, [SLURM](https://slurm.schedmd.com/), a compute cluster scheduler, is used to automate the process of finding a free port for the master and setting the respective environment variables. + +When executed, the `reconstruction_example.slurm` SLURM script implicitly sets rank and GPUs per node environment variables as: + +- SLURM_GPUS_ON_NODE +- SLURM_PROCID + +> (note this is on each respective machine) + +Moreover, the address and port information is explicitly set in 'reconstruction_example.slurm' with: + + ```console + # Each node is set the following ENV VARS + export MASTER_PORT=$(get_free_port) # Get a free Port + export WORLD_SIZE=$(($SLURM_NNODES * $SLURM_NTASKS_PER_NODE)) + master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) + export MASTER_ADDR=$master_addr + ``` + +### Without Slurm + +The example can be run without SLURM by manually setting the environment variables and executing an instance of the example on each machine. Note that this is true only for distributed reconstruction, i.e., cut and evaluation should be done on a single, separate, isolated process.
+ +## 2 - Running 'cut_and_eval_example.py' + +Before reconstruction can be performed, any given circuit must first be cut and evaluated, and the results saved to a `.pkl` file. This can be done using `cutqc.save_cutqc_obj(filename)` as seen in the `cut_and_eval_example.py` file. + +Run `cut_and_eval_example.py` using the corresponding SLURM script: + +``` + sbatch cut_and_eval_example.slurm +``` + +Running the SLURM script should produce an output file containing (or printed to console, if running without SLURM): + +``` + --- Cut --- + Set parameter GURO_PAR_SPECIAL + Set parameter TokenServer to value "license.rc.princeton.edu" + --- Evaluate --- + --- Dumping CutQC Object into adder_example.pkl --- + Completed +``` + +## 3 - Running 'reconstruction_example.py' + +Once the example adder circuit is cut and the CutQC instance is saved as a pickle file, distributed reconstruction can be performed. + +### Breakdown of example + +```{.Python .numberLines .lineAnchors} +import os +from cutqc.main import CutQC + +# Environment variables +GPUS_PER_NODE = int(os.environ["SLURM_GPUS_ON_NODE"]) +WORLD_RANK = int(os.environ["SLURM_PROCID"]) +WORLD_SIZE = int(os.environ["WORLD_SIZE"]) + +if __name__ == "__main__": + full_path = 'adder_example.pkl' + compute_backend = 'GPU' + comm_backend = 'nccl' + + # Load CutQC Instance from Pickle + print(f'--- Running {full_path} ---') + cutqc = CutQC( + pytorch_distributed = True, + reconstruct_only = True, + load_data = full_path, + compute_backend = compute_backend, + comm_backend = comm_backend, + gpus_per_node = GPUS_PER_NODE, + world_rank = WORLD_RANK, + world_size = WORLD_SIZE + ) + + # Initiate Reconstruct + compute_time = cutqc.build(mem_limit=32, recursion_depth=1) + approximation_error = cutqc.verify() + + print('--- Reconstruction Complete ---') + print("Total Reconstruction Time:\t{}".format(compute_time)) + print("Approximation Error:\t{}".format(approximation_error)) + cutqc.destroy_distributed() +``` + +#### Lines 5 - 7: + + ```python + 
GPUS_PER_NODE = int(os.environ["SLURM_GPUS_ON_NODE"]) + WORLD_RANK = int(os.environ["SLURM_PROCID"]) + WORLD_SIZE = int(os.environ["WORLD_SIZE"]) + ``` + +As mentioned in the first section, distributed reconstruction requires process information. On initialization of a CutQC object, only the world rank, world size, and GPUs per machine are required to be passed. + +#### Lines 10 - 12: + + ```python + full_path = 'adder_example.pkl' + compute_backend = 'GPU' + comm_backend = 'nccl' + ``` + +`full_path` should be the full path to the pickled CutQC object used to cut and evaluate the original target circuit. + +`compute_backend` is the device backend used. In this case, GPU is used but CPU is also possible. In cases of memory-intensive subcircuit reconstruction problem instances, CPU may be necessary. + +`comm_backend` is the [communication backend](https://pytorch.org/docs/stable/distributed.html), which facilitates the communication of data between nodes during computation/execution. In this case, the communication backend used is [NVIDIA's NCCL](https://developer.nvidia.com/nccl). + +#### Lines 16 - 25: + + ```python + cutqc = CutQC( + pytorch_distributed = True, + reconstruct_only = True, + load_data = full_path, + compute_backend = compute_backend, + comm_backend = comm_backend, + gpus_per_node = GPUS_PER_NODE, + world_rank = WORLD_RANK, + world_size = WORLD_SIZE + ) + ``` + +A new CutQC object is created for reconstruction. The previous CutQC instance, used to cut and evaluate, is loaded internally so that the master node can send the partitioned workloads to each worker. + +`reconstruct_only` must be passed as `True` to ensure CutQC does not attempt to cut and instead initializes for reconstruction. The `pytorch_distributed` parameter indicates to CutQC which computational framework to use (TensorFlow or PyTorch); for multinode distributed reconstruction, it must be passed as True.
+ +#### Lines 28 - 29: + + ```python + compute_time = cutqc.build(mem_limit=32, recursion_depth=1) + approximation_error = cutqc.verify() + ``` + +Once the CutQC object is instantiated, the reconstruction process can be initiated by calling `build`. + +In addition to the explicitly passed memory limit, there is an implicit memory limit imposed by the compute device itself. In some cases, the partitioned workload may exceed the capacity of each respective GPU; if this occurs, then the distributed graph contractor will fail with the message 'Error 2006: Batch of size $M$, too large for GPU device of size $N$', where $M$ is the size of the batch and $N$ is the memory capacity of a GPU. + +A simple solution is to increase the number of GPU nodes used; alternatively, a compute device type, like CPU, with more memory can be used instead. + +Finally, the `verify` method is called in the example as a sanity check which computes the subcircuit reconstruction using TensorFlow on a single node. This can be removed, as it has no practical effect on reconstruction. + +#### Line 34: + + ```python + cutqc.destroy_distributed() + ``` + +When reconstruction is complete, resources can be freed by calling `destroy_distributed`.
+ +### Executing and Output + +Once the circuit is cut and the results are computed, you can run parallel reconstruction by calling the SLURM script: + +``` + sbatch reconstruction_example.slurm +``` + +Running the SLURM script should produce an output file containing: + +``` + MASTER_ADDR=adroit-h11g3 + MASTER_PORT=31179 + WORLD_SIZE=2 + --- Running adder_example.pkl --- + self.parallel_reconstruction: True + Worker 1, compute_device: cuda:1 + --- Running adder_example.pkl --- + self.parallel_reconstruction: True + Worker 0, compute_device: cuda:0 + subcircuit_entry_length: [32, 32] + LEN(DATASET): 16 + NUMBER BATCHES: 1 + Compute Time: 1.5637431228533387 + Approximate Error: 1.2621774483536279e-29 + verify took 0.011 + --- Reconstruction Complete --- + Total Reconstruction Time: 1.5637431228533387 + Approximation Error: 1.2621774483536279e-29 + DESTROYING NOW! 1.5637431228533387 + WORKER 1 DYING +``` \ No newline at end of file diff --git a/getting_started.md deleted file mode 100644 index 3c70480..0000000 --- a/getting_started.md +++ /dev/null @@ -1,72 +0,0 @@ -## Setting up the environment - -First we need to setup a conda environement with the following - - conda create --name cutqc python=3.12 - conda activate cutqc - conda config --add channels https://conda.anaconda.org/gurobi - conda install gurobi - pip install numpy qiskit matplotlib pydot scipy tqdm pylatexenc scikit-learn -tensorflow networkx torch qiskit-aer psutil - -## Running with slurm - -When initializing distributed, the worker nodes must have a way to initialize communication with the host node. The following must be set in the slurm file for running the distributed reconstruction to ensure this can happen.
- - # Setup for Multi-node Workload - export MASTER_PORT=$(get_free_port) # Get a free Port - export WORLD_SIZE=$(($SLURM_NNODES * $SLURM_NTASKS_PER_NODE)) - master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) - export MASTER_ADDR=$master_addr - - # Sanity Print - echo "MASTER_ADDR="$MASTER_ADDR - echo "MASTER_PORT="$MASTER_PORT - echo "WORLD_SIZE="$WORLD_SIZE - - -## Cutting and Evaluation - -Before reconstruction can be performed, any given circuit must first be cut and evaluated, and the results saved to a `.pkl` file. This can be done using `cutqc.save_cutqc_obj(filename)` as seen in the `cut_and_eval.py` file. - -Run `cut_and_eval.py` using the corresponding Slurm script. In other words: - - sbatch cut_and_eval.slurm - -Running the slurm script should produce an output file containing: - - --- Cut --- - Set parameter GURO_PAR_SPECIAL - Set parameter TokenServer to value "license.rc.princeton.edu" - --- Evaluate --- - --- Dumping CutQC Object into adder_example.pkl --- - Completed - -## Reconstruction - -Once the circuit is cut and the results are computed, you can run parallel reconstruction by calling the slurm script: - - sbatch dist_driver.slurm - -Running the slurm script should produce an output file containing: - - MASTER_ADDR=adroit-h11g3 - MASTER_PORT=31179 - WORLD_SIZE=2 - --- Running adder_example.pkl --- - self.parallel_reconstruction: True - Worker 1, compute_device: cuda:1 - --- Running adder_example.pkl --- - self.parallel_reconstruction: True - Worker 0, compute_device: cuda:0 - subcircuit_entry_length: [32, 32] - LEN(DATASET): 16 - NUMBER BATCHES: 1 - Compute Time: 1.5637431228533387 - Approximate Error: 1.2621774483536279e-29 - verify took 0.011 - --- Reconstruction Complete --- - Total Reconstruction Time: 1.5637431228533387 - Approxamate Error: 1.2621774483536279e-29 - DESTROYING NOW! 
1.5637431228533387 - WORKER 1 DYING \ No newline at end of file diff --git a/readme.md b/readme.md index d78ca95..b3970b1 100644 --- a/readme.md +++ b/readme.md @@ -31,7 +31,9 @@ Follow the [instructions](https://support.gurobi.com/hc/en-us/articles/147996775 pip install -r requirements.txt ``` -## Example Code +> Note on Qiskit Version: If you get warnings about conditionals '==' and 'is', switching to qiskit==0.45.2 may fix the issue. + +## General Example Code For an example, run: ``` python example.py @@ -40,6 +42,9 @@ This runs an example 16-qubit supremacy circuit. The output qubits are in a scrambled order based on the subcircuit post-processing sequence. A function that converts an arbitrary state of interest to the original order will be added. +## Example Reconstruction +See `explaning_example.md` for running the example scripts. + ## Citing CutQC If you use CutQC in your work, we would appreciate it if you cite our paper: diff --git a/dist_driver.py b/reconstruction_example.py similarity index 90% rename from dist_driver.py rename to reconstruction_example.py index 2499795..3ce4304 100644 --- a/dist_driver.py +++ b/reconstruction_example.py @@ -1,6 +1,7 @@ -# Title: dist_driver.py -# Description: Example of how CutQC can be used to effececiently reconstruct -# subcircuits +''' +Title: dist_driver.py +Description: Example of how CutQC can be used to efficiently reconstruct subcircuits +''' import os from cutqc.main import CutQC From 4e302de865058087f1a78ecce7872e40d8a19ac2 Mon Sep 17 00:00:00 2001 From: Chuck Garcia Date: Wed, 15 Jan 2025 13:24:51 -0500 Subject: [PATCH 09/11] Add example slurm --- cut_and_eval_example.slurm | 28 ++++++++++++++++++++++++++++ reconstruction_example.slurm | 31 +++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) create mode 100644 cut_and_eval_example.slurm create mode 100644 reconstruction_example.slurm diff --git a/cut_and_eval_example.slurm b/cut_and_eval_example.slurm new file mode 100644 index 
0000000..5d9240e --- /dev/null +++ b/cut_and_eval_example.slurm @@ -0,0 +1,28 @@ +#!/bin/bash +#SBATCH --output='cut_and_eval_example.out' +#SBATCH --nodes=1 # node count +#SBATCH --ntasks-per-node=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=12 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --mem=40G # memory per cpu-core (4G is default) +#SBATCH --time=00:00:55 # total run time limit (HH:MM:SS) + +# Setup for Multi-node Workload +export MASTER_PORT=$(get_free_port) # Get a free Port +export WORLD_SIZE=$(($SLURM_NNODES * $SLURM_NTASKS_PER_NODE)) +master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) +export MASTER_ADDR=$master_addr + +# Sanity Print +echo "MASTER_ADDR="$MASTER_ADDR +echo "MASTER_PORT="$MASTER_PORT +echo "WORLD_SIZE="$WORLD_SIZE + +# Load Modules +module purge +module load anaconda3/2024.2 +conda activate cutqc +module load gurobi/10.0.1 +export PYTHONPATH=/usr/licensed/gurobi/10.0.1/linux64/lib/python3.8_utf32 + +python3 cut_and_eval_example.py + diff --git a/reconstruction_example.slurm b/reconstruction_example.slurm new file mode 100644 index 0000000..2aac011 --- /dev/null +++ b/reconstruction_example.slurm @@ -0,0 +1,31 @@ +#!/bin/bash +#SBATCH --output=reconstruction_example.out +#SBATCH --nodes=1 # node count +#SBATCH --ntasks-per-node=2 # total number of tasks across all nodes +#SBATCH --cpus-per-task=12 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --mem=40G # memory per cpu-core (4G is default) +#SBATCH --gres=gpu:4 +#SBATCH --time=00:01:55 # total run time limit (HH:MM:SS) +#SBATCH --mail-type=begin # send email when job begins +#SBATCH --mail-type=end # send email when job ends +#SBATCH --mail-type=fail # send mail if job fails + +# Setup for Multi-node Workload +export MASTER_PORT=$(get_free_port) # Get a free Port +export WORLD_SIZE=$(($SLURM_NNODES * $SLURM_NTASKS_PER_NODE)) +master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) # Master node is 
first host in list of hostnames +export MASTER_ADDR=$master_addr + +# Sanity Print +echo "MASTER_ADDR="$MASTER_ADDR +echo "MASTER_PORT="$MASTER_PORT +echo "WORLD_SIZE="$WORLD_SIZE + +# Load Modules +module purge +module load anaconda3/2024.2 +conda activate cutqc +module load gurobi/10.0.1 +export PYTHONPATH=/usr/licensed/gurobi/10.0.1/linux64/lib/python3.8_utf32 + +srun python reconstruction_example.py From 7b2b5b9e8486ab73d5b5c7d85db2cdb9206ba097 Mon Sep 17 00:00:00 2001 From: Chuck Garcia Date: Wed, 15 Jan 2025 13:39:25 -0500 Subject: [PATCH 10/11] Update readme --- readme.md | 9 --------- 1 file changed, 9 deletions(-) diff --git a/readme.md b/readme.md index b3970b1..0930bb7 100644 --- a/readme.md +++ b/readme.md @@ -33,15 +33,6 @@ pip install -r requirements.txt > Note on Qiskit Version: If you get warnings about conditionals '==' and 'is', switching to qiskit==0.45.2 may fix the issue. -## General Example Code -For an example, run: -``` -python example.py -``` -This runs an example 16-qubit supremacy circuit. -The output qubits are in a scrambled order based on the subcircuit post-processing sequence. -A function that converts an arbitrary state of interest to the original order will be added. - ## Example Reconstruction See `explaning_example.md` for running the example scripts. 
From e68c06bba7be22189c01666d3e2f66c7bc40fdae Mon Sep 17 00:00:00 2001 From: chuckgarcian Date: Wed, 11 Jun 2025 09:37:50 -0400 Subject: [PATCH 11/11] Update CutQC to qiskit 2.0.2 (latest) --- cut_and_eval_example.slurm | 16 +---- cutqc/cutter.py | 132 ++++++++++++++++++++++++++++++++--- reconstruction_example.slurm | 6 +- 3 files changed, 129 insertions(+), 25 deletions(-) diff --git a/cut_and_eval_example.slurm b/cut_and_eval_example.slurm index 5d9240e..81aaf44 100644 --- a/cut_and_eval_example.slurm +++ b/cut_and_eval_example.slurm @@ -6,23 +6,11 @@ #SBATCH --mem=40G # memory per cpu-core (4G is default) #SBATCH --time=00:00:55 # total run time limit (HH:MM:SS) -# Setup for Multi-node Workload -export MASTER_PORT=$(get_free_port) # Get a free Port -export WORLD_SIZE=$(($SLURM_NNODES * $SLURM_NTASKS_PER_NODE)) -master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_ADDR=$master_addr - -# Sanity Print -echo "MASTER_ADDR="$MASTER_ADDR -echo "MASTER_PORT="$MASTER_PORT -echo "WORLD_SIZE="$WORLD_SIZE - # Load Modules module purge module load anaconda3/2024.2 -conda activate cutqc -module load gurobi/10.0.1 -export PYTHONPATH=/usr/licensed/gurobi/10.0.1/linux64/lib/python3.8_utf32 +conda activate CutQCSummer2025 +module load gurobi/12.0.0 python3 cut_and_eval_example.py diff --git a/cutqc/cutter.py b/cutqc/cutter.py index 20ff198..fea7fa0 100644 --- a/cutqc/cutter.py +++ b/cutqc/cutter.py @@ -134,11 +134,10 @@ def _add_constraints(self): """ for v in range(self.n_vertices): self.model.addConstr( + gp.quicksum( [self.vertex_var[i][v] for i in range(self.num_subcircuit)] - ), - gp.GRB.EQUAL, - 1, + ) == 1 ) """ @@ -310,41 +309,158 @@ def solve(self): def read_circ(circuit): dag = circuit_to_dag(circuit) edges = [] + node_name_ids = {} id_node_names = {} vertex_ids = {} + + topological_node_id = 0 + qubit_gate_counter = {} + + for qubit in dag.qubits: + qubit_gate_counter[qubit] = 0 + + # Assign vertices a unique id w.r.t. 
their topological order and a unique name + # given from its op and qubits + for vertex in dag.topological_op_nodes(): + if len(vertex.qargs) != 2: + raise Exception("vertex does not have 2 qargs!") + + arg0, arg1 = vertex.qargs + vertex_name = "%s[%d]%d %s[%d]%d" % ( + arg0._register.name, + arg0._index, + qubit_gate_counter[arg0], + arg1._register.name, + arg1._index, + qubit_gate_counter[arg1], + ) + + qubit_gate_counter[arg0] += 1 + qubit_gate_counter[arg1] += 1 + + + if vertex_name not in node_name_ids and vertex._node_id not in vertex_ids: + node_name_ids[vertex_name] = topological_node_id + id_node_names[topological_node_id] = vertex_name + vertex_ids[vertex._node_id] = topological_node_id + + topological_node_id += 1 + + # Collect edge ids + for u, v, _ in dag.edges(): + + if type(u) == DAGOpNode and type(v) == DAGOpNode: + u_id = vertex_ids[u._node_id] + v_id = vertex_ids[v._node_id] + + edges.append((u_id, v_id)) + + n_vertices = dag.size() + + return n_vertices, edges, node_name_ids, id_node_names + + +def read_circ_2(circuit): + + dag = circuit_to_dag(circuit) + edges = [] + + node_name_ids = {} + id_node_names = {} + vertex_ids = {} + curr_node_id = 0 qubit_gate_counter = {} + + all_nodes_sanity = [] + all_nodes_sanity_name = [] + for qubit in dag.qubits: qubit_gate_counter[qubit] = 0 + i = 0 for vertex in dag.topological_op_nodes(): + + all_nodes_sanity.append (id(vertex)) + if len(vertex.qargs) != 2: raise Exception("vertex does not have 2 qargs!") - + arg0, arg1 = vertex.qargs vertex_name = "%s[%d]%d %s[%d]%d" % ( arg0._register.name, arg0._index, qubit_gate_counter[arg0], + arg1._register.name, arg1._index, qubit_gate_counter[arg1], ) + + all_nodes_sanity_name.append (vertex_name) + qubit_gate_counter[arg0] += 1 qubit_gate_counter[arg1] += 1 - # print(vertex.op.label,vertex_name,curr_node_id) + + + ## Add if vertex_name not in node_name_ids and id(vertex) not in vertex_ids: + i = i + 1 + print (f"vertex name Executed! 
{id(vertex)}") node_name_ids[vertex_name] = curr_node_id id_node_names[curr_node_id] = vertex_name vertex_ids[id(vertex)] = curr_node_id curr_node_id += 1 - for u, v, _ in dag.edges(): - if isinstance(u, DAGOpNode) and isinstance(v, DAGOpNode): + i = 0 + for u, v, _ in dag.edges(): + # if isinstance(u, DAGOpNode) and isinstance(v, DAGOpNode): + print (f"u is in list: {id(u) in all_nodes_sanity }") + print (f"v is in list: {id(v) in all_nodes_sanity }") + + print (f"Type(u){type(u)}") + print (f"Type(v){type(v)}") + + if type(u) == DAGOpNode and type(v) == DAGOpNode: + i = i + 1 + + print (u.op.name) + print (v.op.name) + print (f"Line Executed! {i}") + + arg0, arg1 = u.qargs + + vertex_name_u = "%s[%d]%d %s[%d]%d" % ( + arg0._register.name, + arg0._index, + qubit_gate_counter[arg0], + + arg1._register.name, + arg1._index, + qubit_gate_counter[arg1], + ) + arg0, arg1 = v.qargs + vertex_name_v = "%s[%d]%d %s[%d]%d" % ( + arg0._register.name, + arg0._index, + qubit_gate_counter[arg0], + + arg1._register.name, + arg1._index, + qubit_gate_counter[arg1], + ) + print (f"u name is in list: {id(u) in all_nodes_sanity }") + print (f"v name is in list: {id(v) in all_nodes_sanity }") + + print (id(u)) + print (id(v)) + + ## Ensure end nodes are in the all node list + u_id = vertex_ids[id(u)] v_id = vertex_ids[id(v)] - edges.append((u_id, v_id)) + edges.append((u_id, v_id)) n_vertices = dag.size() return n_vertices, edges, node_name_ids, id_node_names diff --git a/reconstruction_example.slurm b/reconstruction_example.slurm index 2aac011..a41388a 100644 --- a/reconstruction_example.slurm +++ b/reconstruction_example.slurm @@ -24,8 +24,8 @@ echo "WORLD_SIZE="$WORLD_SIZE # Load Modules module purge module load anaconda3/2024.2 -conda activate cutqc -module load gurobi/10.0.1 -export PYTHONPATH=/usr/licensed/gurobi/10.0.1/linux64/lib/python3.8_utf32 +conda activate CutQCSummer2025 +module load gurobi/12.0.0 srun python reconstruction_example.py +