diff --git a/docs/conf.py b/docs/conf.py
index b871475..8e556dd 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -7,6 +7,7 @@ extensions = [
     "myst_nb",
     "sphinx_book_theme",
+    "sphinx_design",
     "sphinx.ext.autodoc",
 ]
diff --git a/docs/gauxc/index.rst b/docs/gauxc/index.rst
new file mode 100644
index 0000000..1e2b55f
--- /dev/null
+++ b/docs/gauxc/index.rst
@@ -0,0 +1,12 @@
+Integrating Skala in Electronic Structure Packages via GauXC
+============================================================
+
+The `GauXC `__ library provides an implementation for evaluating exchange-correlation functionals from an input density matrix.
+We provide Skala as an extension to the GauXC library, allowing users to easily integrate Skala into electronic structure packages that already interface with GauXC.
+The following sections provide instructions on how to install GauXC with Skala support and how to use Skala in your own electronic structure package via GauXC.
+
+
+.. toctree::
+
+   installation
+   standalone
diff --git a/docs/gauxc/installation.rst b/docs/gauxc/installation.rst
new file mode 100644
index 0000000..effbfa0
--- /dev/null
+++ b/docs/gauxc/installation.rst
@@ -0,0 +1,348 @@
+.. _gauxc_install:
+
+Installing GauXC
+================
+
+In this section, we will provide instructions on how to install GauXC with Skala support based on the conda-forge ecosystem.
+As part of this tutorial we will be
+
+* installing the dependencies for building GauXC
+* configuring GauXC with different options
+* testing out the Skala implementation in GauXC
+* installing the GauXC library
+* reusing the installed GauXC from a CMake build system
+
+
+Prerequisites
+-------------
+
+For this tutorial, we will use the `mamba `__ package manager for setting up the environment and installing dependencies.
+If you do not have mamba installed, you can download the `miniforge `__ installer.
+
+First, we will create a new environment with all the required dependencies for building GauXC with Skala support.
+We provide three different configurations depending on whether you want to build GauXC with OpenMP, MPI, or CUDA support.
+
+.. dropdown:: GauXC dependencies
+
+   The following dependencies are required for building GauXC with Skala support:
+
+   - C/C++ compiler (with C++17 support)
+   - CMake (version 3.15 or higher)
+   - `exchcxx `__\ * (version 1 or higher)
+   - `libxc `__\ * (version 7 or higher)
+   - `integratorxx `__\ * (version 1 or higher)
+   - `gau2grid `__\ * (version 2.0.6 or higher)
+   - `libtorch `__ (CPU or CUDA version depending on your configuration)
+   - `nlohmann_json `__\ * (version 3.9.1 or higher)
+   - BLAS library (like OpenBLAS, MKL, etc.)
+
+   When building with MPI support via ``-DGAUXC_ENABLE_MPI=on`` (default ``off``),
+   the following dependencies are also required:
+
+   - MPI implementation (like OpenMPI, MPICH, etc.)
+
+   When building with CUDA support via ``-DGAUXC_ENABLE_CUDA=on`` (default ``off``),
+   the following dependencies are also required:
+
+   - CUDA toolkit
+   - `cuBLAS library `__
+   - `Cutlass library `__\ *
+   - `CUB library `__\ *
+
+   When building with HDF5 support via ``-DGAUXC_ENABLE_HDF5=on`` (default ``on``),
+   the following dependencies are also required:
+
+   - `HDF5 `__
+   - `HighFive `__\ * (version 2.4.0 or higher)
+
+   All libraries marked with a * can be automatically fetched by the GauXC build system
+   and do not need to be installed manually.
+
+For this, create a file named ``environment.yml`` with the following content:
+
+.. tab-set::
+   :sync-group: config
+
+   .. tab-item:: OpenMP
+
+      .. 
code-block:: yaml + :caption: environment.yml + + name: gauxc-dev + channels: + - conda-forge + dependencies: + # build requirements + - c-compiler + - cxx-compiler + - cmake >=3.15,<4 + - ninja + - nlohmann_json >=3.9 + # host/runtime requirements + - exchcxx >=1.0 + - gau2grid >=2.0.6 + - hdf5 + - libblas + - pytorch >=2.0 cpu_* + + .. tab-item:: MPI + + .. code-block:: yaml + :caption: environment.yml + + name: gauxc-dev + channels: + - conda-forge + dependencies: + # build requirements + - c-compiler + - cxx-compiler + - cmake >=3.15,<4 + - ninja + - nlohmann_json >=3.9 + # host/runtime requirements + - openmpi # pick mpich if that matches your stack + - exchcxx >=1.0 + - gau2grid >=2.0.6 + - hdf5 * mpi_* + - libblas + - pytorch >=2.0 cpu_* + + .. tab-item:: CUDA + + .. code-block:: yaml + :caption: environment.yml + + name: gauxc-dev + channels: + - conda-forge + dependencies: + # build requirements + - c-compiler + - cxx-compiler + - cuda-compiler + - cmake >=3.15,<4 + - ninja + - nlohmann_json >=3.9 + # host/runtime requirements + - libxc >=7,<8 + - gau2grid >=2.0.6 + - hdf5 + - libblas + - pytorch >=2.0 cuda* + +Create and activate the environment: + +.. code-block:: none + + mamba env create -n gauxc-dev -f environment.yml + mamba activate gauxc-dev + +Verify that the toolchain is visible: + +.. code-block:: bash + + cmake --version + python -c "import torch; print(torch.__version__)" + + +Obtain GauXC with Skala +----------------------- + +Download the pre-packaged source bundle from the Skala release page: + +.. code-block:: none + + curl -L https://github.com/microsoft/skala/releases/download/v1.1.0/gauxc-skala.tar.gz | tar xzv + +.. tip:: + + To verify the downloaded tarball you can obtain a checksum + + .. code-block:: none + + curl -L https://github.com/microsoft/skala/releases/download/v1.1.0/gauxc-skala.tar.gz > gauxc-skala.tar.gz + curl -L https://github.com/microsoft/skala/releases/download/v1.1.0/gauxc-skala.tar.gz.sha256 | sha256sum -c + tar xzvf gauxc-skala.tar.gz + +The archive expands into a ``gauxc`` directory that already contains the Skala patches. + + +Configure and build +------------------- + +Create an out-of-tree build directory and pick the configuration that matches your backend. + +.. tab-set:: + :sync-group: config + + .. tab-item:: OpenMP + + .. code-block:: none + + cmake -B build -S gauxc -G Ninja \ + -DGAUXC_ENABLE_OPENMP=on \ + -DGAUXC_ENABLE_MPI=off \ + -DGAUXC_ENABLE_CUDA=off \ + -DCMAKE_INSTALL_PREFIX=${CONDA_PREFIX} + cmake --build build + + .. tab-item:: MPI + + .. code-block:: none + + cmake -B build -S gauxc -G Ninja \ + -DGAUXC_ENABLE_OPENMP=on \ + -DGAUXC_ENABLE_MPI=on \ + -DGAUXC_ENABLE_CUDA=off \ + -DCMAKE_INSTALL_PREFIX=${CONDA_PREFIX} + cmake --build build + + .. tab-item:: CUDA + + .. code-block:: none + + cmake -B build -S gauxc -G Ninja \ + -DGAUXC_ENABLE_OPENMP=on \ + -DGAUXC_ENABLE_MPI=off \ + -DGAUXC_ENABLE_CUDA=on \ + -DCMAKE_INSTALL_PREFIX=${CONDA_PREFIX} + cmake --build build + +.. tip:: + + If CMake cannot find libtorch, the ``Torch_DIR`` variable can be set to help discover the package. + For conda-forge installed pytorch this should be set as ``-DTorch_DIR=${CONDA_PREFIX}/share/cmake/Torch`` + and for pip installed pytorch the CMake config file will be in ``${CONDA_PREFIX}/lib/python3.11/site-packages/torch/share/cmake/Torch`` + where the Python version should be adjusted accordingly to the environment. 
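+
+.. tip::
+
+   Instead of remembering these locations, the Torch CMake package directory can also be queried from Python.
+   The following is a minimal sketch assuming ``torch`` is importable in the active environment;
+   ``torch.utils.cmake_prefix_path`` points at the ``share/cmake`` directory that ships with the installed PyTorch.
+
+   .. code-block:: python
+
+      # Query the CMake package directory of the installed PyTorch.
+      import torch
+
+      # Suitable for -DCMAKE_PREFIX_PATH (contains the Torch/ config directory).
+      print(torch.utils.cmake_prefix_path)
+
+      # Suitable for -DTorch_DIR.
+      print(f"{torch.utils.cmake_prefix_path}/Torch")
+
+   The printed path works for both conda-forge and pip installations and can be passed to CMake
+   via ``-DCMAKE_PREFIX_PATH`` or ``-DTorch_DIR`` as described in the previous tip.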
+ + +Quick verification +------------------ + +After the build finishes, run the bundled regression test to confirm that Skala-enabled functionals +are working correctly. The Skala implementation can run different traditional functionals, like PBE and TPSS, +which can be compared against other libraries. + +.. code-block:: bash + + cd gauxc/tests/ref_data + ../../../build/tests/standalone_driver onedft_input.inp + +Expected output includes the total TPSS energy computed using a checkpoint compatible for the Skala implementation +for the reference density matrix. + +.. tip:: + + If the executable cannot locate libtorch or other shared libraries, double-check + that ``LD_LIBRARY_PATH`` includes ``${CONDA_PREFIX}/lib`` + (activating the environment usually handles this). + + +Install the library +------------------- + +Install into the active conda environment so downstream projects can pick up the CMake config files. + +.. code-block:: bash + + cmake --install build + +This installs headers, libraries, and CMake config. + + +Integrate with your codebase +---------------------------- + +Using an installed GauXC +~~~~~~~~~~~~~~~~~~~~~~~~ + +Add the following to your CMake project, ensuring that ``CMAKE_PREFIX_PATH`` contains +``${CONDA_PREFIX}`` (activation scripts typically set this). + +.. code-block:: cmake + + find_package(gauxc CONFIG REQUIRED) + + if(NOT gauxc_HAS_ONEDFT) + message(FATAL_ERROR "GauXC found but Skala/OneDFT was not enabled during the build") + endif() + + target_link_libraries(my_dft_driver PRIVATE gauxc::gauxc) + +The imported target propagates include directories, compile definitions, and linkage against BLAS, +Torch, and optional MPI/CUDA components. + +Embedding GauXC via FetchContent +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If you need to vend GauXC directly from your build, use ``FetchContent`` while mirroring the +options chosen above. + +.. code-block:: cmake + + set(Skala_GauXC_URL "https://github.com/microsoft/skala/releases/download/v1.1.0/gauxc-skala.tar.gz") + set(Skala_GauXC_SHA256 "ed3102485f6d838c8076a03162b11a1d7c3fd52b212ba6a048db2e9089c98f3c") + + option(Skala_GauXC_ENABLE_OPENMP "Enable OpenMP support in GauXC" ON) + option(Skala_GauXC_ENABLE_MPI "Enable MPI support in GauXC" OFF) + option(Skala_GauXC_ENABLE_CUDA "Enable CUDA support in GauXC" OFF) + + find_package(gauxc QUIET CONFIG) + if(NOT gauxc_FOUND) + include(FetchContent) + + message(STATUS "Could not find GauXC... 
Building GauXC from source")
+     message(STATUS "GAUXC URL: ${Skala_GauXC_URL}")
+
+     set(GAUXC_ENABLE_ONEDFT ON CACHE BOOL "" FORCE)
+     set(GAUXC_ENABLE_TESTS OFF CACHE BOOL "" FORCE)
+     set(GAUXC_ENABLE_OPENMP ${Skala_GauXC_ENABLE_OPENMP} CACHE BOOL "" FORCE)
+     set(GAUXC_ENABLE_MPI ${Skala_GauXC_ENABLE_MPI} CACHE BOOL "" FORCE)
+     set(GAUXC_ENABLE_CUDA ${Skala_GauXC_ENABLE_CUDA} CACHE BOOL "" FORCE)
+
+     FetchContent_Declare(
+       gauxc
+       URL ${Skala_GauXC_URL}
+       URL_HASH SHA256=${Skala_GauXC_SHA256}
+       DOWNLOAD_EXTRACT_TIMESTAMP ON
+     )
+     FetchContent_MakeAvailable(gauxc)
+
+   else()
+     if(NOT ${GAUXC_HAS_ONEDFT})
+       message(FATAL_ERROR "GauXC found but without Skala support enabled")
+     endif()
+     if(${Skala_GauXC_ENABLE_OPENMP} AND NOT ${GAUXC_HAS_OPENMP})
+       message(WARNING "GauXC found without OpenMP support but Skala_GauXC_ENABLE_OPENMP is ON")
+     endif()
+     if(${Skala_GauXC_ENABLE_MPI} AND NOT ${GAUXC_HAS_MPI})
+       message(WARNING "GauXC found without MPI support but Skala_GauXC_ENABLE_MPI is ON")
+     endif()
+     if(${Skala_GauXC_ENABLE_CUDA} AND NOT ${GAUXC_HAS_CUDA})
+       message(WARNING "GauXC found without CUDA support but Skala_GauXC_ENABLE_CUDA is ON")
+     endif()
+   endif()
+
+Troubleshooting
+---------------
+
+Torch not found
+   ensure ``Torch_DIR`` points to the libtorch CMake package inside the active environment,
+   or export ``Torch_DIR`` before running CMake.
+
+CUDA mismatch
+   the CUDA toolkit selected by conda must match the version baked into the
+   ``pytorch`` build; reinstall ``pytorch`` if necessary (e.g., ``pytorch ==2.3.* cuda118*``).
+
+Linker errors for BLAS/MPI
+   verify that the conda environment stayed active during the build and that ``cmake`` picked up
+   the toolchain from ``${CONDA_PREFIX}`` via ``CMAKE_PREFIX_PATH``.
+
+Standalone driver cannot find densities
+   run it from ``gauxc/tests/ref_data`` since the paths to the density files are specified relative to the
+   current directory.
+
+.. note::
+
+   Need help? Open an issue on the `Skala repository `__.
\ No newline at end of file
diff --git a/docs/gauxc/standalone.rst b/docs/gauxc/standalone.rst
new file mode 100644
index 0000000..794e2be
--- /dev/null
+++ b/docs/gauxc/standalone.rst
@@ -0,0 +1,227 @@
+GauXC standalone usage
+======================
+
+The GauXC package comes with a standalone driver for testing the evaluation of the exchange-correlation energy with different functionals.
+In this tutorial we will use the standalone driver to evaluate Skala based on density matrices computed with different packages.
+
+.. note::
+
+   For building GauXC and running the standalone driver, check out :ref:`gauxc_install`.
+
+Create GauXC compatible input
+-----------------------------
+
+We will use the ``skala`` package to write a GauXC compatible input for our calculation.
+For this we will run a PySCF calculation and write the molecule, basis set and density matrix in the format expected by GauXC.
+In this example we will use a single-atom system in a small basis set.
+
+.. note::
+
+   We will write the input data as an HDF5 file since GauXC can read its objects directly from HDF5 datasets.
+   The format in the HDF5 file corresponds to the internal structure of GauXC objects and therefore allows us to conveniently inspect the data.
+
+.. 
code-block:: python + + from pyscf import gto + + from skala.gauxc.export import write_gauxc_h5_from_pyscf + from skala.pyscf import SkalaRKS + + mol = gto.M(atom="He 0 0 0", basis="def2-svp", unit="Bohr", spin=0) + ks = SkalaRKS(xc="pbe") + ks.kernel() + + dm = ks.make_rdm1() + exc = ks.scf_summary["exc"] + _, _, vxc = ks._numint.nr_rks(ks.mol, ks.grids, ks.xc, dm) + + write_gauxc_h5_from_pyscf("He_def2svp.h5", mol, dm=dm, exc=exc, vxc=vxc) + +Additionally to the inputs (molecule, basis set, and density matrix) we provide the exchange-correlation energy and potential to allow the standalone driver to compare against our reference calculation. + +Running the GauXC standalone driver +----------------------------------- + +The GauXC standalone driver takes a single input file, where we need to specify the path of our HDF5 file with the input data. +In the input file we specify the ``ONEDFT_MODEL`` as PBE since we used it for our input calculation as well. +Furthermore, we have parameters like ``grid``, ``pruning_scheme``, etc. which define the integration grid settings in GauXC, here we go with a fine grid, Mura-Knowles radial integration scheme and the robust pruning scheme of Psi4. + +.. code-block:: ini + :caption: gauxc_input.inp + + [GAUXC] + ref_file = He_def2svp.h5 + ONEDFT_MODEL = PBE + grid = Fine + pruning_scheme = Robust + RAD_QUAD = MuraKnowles + batch_size = 512 + basis_tol = 2.22e-16 + LB_EXEC_SPACE = Device + INT_EXEC_SPACE = Device + REDUCTION_KERNEL = Default + MEMORY_SIZE = 0.1 + +.. note:: + + Make sure the HDF5 file ``He_def2svp.h5`` is in the same directory as the one where we start the standalone driver. + +To run the standalone driver with this input we run it from the build directory with our input file: + +.. code-block:: text + + ./build/tests/standalone_driver gauxc_input.inp + +For a successful run we will see the following output + +.. code-block:: text + + DRIVER SETTINGS: + REF_FILE = He_def2svp.h5 + GRID = FINE + RAD_QUAD = MURAKNOWLES + PRUNING_SCHEME = ROBUST + BATCH_SIZE = 512 + BASIS_TOL = 2.22e-16 + FUNCTIONAL = PBE0 + LB_EXEC_SPACE = DEVICE + INT_EXEC_SPACE = DEVICE + INTEGRATOR_KERNEL = DEFAULT + LWD_KERNEL = DEFAULT + REDUCTION_KERNEL = DEFAULT + DEN (?) = false + VXC (?) = true + EXX (?) = false + EXC_GRAD (?) = false + DD_PSI (?) = false + DD_PSI_POTENTIAL (?) = false + ONEDFT_MODEL = PBE + FXC_CONTRACTION (?) = false + MEMORY_SIZE = 0.1 + + EXC: -1.054031868349e+00 + EXC = -1.054031868349e+00 + + Load Balancer Timings + LoadBalancer.CreateTasks: 1.50510e+01 ms + MolecularWeights Timings + MolecularWeights: 2.98569e+01 ms + Integrator Timings + XCIntegrator.Allreduce: 4.11500e-03 ms + XCIntegrator.LocalWork: 2.35691e+01 ms + XCIntegrator.LocalWork2: 9.11679e+00 ms + XC Int Duration = 3.35111170000000e-01 s + EXC (ref) = -1.05403142675144e+00 + EXC (calc) = -1.05403186834886e+00 + EXC Diff = -4.18960391377858e-07 + | VXC (ref) |_F = 1.45598265614311e+00 + | VXC (calc) |_F = 1.45598296606474e+00 + RMS VXC Diff = 7.43706533247358e-08 + | VXCz (ref) |_F = 0.00000000000000e+00 + | VXCz (calc) |_F = 0.00000000000000e+00 + RMS VXCz Diff = 0.00000000000000e+00 + +We find a reasonable difference between PySCF and GauXC computed exchange-correlation energy and potential. + +.. note:: + + We can converge this value further by choosing finer grid settings both in PySCF and GauXC. 
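+
+When running such comparisons repeatedly, it can be convenient to script them.
+The following is a minimal sketch (not part of GauXC or the ``skala`` package) that runs the standalone driver,
+parses the ``EXC (calc)`` value from its report, and compares it against the reference energy stored in the
+``EXC`` dataset of our HDF5 input file; the executable path, file names, and tolerance are illustrative and
+should be adapted to your setup.
+
+.. code-block:: python
+
+   import re
+   import subprocess
+
+   import h5py
+
+   # Reference EXC written by write_gauxc_h5_from_pyscf.
+   with h5py.File("He_def2svp.h5") as h5:
+       exc_ref = float(h5["EXC"][()])
+
+   # Run the standalone driver and capture its report.
+   result = subprocess.run(
+       ["./build/tests/standalone_driver", "gauxc_input.inp"],
+       capture_output=True,
+       text=True,
+       check=True,
+   )
+
+   # Extract the "EXC (calc) = ..." line from the driver output.
+   match = re.search(r"EXC \(calc\)\s*=\s*(\S+)", result.stdout)
+   assert match is not None, "EXC (calc) not found in driver output"
+   exc_calc = float(match.group(1))
+
+   print(f"EXC (ref)  = {exc_ref:.12e}")
+   print(f"EXC (calc) = {exc_calc:.12e}")
+   assert abs(exc_calc - exc_ref) < 1e-5, "EXC deviates more than expected"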
+
+Inspecting the GauXC input data
+-------------------------------
+
+Now that we have verified that GauXC can evaluate the exchange-correlation energy from our PySCF-produced input data, we will take a closer look at what we sent to GauXC.
+For this we will inspect our HDF5 input data more closely.
+
+.. code-block:: ipython
+
+   In [1]: import h5py
+      ...: import numpy as np
+
+   In [2]: with h5py.File("He_def2svp.h5") as h5:
+      ...:     molecule = np.asarray(h5["MOLECULE"])
+      ...:     basis = np.asarray(h5["BASIS"])
+      ...:     dm_scalar = np.asarray(h5["DENSITY_SCALAR"])
+      ...:     dm_z = np.asarray(h5["DENSITY_Z"])
+      ...:
+
+First, we inspect the molecule data, which follows an array-of-structs format combining the atomic number with the Cartesian coordinates in Bohr.
+For our Helium example we expect a single entry centered at the origin:
+
+.. code-block:: ipython
+
+   In [3]: molecule.shape
+   Out[3]: (1,)
+
+   In [4]: molecule.dtype
+   Out[4]: dtype({'names': ['Atomic Number', 'X Coordinate', 'Y Coordinate', 'Z Coordinate'], 'formats': [' None:
+    data = pyscf_to_gauxc_h5(mol, dm, exc, vxc)
+    with h5py.File(filename, "w") as fd:
+        for key, value in data.items():
+            fd.create_dataset(key, data=value)
+
+
+def pyscf_to_gauxc_h5(mol: gto.Mole, dm: np.ndarray, exc: float | None = None, vxc: np.ndarray | None = None) -> dict[str, np.ndarray]:
+    molecule = np.array(
+        [
+            (number, *coords)
+            for number, coords in zip(mol.atom_charges(), mol.atom_coords(unit="Bohr"), strict=True)
+        ],
+        dtype=MOLECULE_DTYPE,
+    )
+    basis = np.array(
+        [
+            format_basis(
+                func[0],
+                mol.cart,
+                [pair[0] for pair in func[1:]],
+                [pair[prim] for pair in func[1:]],
+                coord,
+            )
+            for atom, coord in mol._atom
+            for func in mol._basis[atom]
+            for prim in range(1, len(func[1]))
+        ],
+        dtype=BASIS_DTYPE,
+    )
+    dm_scalar = dm if dm.ndim == 2 else dm[0] + dm[1]
+    dm_z = np.zeros_like(dm) if dm.ndim == 2 else dm[0] - dm[1]
+
+    data = {
+        "MOLECULE": molecule,
+        "BASIS": basis,
+        "DENSITY_SCALAR": dm_scalar,
+        "DENSITY_Z": dm_z,
+    }
+
+    if exc is not None:
+        data["EXC"] = exc
+    if vxc is not None:
+        vxc_scalar = vxc if vxc.ndim == 2 else vxc[0] + vxc[1]
+        vxc_z = np.zeros_like(vxc) if vxc.ndim == 2 else vxc[0] - vxc[1]
+        data["VXC_SCALAR"] = vxc_scalar
+        data["VXC_Z"] = vxc_z
+
+    return data
+
+
+def norm(coeff: list[float], alpha: list[float], l: int) -> list[float]:
+    """
+    Normalize contraction coefficients for a given angular momentum and exponents
+    using libint normalization conventions.
+ """ + alpha = np.asarray(alpha) + two_alpha = 2 * alpha + two_alpha_to_am32 = two_alpha ** (l + 1) * np.sqrt(two_alpha) + normalization_factor = np.sqrt(2**l * two_alpha_to_am32 / (SQRT_PI_CUBED * K_MINUS_1[2 * l])) + gamma = alpha[:, np.newaxis] + alpha[np.newaxis, :] + aa = K_MINUS_1[2 * l] * SQRT_PI_CUBED / (2**l * gamma ** (l + 1) * np.sqrt(gamma)) + coeff = coeff * normalization_factor + normalization_factor = 1.0 / np.sqrt(np.einsum("i,j,ij->", coeff, coeff, aa)) + return (coeff * normalization_factor).tolist() + + +def format_basis( + l: int, + cart: bool, + alpha: list[float], + coeff: list[float], + coord: list[float], + padv: float = 0.0, + padl: int = 16, +) -> tuple[int, int, int, list[float], list[float], list[float]]: + return ( + len(alpha), + l, + 0 if cart or l == 1 else 1, + alpha + [padv] * (padl - len(alpha)), + norm(coeff, alpha, l) + [padv] * (padl - len(coeff)), + coord, + ) diff --git a/tests/test_gauxc_export.py b/tests/test_gauxc_export.py new file mode 100644 index 0000000..4069614 --- /dev/null +++ b/tests/test_gauxc_export.py @@ -0,0 +1,73 @@ +from tempfile import NamedTemporaryFile + +import h5py +import numpy as np +import pytest + +from pyscf import dft, gto +from skala.gauxc.export import write_gauxc_h5_from_pyscf + + +@pytest.fixture(params=["He", "Li"]) +def mol_name(request) -> str: + return request.param + + +@pytest.fixture +def basis() -> str: + return "def2-svp" + + +@pytest.fixture(params=["cart", "sph"]) +def cartesian(request) -> bool: + return request.param == "cart" + + +@pytest.fixture +def mol(mol_name: str, basis: str, cartesian: bool) -> gto.Mole: + match mol_name: + case "He": + return gto.M(atom="He 0 0 0", basis=basis, cart=cartesian, unit="Bohr", spin=0) + case "Li": + return gto.M(atom="Li 0 0 0", basis=basis, cart=cartesian, unit="Bohr", spin=1) + case _: + raise ValueError(f"Unknown molecule name: {mol_name}") + + +@pytest.fixture +def ks(mol: gto.Mole) -> dft.rks.RKS: + ks = dft.KS(mol, xc="pbe") + ks.kernel() + return ks + +@pytest.fixture +def dm(ks: dft.rks.RKS) -> np.ndarray: + return ks.make_rdm1() + +@pytest.fixture +def exc(ks: dft.rks.RKS) -> float: + return ks.scf_summary["exc"] + + +@pytest.fixture +def vxc(ks: dft.rks.RKS, dm: np.ndarray) -> np.ndarray: + if dm.ndim == 2: + _, _, vxc = ks._numint.nr_rks(ks.mol, ks.grids, ks.xc, dm) + else: + _, _, vxc = ks._numint.nr_uks(ks.mol, ks.grids, ks.xc, dm) + return vxc + + + +def test_write_pyscf(mol: gto.Mole, dm: np.ndarray, mol_name, basis, exc, vxc) -> None: + with NamedTemporaryFile(suffix=".h5") as tmp: + write_gauxc_h5_from_pyscf(tmp.name, mol, dm, exc, vxc) + + with h5py.File(tmp.name, "r") as h5: + assert "MOLECULE" in h5, "Molecule is missing in h5 export" + assert "BASIS" in h5, "Basis is missing in h5 export" + assert "DENSITY_SCALAR" in h5, "Density (a+b) is missing in h5 export" + assert "DENSITY_Z" in h5, "Density (a-b) is missing in h5 export" + assert "EXC" in h5, "Exchange-correlation energy is missing in h5 export" + assert "VXC_SCALAR" in h5, "Exchange-correlation potential (a+b) is missing in h5 export" + assert "VXC_Z" in h5, "Exchange-correlation potential (a-b) is missing in h5 export" \ No newline at end of file