diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index dc85afb..47e7a75 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -21,9 +21,6 @@ jobs: with: fetch-depth: 0 - - name: apt update - run: apt update -y && apt upgrade -y - - name: Build from src run: mkdir build && cd build && cmake -DCMAKE_BUILD_TYPE=$BUILD_TYPE .. && make -j8 && make install diff --git a/.github/workflows/Coverage.yml b/.github/workflows/Coverage.yml index 1c261da..6f458a8 100644 --- a/.github/workflows/Coverage.yml +++ b/.github/workflows/Coverage.yml @@ -19,9 +19,6 @@ jobs: with: fetch-depth: 0 - - name: apt update - run: apt update -y && apt upgrade -y - - name: Build from src continue-on-error: true # The one test will most likely fail, but if we skip we lose a lot of coverage. run: mkdir build && cd build && cmake -DBUILD_COVERAGE=1 .. && make -j8 && make install diff --git a/.gitignore b/.gitignore index 36a2411..477a210 100644 --- a/.gitignore +++ b/.gitignore @@ -11,8 +11,7 @@ _site/ .bundle/ vendor/ -# Ignore editors -.vscode +# Ignore editors (but not vscode) /.pytest_cache/ .cproject .idea diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json new file mode 100644 index 0000000..2f9028c --- /dev/null +++ b/.vscode/c_cpp_properties.json @@ -0,0 +1,23 @@ +{ + "configurations": [ + { + "name": "Linux", + "includePath": [ + "${workspaceFolder}/**", + "${workspaceFolder}/include/**", + "/usr/local/cuda/include", + "/usr/local/include/openbabel3/", + "/usr/include", + "/usr/include/c++/11/", + "${workspaceFolder}/include/**"], + "defines": [], + "compilerPath": "/usr/bin/c++", + "cStandard": "c17", + "cppStandard": "gnu++17", + "intelliSenseMode": "linux-gcc-x64", + "compileCommands": "${workspaceFolder}/build/compile_commands.json", + "configurationProvider": "ms-vscode.cmake-tools" + } + ], + "version": 4 +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 
0000000..03f2e1b --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,77 @@ +{ + "files.associations": { + "stdexcept": "cpp", + "cctype": "cpp", + "clocale": "cpp", + "cmath": "cpp", + "cstdarg": "cpp", + "cstddef": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "cwctype": "cpp", + "array": "cpp", + "atomic": "cpp", + "strstream": "cpp", + "bit": "cpp", + "*.tcc": "cpp", + "bitset": "cpp", + "chrono": "cpp", + "compare": "cpp", + "complex": "cpp", + "concepts": "cpp", + "condition_variable": "cpp", + "cstdint": "cpp", + "deque": "cpp", + "list": "cpp", + "map": "cpp", + "set": "cpp", + "string": "cpp", + "unordered_map": "cpp", + "unordered_set": "cpp", + "vector": "cpp", + "exception": "cpp", + "algorithm": "cpp", + "functional": "cpp", + "iterator": "cpp", + "memory": "cpp", + "memory_resource": "cpp", + "numeric": "cpp", + "optional": "cpp", + "random": "cpp", + "ratio": "cpp", + "string_view": "cpp", + "system_error": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "utility": "cpp", + "fstream": "cpp", + "initializer_list": "cpp", + "iomanip": "cpp", + "iosfwd": "cpp", + "iostream": "cpp", + "istream": "cpp", + "limits": "cpp", + "mutex": "cpp", + "new": "cpp", + "numbers": "cpp", + "ostream": "cpp", + "semaphore": "cpp", + "sstream": "cpp", + "stop_token": "cpp", + "streambuf": "cpp", + "thread": "cpp", + "cinttypes": "cpp", + "typeindex": "cpp", + "typeinfo": "cpp", + "variant": "cpp", + "codecvt": "cpp", + "cfenv": "cpp", + "filesystem": "cpp" + }, + "C_Cpp.files.exclude": { + "/usr/local/include/libmolgrid/**": true + } +} \ No newline at end of file diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 0000000..dc1a631 --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,24 @@ +{ + "version": "2.0.0", + "isShellCommand": true, + "tasks": [ + { + "label": "build", + "options": { + "cwd": "${workspaceRoot}/build" + }, + "type": "shell", + "command": "make -j24", + "group": 
"build" + }, + { + "label": "build-debug", + "options": { + "cwd": "${workspaceRoot}/build-debug" + }, + "type": "shell", + "command": "make -j24", + "group": "build" + } + ] +} \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index a5c0c8c..903503b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,6 +11,8 @@ if(NOT CMAKE_CONFIGURATION_TYPES AND NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) endif() +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +set(CMAKE_CXX_USE_RESPONSE_FILE_FOR_INCLUDES OFF) include(GNUInstallDirs) option(BUILD_SHARED "Build shared library" ON) @@ -37,8 +39,7 @@ include(GNUInstallDirs) # set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_BINDIR}) #dependencies -find_package(CUDA REQUIRED) -find_package(Boost REQUIRED COMPONENTS regex unit_test_framework program_options system filesystem iostreams) +find_package(Boost REQUIRED COMPONENTS regex unit_test_framework program_options system filesystem iostreams serialization) find_package(OpenBabel3 REQUIRED) include_directories(SYSTEM ${OPENBABEL3_INCLUDE_DIR}) find_package(ZLIB) @@ -54,20 +55,25 @@ set(CMAKE_CXX_STANDARD 14) set(CMAKE_CUDA_STANDARD 14) if(CMAKE_CXX_COMPILER_ID MATCHES GNU) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wno-unknown-pragmas -Werror") - set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g3") - if (BUILD_COVERAGE) - set(CMAKE_CXX_FLAGS_RELEASE "-O3 -g -fprofile-arcs -ftest-coverage") - else() - set(CMAKE_CXX_FLAGS_RELEASE "-O3 -g") - endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wno-unknown-pragmas -Werror") + include(CheckCCompilerFlag) + check_c_compiler_flag(-Wno-error=template-id-cdtor HAS_TEMPLATE_ID_CDTOR) + if (HAS_TEMPLATE_ID_CDTOR) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=template-id-cdtor") + endif() + set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g3") + if (BUILD_COVERAGE) + set(CMAKE_CXX_FLAGS_RELEASE "-O3 -g -fprofile-arcs -ftest-coverage") + else() + set(CMAKE_CXX_FLAGS_RELEASE "-O3 -g") + endif() endif() 
if(CUDA_VERSION_MAJOR LESS 11) #compile for all major architectures >= 35 - set(CMAKE_CUDA_ARCHITECTURES 35 50 60 70 75) + set(CMAKE_CUDA_ARCHITECTURES 35 50 60 70 75) else() - set(CMAKE_CUDA_ARCHITECTURES 52 60 70 75 80) + set(CMAKE_CUDA_ARCHITECTURES 70 80 86) endif() set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -g -lineinfo") diff --git a/README.md b/README.md index 43f59d7..0126f14 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ libmolgrid ========== -![Github CI](https://github.com/gnina/libmolgrid/actions/workflows/CI.yml/badge.svg) +[![Github CI](https://github.com/gnina/libmolgrid/actions/workflows/CI.yml/badge.svg)](https://github.com/gnina/libmolgrid/actions) [![codecov](https://codecov.io/gh/gnina/libmolgrid/branch/master/graph/badge.svg?token=gUFisf34rf)](https://codecov.io/gh/gnina/libmolgrid) libmolgrid is under active development, but should be suitable for use by early adopters. diff --git a/default.nix b/default.nix index b06b223..0c47087 100644 --- a/default.nix +++ b/default.nix @@ -1,59 +1,47 @@ -{ lib -, stdenv -, fetchFromGitHub -, cmake -, cudatoolkit -, openbabel -, zlib -, boost -, python310Packages -, python310 -, pkg-config +{ + lib, + cmake, + boost, + python3, + python3Packages, + cudaPackages, + openbabel, + zlib, }: -stdenv.mkDerivation rec { +cudaPackages.backendStdenv.mkDerivation { pname = "libmolgrid"; - version = "0.5.3"; - + version = "master"; src = ./.; - # fetchFromGitHub { - # owner = "gnina"; - # repo = "libmolgrid"; - # rev = "v${version}"; - # hash = "sha256-YdEjXfrTf9hw0nMbC2JWZ7Gf/psZ4RQ6v6GUrx5yIoA="; - # }; - - #buildFlags = [ "-stdlib=libstdc++" ]; nativeBuildInputs = [ cmake - pkg-config + cudaPackages.cuda_nvcc ]; buildInputs = [ - #gcc - cudatoolkit + boost.dev + python3 + python3Packages.boost + python3Packages.numpy + python3Packages.openbabel-bindings + python3Packages.pyquaternion + python3Packages.pytest + cudaPackages.cuda_cudart + cudaPackages.cuda_cccl openbabel - #stdenv.cc.cc.lib zlib - boost.dev - 
python310Packages.boost - python310Packages.pytest - python310Packages.numpy - python310Packages.pyquaternion - python310Packages.openbabel-bindings - python310 ]; - OPENBABEL3_INCLUDE_DIR = "${openbabel}/include/openbabel3"; - - cmakeFlags = [ "-DOPENBABEL3_INCLUDE_DIR=${OPENBABEL3_INCLUDE_DIR}" ]; + cmakeFlags = [ + "-DCMAKE_CUDA_ARCHITECTURES=70;80;86" + "-DOPENBABEL3_INCLUDE_DIR=${openbabel}/include/openbabel3" + ]; meta = with lib; { - description = - "Comprehensive library for fast, GPU accelerated molecular gridding for deep learning workflows"; + description = "Comprehensive library for fast, GPU accelerated molecular gridding for deep learning workflows"; homepage = "https://github.com/gnina/libmolgrid"; license = licenses.asl20; - maintainers = with maintainers; [ ]; + maintainers = [ ]; }; } diff --git a/flake.lock b/flake.lock index e875405..d35a24c 100644 --- a/flake.lock +++ b/flake.lock @@ -1,46 +1,24 @@ { "nodes": { "nixpkgs": { - "inputs": { - "nixpkgs": [ - "nixpkgs-base" - ] - }, - "locked": { - "lastModified": 1673400599, - "narHash": "sha256-EnDVDsQ/uoCdO1UMPTSHelJLjJTR794yp370a89n3y4=", - "owner": "numtide", - "repo": "nixpkgs-unfree", - "rev": "7331a9526557393edc2ff86d04ecd74b107f1b81", - "type": "github" - }, - "original": { - "owner": "numtide", - "repo": "nixpkgs-unfree", - "rev": "7331a9526557393edc2ff86d04ecd74b107f1b81", - "type": "github" - } - }, - "nixpkgs-base": { "locked": { - "lastModified": 1669833724, - "narHash": "sha256-/HEZNyGbnQecrgJnfE8d0WC5c1xuPSD2LUpB6YXlg4c=", + "lastModified": 1752620740, + "narHash": "sha256-f3pO+9lg66mV7IMmmIqG4PL3223TYMlnlw+pnpelbss=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "4d2b37a84fad1091b9de401eb450aae66f1a741e", + "rev": "32a4e87942101f1c9f9865e04dc3ddb175f5f32e", "type": "github" }, "original": { "owner": "NixOS", - "ref": "22.11", "repo": "nixpkgs", + "rev": "32a4e87942101f1c9f9865e04dc3ddb175f5f32e", "type": "github" } }, "root": { "inputs": { - "nixpkgs": "nixpkgs", - "nixpkgs-base": 
"nixpkgs-base" + "nixpkgs": "nixpkgs" } } }, diff --git a/flake.nix b/flake.nix index 9f7c1cc..c0e2679 100644 --- a/flake.nix +++ b/flake.nix @@ -1,13 +1,46 @@ { - description = "A very basic flake"; - #inputs.nixpkgs.url = "github:NixOS/nixpkgs/22.11"; - inputs.nixpkgs-base.url = "github:NixOS/nixpkgs/22.11"; - inputs.nixpkgs.url = "github:numtide/nixpkgs-unfree/7331a9526557393edc2ff86d04ecd74b107f1b81"; - inputs.nixpkgs.inputs.nixpkgs.follows = "nixpkgs-base"; - - outputs = { self, nixpkgs, nixpkgs-base }: { - - packages.x86_64-linux.default = nixpkgs.legacyPackages.x86_64-linux.callPackage (import ./default.nix) { }; + description = "A flake for libmolgrid."; + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/32a4e87942101f1c9f9865e04dc3ddb175f5f32e"; }; + + outputs = + { self, nixpkgs }: + let + system = "x86_64-linux"; + in + { + packages.${system} = { + libmolgrid = + let + pkgs = import nixpkgs { + inherit system; + config.allowUnfree = true; + config.cudaSupport = true; + }; + in + pkgs.callPackage ./default.nix { }; + libmolgrid_bullet = + let + system = "x86_64-linux"; + pkgs = import nixpkgs { + inherit system; + overlays = [ + (final: prev: { cudaPackages = prev.cudaPackages_12_4; }) + ]; + config.allowUnfree = true; + config.cudaSupport = true; + config.cudaCapabilities = [ + "7.0" + "8.0" + "8.6" + ]; + cudaForwardCompat = false; + }; + in + pkgs.callPackage ./default.nix { }; + default = self.packages.${system}.libmolgrid; + }; + }; } diff --git a/include/libmolgrid/coord_cache.h b/include/libmolgrid/coord_cache.h index c6ba04e..842d045 100644 --- a/include/libmolgrid/coord_cache.h +++ b/include/libmolgrid/coord_cache.h @@ -7,11 +7,14 @@ #ifndef COORD_CACHE_H_ #define COORD_CACHE_H_ -#include "libmolgrid/coordinateset.h" #include "libmolgrid/atom_typer.h" +#include "libmolgrid/coordinateset.h" #include "libmolgrid/example.h" #include - +#include +#include +#include +#include namespace libmolgrid { /** \brief Load and cache molecular coordinates and 
atom types. @@ -21,36 +24,77 @@ namespace libmolgrid { * training runs. */ class CoordCache { - using MemCache = std::unordered_map; - MemCache memcache; - std::shared_ptr typer; - std::string data_root; - std::string molcache; - bool use_cache = true; //is possible to disable caching - bool addh = true; ///protonate - bool make_vector_types = false; ///convert index types to vector, will also convert to type based radii and add a dummy type - - //for memory mapped cache - boost::iostreams::mapped_file_source cache_map; - std::unordered_map offsets; //map from names to position in cache_map - - public: - CoordCache() {} - CoordCache(std::shared_ptr t, - const ExampleProviderSettings& settings = ExampleProviderSettings(), - const std::string& mc = ""); - ~CoordCache() {} - - /** \brief Set coord to the appropriate CoordinateSet for fname - * @param[in] fname file name, not including root directory prefix, of molecular data - * @param[out] coord CoordinateSet for passed molecule - */ - void set_coords(const char *fname, CoordinateSet& coord); - - /// return the number of types (channels) each example will have - size_t num_types() const { return typer->num_types(); } - - std::vector get_type_names() const { return typer->get_type_names(); } + using MemCache = std::unordered_map; + MemCache memcache; + std::shared_ptr typer; + std::string data_root; + std::string molcache; + bool use_cache = true; // is possible to disable caching + bool addh = true; /// protonate + bool make_vector_types = + false; /// convert index types to vector, will also convert to type based + /// radii and add a dummy type + + // for memory mapped cache + boost::iostreams::mapped_file_source cache_map; + std::unordered_map + offsets; // map from names to position in cache_map + +public: + CoordCache() {} + CoordCache( + std::shared_ptr t, + const ExampleProviderSettings &settings = ExampleProviderSettings(), + const std::string &mc = ""); + ~CoordCache() {} + + /** \brief Set coord to the 
appropriate CoordinateSet for fname + * @param[in] fname file name, not including root directory prefix, of + * molecular data + * @param[out] coord CoordinateSet for passed molecule + */ + void set_coords(const char *fname, CoordinateSet &coord); + + /// return the number of types (channels) each example will have + size_t num_types() const { return typer->num_types(); } + + std::vector get_type_names() const { + return typer->get_type_names(); + } + + /** \brief Write out current contents of memory cache to provided output + * stream. + * @param[in] out output stream + */ + void save_mem_cache(std::ostream &out) const; + + /** \brief Write out current contents of memory cache to provided file. + * @param[in] fname file name + */ + void save_mem_cache(const std::string &fname) const { + std::ofstream out(fname.c_str()); + if (!out) + throw std::invalid_argument("Could not open file " + fname); + save_mem_cache(out); + } + + /** \brief Read contents of input stream into memory cache. + * @param[in] in input stream + */ + void load_mem_cache(std::istream &in); + /** \brief Read contents of provided file into memory cache. 
+ * @param[in] fname file name + */ + void load_mem_cache(const std::string &fname) { + std::ifstream in(fname.c_str()); + if (!in) + throw std::invalid_argument("Could not load file " + fname); + load_mem_cache(in); + } + + size_t mem_cache_size() const { + return memcache.size(); + } }; } /* namespace libmolgrid */ diff --git a/include/libmolgrid/coordinateset.h b/include/libmolgrid/coordinateset.h index 18f4843..351aa93 100644 --- a/include/libmolgrid/coordinateset.h +++ b/include/libmolgrid/coordinateset.h @@ -12,7 +12,10 @@ #include #include +#include + #include "libmolgrid/managed_grid.h" +#include "libmolgrid/string_cache.h" namespace libmolgrid { @@ -136,6 +139,29 @@ struct CoordinateSet { ///for debugging void dump(std::ostream& out) const; + + friend class boost::serialization::access; + template void serialize(Archive &ar, const unsigned version) { + ar & coords; + ar & type_index; + ar & type_vector; + ar & radii; + ar & max_type; + + if (Archive::is_saving::value) { + std::string tmp; + if(src) { + tmp = src; + } + ar & tmp; + } else { //reading + std::string tmp; + ar & tmp; + if(tmp.length() > 0) { + src = string_cache.get(tmp); + } + } + } }; extern template size_t CoordinateSet::copyTo(Grid& c, Grid& t, Grid& r) const; diff --git a/include/libmolgrid/example.h b/include/libmolgrid/example.h index 0efb1b9..c75f8b1 100644 --- a/include/libmolgrid/example.h +++ b/include/libmolgrid/example.h @@ -15,6 +15,7 @@ #include #include #include "libmolgrid/coordinateset.h" +#include "libmolgrid/string_cache.h" namespace libmolgrid { @@ -161,19 +162,7 @@ struct ExampleRef { }; -//for memory efficiency, only store a given string once and use the const char* -class StringCache { - std::unordered_set strings; -public: - const char* get(const std::string& s) - { - strings.insert(s); - //we assume even as the set is resized that strings never get allocated - return strings.find(s)->c_str(); - } -}; -extern StringCache string_cache; } /* namespace libmolgrid */ diff 
--git a/include/libmolgrid/example_extractor.h b/include/libmolgrid/example_extractor.h index 887ae7c..f7461bf 100644 --- a/include/libmolgrid/example_extractor.h +++ b/include/libmolgrid/example_extractor.h @@ -88,6 +88,38 @@ class ExampleExtractor { ///return names of types for explicitly typed examples ///type names are prepended by coordinate set index virtual std::vector get_type_names() const; + + /** \brief Write out current contents of memory caches to provided output + * stream. + * @param[in] out output stream + */ + void save_mem_caches(std::ostream &out) const; + + /** \brief Write out current contents of memory caches to provided file. + * @param[in] fname file name + */ + void save_mem_caches(const std::string &fname) const { + std::ofstream out(fname.c_str()); + if (!out) + throw std::invalid_argument("Could not open file " + fname); + save_mem_caches(out); + } + + /** \brief Read contents of input stream into memory caches. + * @param[in] in input stream + */ + void load_mem_caches(std::istream &in); + /** \brief Read contents of provided file into memory caches. 
+ * @param[in] fname file name + */ + void load_mem_caches(const std::string &fname) { + std::ifstream in(fname.c_str()); + if (!in) + throw std::invalid_argument("Could not load file " + fname); + load_mem_caches(in); + } + + size_t mem_caches_size() const; }; } /* namespace libmolgrid */ diff --git a/include/libmolgrid/example_provider.h b/include/libmolgrid/example_provider.h index f3f0361..e6eb850 100644 --- a/include/libmolgrid/example_provider.h +++ b/include/libmolgrid/example_provider.h @@ -10,12 +10,12 @@ #define EXAMPLE_PROVIDER_H_ #include "libmolgrid/example.h" -#include "libmolgrid/exampleref_providers.h" #include "libmolgrid/example_extractor.h" +#include "libmolgrid/exampleref_providers.h" namespace libmolgrid { - // Docstring_ExampleProvider +// Docstring_ExampleProvider /** \brief Given a file of examples, provide Example classes one at a time * This contains an ExampleRefProvider, which can be configured using a * single settings object if so desired, and an example extractor. @@ -23,121 +23,172 @@ namespace libmolgrid { * of the dataset into memory. * * An example files contains a single example on each line where an example - * consists of some number of numerical labels (num_labels, will be auto-detected - * if not specified) followed by file paths to molecular data, all space separated. + * consists of some number of numerical labels (num_labels, will be + * auto-detected if not specified) followed by file paths to molecular data, all + * space separated. 
*/ class ExampleProvider { - std::shared_ptr provider; - ExampleExtractor extractor; - ExampleProviderSettings init_settings; //save settings created with - size_t last_epoch = 0; - - public: - - /// return provider as specifyed by settings - static std::shared_ptr createProvider(const ExampleProviderSettings& settings); - - /// Create provider using default gnina typing - ExampleProvider(const ExampleProviderSettings& settings=ExampleProviderSettings()); - - /// Create provider/extractor according to settings with single typer - ExampleProvider(const ExampleProviderSettings& settings, std::shared_ptr t); - - /// Create provider/extractor according to settings with two typers - ExampleProvider(const ExampleProviderSettings& settings, std::shared_ptr t1, std::shared_ptr t2); - - /// Create provider/extractor according to settings - ExampleProvider(const ExampleProviderSettings& settings, const std::vector >& typrs, - const std::vector& molcaches = std::vector()); - - /// use provided provider - ExampleProvider(std::shared_ptr p, const ExampleExtractor& e); - virtual ~ExampleProvider() {} - - ///load example file file fname and setup provider - virtual void populate(const std::string& fname, int num_labels=-1); - ///load multiple example files - virtual void populate(const std::vector& fnames, int num_labels=-1); - - ///return number of labels for each example (computed from first example only) - virtual size_t num_labels() const { return provider->num_labels(); } - - ///provide next example - virtual void next(Example& ex); - virtual Example next() { Example ex; next(ex); return ex; } - - /** \brief Return current small epoch number - * A small epoch occurs once every training example has been - * seen at MOST once. For example, when providing a balanced - * view of unbalanced data, a small epoch will complete once the - * less common class has been iterated over. - * Note this is the epoch of the next example to be provided, not the previous. 
- */ - virtual size_t get_small_epoch_num() const { return provider->get_small_epoch_num(); } - - /** \brief Return current large epoch number - * A large epoch occurs once every training example has been - * seen at LEAST once. For example, when providing a balanced - * view of unbalanced data, a large epoch will complete once the - * more common class has been iterated over. - * Note this is the epoch of the next example to be provided, not the previous. - */ - virtual size_t get_large_epoch_num() const { return provider->get_large_epoch_num(); } - - /// Return number of example in small epoch - virtual size_t small_epoch_size() const { return provider->small_epoch_size(); } - - /// Return number of example in large epoch - virtual size_t large_epoch_size() const { return provider->large_epoch_size(); } - - /// Reset to beginning - virtual void reset() { provider->reset(); } - - ///skip over the first n examples - virtual void skip(unsigned n); - - ///return settings created with - const ExampleProviderSettings& settings() const { return init_settings; } - - - virtual void next_batch(std::vector& ex, unsigned batch_size=0); - - ///provide a batch of examples, unspecified or 0 batch_size uses default batch size - virtual std::vector next_batch(unsigned batch_size=0) { - std::vector ex; - next_batch(ex, batch_size); - return ex; - } - - ///return true if we have crossed into a new epoch (or are about to) - ///Note that once this returns true once, it won't again until the next - ///epoch has been consumed. 
- bool at_new_epoch() { - if(init_settings.iteration_scheme == LargeEpoch) { - size_t e = provider->get_large_epoch_num(); - if(e != last_epoch) { - last_epoch = e; - return true; - } - } else if(init_settings.iteration_scheme == SmallEpoch) { - size_t e = provider->get_small_epoch_num(); - if(e != last_epoch) { - last_epoch = e; - return true; - } + std::shared_ptr provider; + ExampleExtractor extractor; + ExampleProviderSettings init_settings; // save settings created with + size_t last_epoch = 0; + +public: + /// return provider as specifyed by settings + static std::shared_ptr + createProvider(const ExampleProviderSettings &settings); + + /// Create provider using default gnina typing + ExampleProvider( + const ExampleProviderSettings &settings = ExampleProviderSettings()); + + /// Create provider/extractor according to settings with single typer + ExampleProvider(const ExampleProviderSettings &settings, + std::shared_ptr t); + + /// Create provider/extractor according to settings with two typers + ExampleProvider(const ExampleProviderSettings &settings, + std::shared_ptr t1, std::shared_ptr t2); + + /// Create provider/extractor according to settings + ExampleProvider( + const ExampleProviderSettings &settings, + const std::vector> &typrs, + const std::vector &molcaches = std::vector()); + + /// use provided provider + ExampleProvider(std::shared_ptr p, + const ExampleExtractor &e); + virtual ~ExampleProvider() {} + + /// load example file file fname and setup provider + virtual void populate(const std::string &fname, int num_labels = -1); + /// load multiple example files + virtual void populate(const std::vector &fnames, + int num_labels = -1); + + /// return number of labels for each example (computed from first example + /// only) + virtual size_t num_labels() const { return provider->num_labels(); } + + /// provide next example + virtual void next(Example &ex); + virtual Example next() { + Example ex; + next(ex); + return ex; + } + + /** \brief Return 
current small epoch number + * A small epoch occurs once every training example has been + * seen at MOST once. For example, when providing a balanced + * view of unbalanced data, a small epoch will complete once the + * less common class has been iterated over. + * Note this is the epoch of the next example to be provided, not the + * previous. + */ + virtual size_t get_small_epoch_num() const { + return provider->get_small_epoch_num(); + } + + /** \brief Return current large epoch number + * A large epoch occurs once every training example has been + * seen at LEAST once. For example, when providing a balanced + * view of unbalanced data, a large epoch will complete once the + * more common class has been iterated over. + * Note this is the epoch of the next example to be provided, not the + * previous. + */ + virtual size_t get_large_epoch_num() const { + return provider->get_large_epoch_num(); + } + + /// Return number of example in small epoch + virtual size_t small_epoch_size() const { + return provider->small_epoch_size(); + } + + /// Return number of example in large epoch + virtual size_t large_epoch_size() const { + return provider->large_epoch_size(); + } + + /// Reset to beginning + virtual void reset() { provider->reset(); } + + /// skip over the first n examples + virtual void skip(unsigned n); + + /// return settings created with + const ExampleProviderSettings &settings() const { return init_settings; } + + virtual void next_batch(std::vector &ex, unsigned batch_size = 0); + + /// provide a batch of examples, unspecified or 0 batch_size uses default + /// batch size + virtual std::vector next_batch(unsigned batch_size = 0) { + std::vector ex; + next_batch(ex, batch_size); + return ex; + } + + /// return true if we have crossed into a new epoch (or are about to) + /// Note that once this returns true once, it won't again until the next + /// epoch has been consumed. 
+ bool at_new_epoch() { + if (init_settings.iteration_scheme == LargeEpoch) { + size_t e = provider->get_large_epoch_num(); + if (e != last_epoch) { + last_epoch = e; + return true; + } + } else if (init_settings.iteration_scheme == SmallEpoch) { + size_t e = provider->get_small_epoch_num(); + if (e != last_epoch) { + last_epoch = e; + return true; } - return false; } - - ExampleExtractor& get_extractor() { return extractor; } - ExampleRefProvider& get_provider() { return *provider; } - - ///number of types - size_t num_types() const { return extractor.num_types(); } - ///names of types (requires explicit typing) - std::vector get_type_names() const { return extractor.get_type_names(); } - ///return number of examples - size_t size() const { return provider->size(); } + return false; + } + + ExampleExtractor &get_extractor() { return extractor; } + ExampleRefProvider &get_provider() { return *provider; } + + /// number of types + size_t num_types() const { return extractor.num_types(); } + /// names of types (requires explicit typing) + std::vector get_type_names() const { + return extractor.get_type_names(); + } + /// return number of examples + size_t size() const { return provider->size(); } + + /** \brief Write out current contents of memory caches to provided output + * stream. + * @param[in] out output stream + */ + void save_mem_caches(std::ostream &out) const { + extractor.save_mem_caches(out); + } + + /** \brief Write out current contents of memory caches to provided file. + * @param[in] fname file name + */ + void save_mem_caches(const std::string &fname) const { + extractor.save_mem_caches(fname); + } + + /** \brief Read contents of input stream into memory caches. + * @param[in] in input stream + */ + void load_mem_caches(std::istream &in) { extractor.load_mem_caches(in); } + /** \brief Read contents of provided file into memory caches. 
+ * @param[in] fname file name + */ + void load_mem_caches(const std::string &fname) { extractor.load_mem_caches(fname); } + + size_t mem_caches_size() const { return extractor.mem_caches_size(); } }; } /* namespace libmolgrid */ diff --git a/include/libmolgrid/managed_grid.h b/include/libmolgrid/managed_grid.h index a81426a..fd64353 100644 --- a/include/libmolgrid/managed_grid.h +++ b/include/libmolgrid/managed_grid.h @@ -13,120 +13,141 @@ #include #include #include - +#include +#include #include "libmolgrid/grid.h" +namespace libmolgrid +{ -namespace libmolgrid { - - -template -struct mgrid_buffer_data { + template + struct mgrid_buffer_data + { Dtype *gpu_ptr; bool sent_to_gpu; -}; + }; -/** \brief ManagedGrid base class */ -template -class ManagedGridBase { + /** \brief ManagedGrid base class */ + template + class ManagedGridBase + { public: - using gpu_grid_t = Grid; ///cuda grid type + using gpu_grid_t = Grid; /// cuda grid type using cpu_grid_t = Grid; using type = Dtype; static constexpr size_t N = NumDims; protected: - - //two different views of the same memory - mutable gpu_grid_t gpu_grid; //treated as a cache + // two different views of the same memory + mutable gpu_grid_t gpu_grid; // treated as a cache cpu_grid_t cpu_grid; - std::shared_ptr cpu_ptr; //shared pointer lets us not worry about copying the grid - size_t capacity = 0; //amount of memory allocated (for resizing) + std::shared_ptr cpu_ptr; // shared pointer lets us not worry about copying the grid + size_t capacity = 0; // amount of memory allocated (for resizing) using buffer_data = mgrid_buffer_data; mutable buffer_data *gpu_info = nullptr; - - ///empty (unusable) grid + /// empty (unusable) grid ManagedGridBase() = default; // deallocate our special buffer memory, include gpu memory if present - static void delete_buffer(void *ptr) { - buffer_data *data = (buffer_data*)(ptr) - 1; - if(data->gpu_ptr != nullptr) { - //deallocate gpu + static void delete_buffer(void *ptr) + { + buffer_data 
*data = (buffer_data *)(ptr)-1; + if (data->gpu_ptr != nullptr) + { + // deallocate gpu cudaFree(data->gpu_ptr); } free(data); } - //allocate and set the cpu pointer (and grid) with space for sent_to_gpu bool, set the bool ptr location - //does not initialize memory - void alloc_and_set_cpu(size_t sz) { - //put buffer data at start so know where it is on delete - void *buffer = malloc(sizeof(buffer_data)+sz*sizeof(Dtype)); - Dtype *cpu_data = (Dtype*)((buffer_data*)buffer+1); - - if(!buffer) throw std::runtime_error("Could not allocate "+itoa(sz*sizeof(Dtype))+" bytes of CPU memory in ManagedGrid"); + // allocate and set the cpu pointer (and grid) with space for sent_to_gpu bool, set the bool ptr location + // does not initialize memory + void alloc_and_set_cpu(size_t sz) + { + // put buffer data at start so know where it is on delete + void *buffer = malloc(sizeof(buffer_data) + sz * sizeof(Dtype)); + Dtype *cpu_data = (Dtype *)((buffer_data *)buffer + 1); + + if (!buffer) + throw std::runtime_error("Could not allocate " + itoa(sz * sizeof(Dtype)) + " bytes of CPU memory in ManagedGrid"); cpu_ptr = std::shared_ptr(cpu_data, delete_buffer); cpu_grid.set_buffer(cpu_ptr.get()); - gpu_info = (buffer_data*)buffer; + gpu_info = (buffer_data *)buffer; gpu_info->gpu_ptr = nullptr; gpu_info->sent_to_gpu = false; } - //allocate and set gpu_ptr and grid, does not initialize memory, should not be called if memory is already allocated - void alloc_and_set_gpu(size_t sz) const { - if(gpu_info == nullptr) + // allocate and set gpu_ptr and grid, does not initialize memory, should not be called if memory is already allocated + void alloc_and_set_gpu(size_t sz) const + { + if (gpu_info == nullptr) throw std::runtime_error("Attempt to allocate gpu memory in empty ManagedGrid"); - if(gpu_info->gpu_ptr != nullptr) { + if (gpu_info->gpu_ptr != nullptr) + { throw std::runtime_error("Attempt to reallocate gpu memory in ManagedGrid"); } - //we are not actually using unified memory, but 
this make debugging easier? - cudaError_t err = cudaMalloc(&gpu_info->gpu_ptr,sz*sizeof(Dtype)); + // we are not actually using unified memory, but this make debugging easier? + cudaError_t err = cudaMalloc(&gpu_info->gpu_ptr, sz * sizeof(Dtype)); cudaGetLastError(); - if(err != cudaSuccess) { - throw std::runtime_error("Could not allocate "+itoa(sz*sizeof(Dtype))+" bytes of GPU memory in ManagedGrid"); + if (err != cudaSuccess) + { + throw std::runtime_error("Could not allocate " + itoa(sz * sizeof(Dtype)) + " bytes of GPU memory in ManagedGrid"); } gpu_grid.set_buffer(gpu_info->gpu_ptr); } - template::type> - ManagedGridBase(I... sizes): gpu_grid(nullptr, sizes...), cpu_grid(nullptr, sizes...) { - //allocate buffer + template ::type> + ManagedGridBase(I... sizes) : gpu_grid(nullptr, sizes...), cpu_grid(nullptr, sizes...) + { + // allocate buffer + capacity = this->size(); + alloc_and_set_cpu(capacity); // even with capacity == 0 need to allocate sent_to_gpu + memset(cpu_ptr.get(), 0, capacity * sizeof(Dtype)); + gpu_info->sent_to_gpu = false; + } + + ManagedGridBase(size_t *sizes): gpu_grid(nullptr, sizes), cpu_grid(nullptr, sizes) + { + // allocate buffer capacity = this->size(); - alloc_and_set_cpu(capacity); //even with capacity == 0 need to allocate sent_to_gpu - memset(cpu_ptr.get(), 0, capacity*sizeof(Dtype)); + alloc_and_set_cpu(capacity); // even with capacity == 0 need to allocate sent_to_gpu + memset(cpu_ptr.get(), 0, capacity * sizeof(Dtype)); gpu_info->sent_to_gpu = false; } - //helper for clone, allocate new memory and copies contents of current ptr into it - void clone_ptrs() { - if(capacity == 0) { + + // helper for clone, allocate new memory and copies contents of current ptr into it + void clone_ptrs() + { + if (capacity == 0) + { return; } - //duplicate cpu memory and set sent_to_gpu + // duplicate cpu memory and set sent_to_gpu std::shared_ptr old = cpu_ptr; buffer_data oldgpu = *gpu_info; alloc_and_set_cpu(capacity); - memcpy(cpu_ptr.get(), 
old.get(), sizeof(Dtype)*capacity); + memcpy(cpu_ptr.get(), old.get(), sizeof(Dtype) * capacity); gpu_info->sent_to_gpu = oldgpu.sent_to_gpu; - //if allocated, duplicate gpu, but only if it is active - if(oldgpu.gpu_ptr && oldgpu.sent_to_gpu) { + // if allocated, duplicate gpu, but only if it is active + if (oldgpu.gpu_ptr && oldgpu.sent_to_gpu) + { alloc_and_set_gpu(capacity); - LMG_CUDA_CHECK(cudaMemcpy(gpu_info->gpu_ptr, oldgpu.gpu_ptr, sizeof(Dtype)*capacity, cudaMemcpyDeviceToDevice)); + LMG_CUDA_CHECK(cudaMemcpy(gpu_info->gpu_ptr, oldgpu.gpu_ptr, sizeof(Dtype) * capacity, cudaMemcpyDeviceToDevice)); } } - public: + public: /// dimensions along each axis - inline const size_t * dimensions() const { return cpu_grid.dimensions(); } + inline const size_t *dimensions() const { return cpu_grid.dimensions(); } /// dimensions along specified axis inline size_t dimension(size_t i) const { return cpu_grid.dimension(i); } /// offset for each dimension, all indexing calculations use this - inline const size_t * offsets() const { return cpu_grid.offsets(); } + inline const size_t *offsets() const { return cpu_grid.offsets(); } /// offset for each dimension, all indexing calculations use this inline size_t offset(size_t i) const { return cpu_grid.offset(i); } @@ -134,109 +155,153 @@ class ManagedGridBase { inline size_t size() const { return cpu_grid.size(); } /// set contents to zero - inline void fill_zero() { - if(ongpu()) gpu_grid.fill_zero(); - else cpu_grid.fill_zero(); + inline void fill_zero() + { + if (ongpu()) + gpu_grid.fill_zero(); + else + cpu_grid.fill_zero(); } /** \brief Initializer list indexing * */ - template - inline Dtype& operator()(I... indices) { + template + inline Dtype &operator()(I... indices) + { tocpu(); return cpu_grid(indices...); } - template - inline Dtype operator()(I... indices) const { + template + inline Dtype operator()(I... indices) const + { tocpu(); return cpu_grid(indices...); } /** \brief Copy data into dest. 
Should be same size, but will narrow if needed */ - size_t copyTo(cpu_grid_t& dest) const { + size_t copyTo(cpu_grid_t &dest) const + { size_t sz = std::min(size(), dest.size()); - if(sz == 0) return 0; - if(ongpu()) { - LMG_CUDA_CHECK(cudaMemcpy(dest.data(), gpu_grid.data(), sz*sizeof(Dtype), cudaMemcpyDeviceToHost)); - } else { //host ot host - memcpy(dest.data(),cpu_grid.data(),sz*sizeof(Dtype)); + if (sz == 0) + return 0; + if (ongpu()) + { + LMG_CUDA_CHECK(cudaMemcpy(dest.data(), gpu_grid.data(), sz * sizeof(Dtype), cudaMemcpyDeviceToHost)); + } + else + { // host ot host + memcpy(dest.data(), cpu_grid.data(), sz * sizeof(Dtype)); } return sz; } /** \brief Copy data into dest. Should be same size, but will narrow if needed */ - size_t copyTo(gpu_grid_t& dest) const { + size_t copyTo(gpu_grid_t &dest) const + { size_t sz = std::min(size(), dest.size()); - if(sz == 0) return 0; - if(ongpu()) { - LMG_CUDA_CHECK(cudaMemcpy(dest.data(),gpu_grid.data(),sz*sizeof(Dtype),cudaMemcpyDeviceToDevice)); - } else { - LMG_CUDA_CHECK(cudaMemcpy(dest.data(),cpu_grid.data(),sz*sizeof(Dtype),cudaMemcpyHostToDevice)); + if (sz == 0) + return 0; + if (ongpu()) + { + LMG_CUDA_CHECK(cudaMemcpy(dest.data(), gpu_grid.data(), sz * sizeof(Dtype), cudaMemcpyDeviceToDevice)); + } + else + { + LMG_CUDA_CHECK(cudaMemcpy(dest.data(), cpu_grid.data(), sz * sizeof(Dtype), cudaMemcpyHostToDevice)); } return sz; } /** \brief Copy data into dest. Should be same size, but will narrow if needed */ - size_t copyTo(ManagedGridBase& dest) const { - if(dest.ongpu()) { + size_t copyTo(ManagedGridBase &dest) const + { + if (dest.ongpu()) + { return copyTo(dest.gpu_grid); - } else { + } + else + { return copyTo(dest.cpu_grid); } } /** \brief Copy data from src. 
Should be same size, but will narrow if needed */ - size_t copyFrom(const cpu_grid_t& src) { + size_t copyFrom(const cpu_grid_t &src) + { size_t sz = std::min(size(), src.size()); - if(sz == 0) return 0; - if(ongpu()) { - LMG_CUDA_CHECK(cudaMemcpy(gpu_grid.data(), src.data(), sz*sizeof(Dtype), cudaMemcpyHostToDevice)); - } else { - memcpy(cpu_grid.data(),src.data(),sz*sizeof(Dtype)); + if (sz == 0) + return 0; + if (ongpu()) + { + LMG_CUDA_CHECK(cudaMemcpy(gpu_grid.data(), src.data(), sz * sizeof(Dtype), cudaMemcpyHostToDevice)); + } + else + { + memcpy(cpu_grid.data(), src.data(), sz * sizeof(Dtype)); } return sz; } /** \brief Copy data from src. Should be same size, but will narrow if needed */ - size_t copyFrom(const gpu_grid_t& src) { + size_t copyFrom(const gpu_grid_t &src) + { size_t sz = std::min(size(), src.size()); - if(sz == 0) return 0; - if(ongpu()) { - LMG_CUDA_CHECK(cudaMemcpy(gpu_grid.data(),src.data(),sz*sizeof(Dtype),cudaMemcpyDeviceToDevice)); - } else { - LMG_CUDA_CHECK(cudaMemcpy(cpu_grid.data(),src.data(),sz*sizeof(Dtype),cudaMemcpyDeviceToHost)); + if (sz == 0) + return 0; + if (ongpu()) + { + LMG_CUDA_CHECK(cudaMemcpy(gpu_grid.data(), src.data(), sz * sizeof(Dtype), cudaMemcpyDeviceToDevice)); + } + else + { + LMG_CUDA_CHECK(cudaMemcpy(cpu_grid.data(), src.data(), sz * sizeof(Dtype), cudaMemcpyDeviceToHost)); } return sz; } /** \brief Copy data from src. Should be same size, but will narrow if needed */ - size_t copyFrom(const ManagedGridBase& src) { - if(src.ongpu()) { + size_t copyFrom(const ManagedGridBase &src) + { + if (src.ongpu()) + { return copyFrom(src.gpu_grid); - } else { //on host + } + else + { // on host return copyFrom(src.cpu_grid); } } /** \brief Copy data from src into this starting at start. 
Should be same size, but will narrow if needed */ - size_t copyInto(size_t start, const ManagedGridBase& src) { - size_t off = offset(0)*start; - size_t sz = size()-off; + size_t copyInto(size_t start, const ManagedGridBase &src) + { + size_t off = offset(0) * start; + size_t sz = size() - off; sz = std::min(sz, src.size()); - if(sz == 0) return 0; - if(src.ongpu()) { - if(ongpu()) { - LMG_CUDA_CHECK(cudaMemcpy(gpu_grid.data()+off,src.gpu_grid.data(),sz*sizeof(Dtype),cudaMemcpyDeviceToDevice)); - } else { - LMG_CUDA_CHECK(cudaMemcpy(cpu_grid.data()+off,src.gpu_grid.data(),sz*sizeof(Dtype),cudaMemcpyDeviceToHost)); + if (sz == 0) + return 0; + if (src.ongpu()) + { + if (ongpu()) + { + LMG_CUDA_CHECK(cudaMemcpy(gpu_grid.data() + off, src.gpu_grid.data(), sz * sizeof(Dtype), cudaMemcpyDeviceToDevice)); + } + else + { + LMG_CUDA_CHECK(cudaMemcpy(cpu_grid.data() + off, src.gpu_grid.data(), sz * sizeof(Dtype), cudaMemcpyDeviceToHost)); + } + } + else + { // on host + if (ongpu()) + { + LMG_CUDA_CHECK(cudaMemcpy(gpu_grid.data() + off, src.data(), sz * sizeof(Dtype), cudaMemcpyHostToDevice)); } - } else { //on host - if(ongpu()) { - LMG_CUDA_CHECK(cudaMemcpy(gpu_grid.data()+off, src.data(), sz*sizeof(Dtype), cudaMemcpyHostToDevice)); - } else { - memcpy(cpu_grid.data()+off,src.data(),sz*sizeof(Dtype)); + else + { + memcpy(cpu_grid.data() + off, src.data(), sz * sizeof(Dtype)); } } return sz; @@ -248,22 +313,29 @@ class ManagedGridBase { * This function is provided so code can be optimized to avoid unnecessary allocations and * should be used carefully. */ - template::type> - ManagedGrid resized(I... sizes) { + template ::type> + ManagedGrid resized(I... 
sizes) + { cpu_grid_t g(nullptr, sizes...); - if(g.size() <= capacity) { - //no need to allocate and copy; capacity stays the same + if (g.size() <= capacity) + { + // no need to allocate and copy; capacity stays the same ManagedGrid tmp; tmp.cpu_ptr = cpu_ptr; tmp.gpu_info = gpu_info; tmp.cpu_grid = cpu_grid_t(cpu_ptr.get(), sizes...); - if(gpu_info) tmp.gpu_grid = gpu_grid_t(gpu_info->gpu_ptr, sizes...); + if (gpu_info) + tmp.gpu_grid = gpu_grid_t(gpu_info->gpu_ptr, sizes...); tmp.capacity = capacity; return tmp; - } else { + } + else + { ManagedGrid tmp(sizes...); - if(size() > 0 && tmp.size() > 0) { - if(ongpu()) tmp.togpu(); //allocate gpu memory + if (size() > 0 && tmp.size() > 0) + { + if (ongpu()) + tmp.togpu(); // allocate gpu memory copyTo(tmp); } return tmp; @@ -273,102 +345,143 @@ class ManagedGridBase { /** \brief Return GPU Grid view. Host code should not access the grid * until the GPU code is complete. */ - const gpu_grid_t& gpu() const { togpu(); return gpu_grid; } - gpu_grid_t& gpu() { togpu(); return gpu_grid; } + const gpu_grid_t &gpu() const + { + togpu(); + return gpu_grid; + } + gpu_grid_t &gpu() + { + togpu(); + return gpu_grid; + } /** \brief Return CPU Grid view. GPU code should no longer access this memory. 
*/ - const cpu_grid_t& cpu() const { tocpu(); return cpu_grid; } - cpu_grid_t& cpu() { tocpu(); return cpu_grid; } + const cpu_grid_t &cpu() const + { + tocpu(); + return cpu_grid; + } + cpu_grid_t &cpu() + { + tocpu(); + return cpu_grid; + } /** \brief Transfer data to GPU */ - void togpu(bool dotransfer=true) const { - if(capacity == 0) return; - //check that memory is allocated - even if data is on gpu, may still need to set this mgrid's gpu_grid - if(gpu_grid.data() == nullptr) { - if(gpu_info->gpu_ptr == nullptr) { + void togpu(bool dotransfer = true) const + { + if (capacity == 0) + return; + // check that memory is allocated - even if data is on gpu, may still need to set this mgrid's gpu_grid + if (gpu_grid.data() == nullptr) + { + if (gpu_info->gpu_ptr == nullptr) + { alloc_and_set_gpu(capacity); - } //otherwise some other copy has already allocated memory, just need to set - size_t offset = cpu_grid.data() - cpu_ptr.get(); //might be subgrid - gpu_grid.set_buffer(gpu_info->gpu_ptr+offset); + } // otherwise some other copy has already allocated memory, just need to set + size_t offset = cpu_grid.data() - cpu_ptr.get(); // might be subgrid + gpu_grid.set_buffer(gpu_info->gpu_ptr + offset); } - if(oncpu() && dotransfer) { - LMG_CUDA_CHECK(cudaMemcpy(gpu_info->gpu_ptr,cpu_ptr.get(),capacity*sizeof(Dtype),cudaMemcpyHostToDevice)); + if (oncpu() && dotransfer) + { + LMG_CUDA_CHECK(cudaMemcpy(gpu_info->gpu_ptr, cpu_ptr.get(), capacity * sizeof(Dtype), cudaMemcpyHostToDevice)); } - if(gpu_info) gpu_info->sent_to_gpu = true; + if (gpu_info) + gpu_info->sent_to_gpu = true; } /** \brief Transfer data to CPU. If not dotransfer, data is not copied back. 
*/ - void tocpu(bool dotransfer=true) const { - if(ongpu() && capacity > 0 && dotransfer) { - LMG_CUDA_CHECK(cudaMemcpy(cpu_ptr.get(),gpu_info->gpu_ptr,capacity*sizeof(Dtype),cudaMemcpyDeviceToHost)); + void tocpu(bool dotransfer = true) const + { + if (ongpu() && capacity > 0 && dotransfer) + { + LMG_CUDA_CHECK(cudaMemcpy(cpu_ptr.get(), gpu_info->gpu_ptr, capacity * sizeof(Dtype), cudaMemcpyDeviceToHost)); } - if(gpu_info) gpu_info->sent_to_gpu = false; + if (gpu_info) + gpu_info->sent_to_gpu = false; } /** \brief Return true if memory is currently on GPU */ - bool ongpu() const { + bool ongpu() const + { bool ret = gpu_info && gpu_info->sent_to_gpu; - if(ret && gpu_grid.data() == nullptr) togpu(); //needed if this is a copy made before another transfered + if (ret && gpu_grid.data() == nullptr) + togpu(); // needed if this is a copy made before another transfered return ret; } /** \brief Return true if memory is currently on CPU */ bool oncpu() const { return gpu_info == nullptr || !gpu_info->sent_to_gpu; } - operator cpu_grid_t() const { return cpu(); } - operator cpu_grid_t&() {return cpu(); } + operator cpu_grid_t &() { return cpu(); } operator gpu_grid_t() const { return gpu(); } - operator gpu_grid_t&() {return gpu(); } + operator gpu_grid_t &() { return gpu(); } /// Return pointer to CPU data - inline const Dtype * data() const { tocpu(); return cpu().data(); } - inline Dtype * data() { tocpu(); return cpu().data(); } + inline const Dtype *data() const + { + tocpu(); + return cpu().data(); + } + inline Dtype *data() + { + tocpu(); + return cpu().data(); + } - //pointer equality - bool operator==(const ManagedGridBase& rhs) const { + // pointer equality + bool operator==(const ManagedGridBase &rhs) const + { return cpu_ptr == rhs.cpu_ptr; } + protected: // constructor used by operator[] - friend ManagedGridBase; - explicit ManagedGridBase(const ManagedGridBase& G, size_t i): - gpu_grid(G.gpu_grid, i), cpu_grid(G.cpu_grid, i), cpu_ptr(G.cpu_ptr), - 
capacity(G.capacity), gpu_info(G.gpu_info) {} -}; - -/** \brief A dense grid whose memory is managed by the class. - * - * Memory is allocated as unified memory so it can be safely accessed - * from either the CPU or GPU. Note that while the memory is accessible - * on the GPU, ManagedGrid objects should only be used directly on the host. - * Device code should use a Grid view of the ManagedGrid. Accessing ManagedGrid - * data from the host while the GPU is writing to the grid will result in - * undefined behavior. - * - * If CUDA fails to allocate unified memory (presumably due to lack of GPUs), - * host-only memory will be used instead. - * - * Grid is a container class of ManagedGrid instead of a base class. Explicit - * transformation to a Grid view is required to clearly indicate how the memory - * should be accessed. This can be done with cpu and gpu methods or by an - * explicit cast to Grid. - * - * There are two class specialization to support bracket indexing. - */ -template -class ManagedGrid : public ManagedGridBase { + friend ManagedGridBase; + explicit ManagedGridBase(const ManagedGridBase &G, size_t i) : gpu_grid(G.gpu_grid, i), cpu_grid(G.cpu_grid, i), cpu_ptr(G.cpu_ptr), + capacity(G.capacity), gpu_info(G.gpu_info) {} + }; + + /** \brief A dense grid whose memory is managed by the class. + * + * Memory is allocated as unified memory so it can be safely accessed + * from either the CPU or GPU. Note that while the memory is accessible + * on the GPU, ManagedGrid objects should only be used directly on the host. + * Device code should use a Grid view of the ManagedGrid. Accessing ManagedGrid + * data from the host while the GPU is writing to the grid will result in + * undefined behavior. + * + * If CUDA fails to allocate unified memory (presumably due to lack of GPUs), + * host-only memory will be used instead. + * + * Grid is a container class of ManagedGrid instead of a base class. 
Explicit + * transformation to a Grid view is required to clearly indicate how the memory + * should be accessed. This can be done with cpu and gpu methods or by an + * explicit cast to Grid. + * + * There are two class specialization to support bracket indexing. + */ + template + class ManagedGrid : public ManagedGridBase + { + protected: + ManagedGrid(size_t *sizes) : ManagedGridBase(sizes) + { + } public: - using subgrid_t = ManagedGrid; + using subgrid_t = ManagedGrid; using base_t = ManagedGridBase; - //empty, unusable grid + // empty, unusable grid ManagedGrid() = default; - template::type> - ManagedGrid(I... sizes): ManagedGridBase(sizes...) { + template ::type> + ManagedGrid(I... sizes) : ManagedGridBase(sizes...) + { } /** \brief Bracket indexing. @@ -377,79 +490,128 @@ class ManagedGrid : public ManagedGridBase { * but not maximally efficient (unless the compiler is really good). * Use operator() for fastest (but unchecked) access or access data directly. */ - subgrid_t operator[](size_t i) const { - if(i >= this->cpu_grid.dimension(0)) - throw std::out_of_range("Index "+boost::lexical_cast(i)+" out of bounds of dimension "+boost::lexical_cast(this->cpu_grid.dimension(0))); - return ManagedGrid(*static_cast *>(this), i); + subgrid_t operator[](size_t i) const + { + if (i >= this->cpu_grid.dimension(0)) + throw std::out_of_range("Index " + boost::lexical_cast(i) + " out of bounds of dimension " + boost::lexical_cast(this->cpu_grid.dimension(0))); + return ManagedGrid(*static_cast *>(this), i); } /** \brief Return a copy of this grid */ - ManagedGrid clone() const { + ManagedGrid clone() const + { ManagedGrid ret(*this); ret.clone_ptrs(); return ret; } - protected: - //only called from regular Grid - friend ManagedGrid; - explicit ManagedGrid(const ManagedGridBase& G, size_t i): - ManagedGridBase(G, i) {} -}; + template + void save(Archive &ar, const unsigned int version) const + { + this->tocpu(); + for (size_t i = 0; i < NumDims; i++) + { + ar << 
this->dimension(i); + } + ar << boost::serialization::make_array(this->data(), this->size()); + } + + template + void load(Archive &ar, const unsigned int version) + { + size_t newdims[NumDims] = {0,}; + for (size_t i = 0; i < NumDims; i++) + { + ar >> newdims[i]; + } + ManagedGrid tmp(newdims); + ar >> boost::serialization::make_array(tmp.data(), tmp.size()); + *this = tmp; + } + BOOST_SERIALIZATION_SPLIT_MEMBER() -// class specialization of managed grid to make final operator[] return scalar -template -class ManagedGrid : public ManagedGridBase { + protected: + // only called from regular Grid + friend ManagedGrid; + explicit ManagedGrid(const ManagedGridBase &G, size_t i) : ManagedGridBase(G, i) {} + }; + + // class specialization of managed grid to make final operator[] return scalar + template + class ManagedGrid : public ManagedGridBase + { public: using subgrid_t = Dtype; using base_t = ManagedGridBase; ManagedGrid() = default; - ManagedGrid(size_t sz): ManagedGridBase(sz) { + ManagedGrid(size_t sz) : ManagedGridBase(sz) + { } - inline Dtype& operator[](size_t i) { + inline Dtype &operator[](size_t i) + { this->tocpu(); return this->cpu_grid[i]; } - inline Dtype operator[](size_t i) const { + inline Dtype operator[](size_t i) const + { this->tocpu(); return this->cpu_grid[i]; } - inline Dtype& operator()(size_t a) { + inline Dtype &operator()(size_t a) + { this->tocpu(); return this->cpu_grid(a); } - inline Dtype operator()(size_t a) const { + inline Dtype operator()(size_t a) const + { this->tocpu(); return this->cpu_grid(a); } - ManagedGrid clone() const { + ManagedGrid clone() const + { ManagedGrid ret(*this); ret.clone_ptrs(); return ret; } - protected: - //only called from regular Grid - friend ManagedGrid; - explicit ManagedGrid(const ManagedGridBase& G, size_t i): - ManagedGridBase(G, i) {} + template + void save(Archive &ar, const unsigned int version) const + { + this->tocpu(); + ar << this->dimension(0); + ar << 
boost::serialization::make_array(this->data(), this->size()); + } -}; + template + void load(Archive &ar, const unsigned int version) + { + size_t newdim = 0; + ar >> newdim; + ManagedGrid tmp(newdim); + ar >> boost::serialization::make_array(tmp.data(), tmp.size()); + *this = tmp; + } + BOOST_SERIALIZATION_SPLIT_MEMBER() -#define EXPAND_MGRID_DEFINITIONS(Z,SIZE,_) \ -typedef ManagedGrid MGrid##SIZE##f; \ -typedef ManagedGrid MGrid##SIZE##d; + protected: + // only called from regular Grid + friend ManagedGrid; + explicit ManagedGrid(const ManagedGridBase &G, size_t i) : ManagedGridBase(G, i) {} + }; +#define EXPAND_MGRID_DEFINITIONS(Z, SIZE, _) \ + typedef ManagedGrid MGrid##SIZE##f; \ + typedef ManagedGrid MGrid##SIZE##d; -BOOST_PP_REPEAT_FROM_TO(1,LIBMOLGRID_MAX_GRID_DIM, EXPAND_MGRID_DEFINITIONS, 0); + BOOST_PP_REPEAT_FROM_TO(1, LIBMOLGRID_MAX_GRID_DIM, EXPAND_MGRID_DEFINITIONS, 0); } #endif /* MANAGED_GRID_H_ */ diff --git a/include/libmolgrid/string_cache.h b/include/libmolgrid/string_cache.h new file mode 100644 index 0000000..fe37ac6 --- /dev/null +++ b/include/libmolgrid/string_cache.h @@ -0,0 +1,30 @@ +/** \file string_cache.h + * + * Class for avoiding duplicate memory allocation of identical strings. 
+ * Use const char* + */ + +#ifndef STRING_CACHE_H_ +#define STRING_CACHE_H_ + +#include +#include + +namespace libmolgrid { + +//for memory efficiency, only store a given string once and use the const char* +class StringCache { + std::unordered_set strings; +public: + const char* get(const std::string& s) + { + strings.insert(s); + //we assume even as the set is resized that strings never get allocated + return strings.find(s)->c_str(); + } +}; + +extern StringCache string_cache; + +}; //namespace libmolgrid +#endif // STRING_CACHE_H_ \ No newline at end of file diff --git a/python/bindings.h b/python/bindings.h index 592bbea..2723d6d 100644 --- a/python/bindings.h +++ b/python/bindings.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -25,6 +26,7 @@ #define TYPEARG(Z, N, T) BOOST_PP_COMMA_IF(N) T #define NTYPES(N, T) BOOST_PP_REPEAT(N, TYPEARG, T) + extern bool python_gpu_enabled; bool init_numpy(); diff --git a/python/bindings.in.cpp b/python/bindings.in.cpp index 0549560..65158ff 100644 --- a/python/bindings.in.cpp +++ b/python/bindings.in.cpp @@ -190,7 +190,7 @@ std::vector< std::vector > listlist_to_vecvec(list l) { class PythonCallbackIndexTyper: public CallbackIndexTyper { boost::python::object callback; - + using CallbackIndexTyper::get_atom_type_index; public: /// iniitalize callbacktyper, if names are not provided, numerical names will be generated @@ -217,7 +217,7 @@ class PythonCallbackIndexTyper: public CallbackIndexTyper { class PythonCallbackVectorTyper: public CallbackVectorTyper { boost::python::object callback; - + using CallbackVectorTyper::get_atom_type_vector; public: /// iniitalize callbacktyper, if names are not provided, numerical names will be generated @@ -555,7 +555,7 @@ MAKE_ALL_GRIDS() init( (arg("func"), arg("num_types"), arg("names") = list() ) )) .def("num_types", &PythonCallbackIndexTyper::num_types) - .def("get_atom_type_index", &PythonCallbackIndexTyper::get_atom_type_index) + .def 
(PythonCallbackIndexTyper::*)(object a) const>("get_atom_type_index", &PythonCallbackIndexTyper::get_atom_type_index) .def("get_type_names",&PythonCallbackIndexTyper::get_type_names); implicitly_convertible, std::shared_ptr >(); @@ -574,7 +574,7 @@ MAKE_ALL_GRIDS() init( (arg("func"), arg("num_types"), arg("names") = list() ) )) .def("num_types", &PythonCallbackVectorTyper::num_types) - .def("get_atom_type_vector", &PythonCallbackVectorTyper::get_atom_type_vector) + .def< tuple (PythonCallbackVectorTyper::*)(object a) const >("get_atom_type_vector", &PythonCallbackVectorTyper::get_atom_type_vector) .def("get_type_names",&PythonCallbackVectorTyper::get_type_names); implicitly_convertible, std::shared_ptr >(); @@ -768,6 +768,9 @@ MAKE_ALL_GRIDS() .def("get_large_epoch_num", &ExampleProvider::get_large_epoch_num, "Return large epoch number, where an epoch means every example has been seen at LEAST once.") .def("small_epoch_size", &ExampleProvider::small_epoch_size,"Return size of small epoch") .def("large_epoch_size", &ExampleProvider::large_epoch_size,"Return size of large epoch") + .def("mem_caches_size", &ExampleProvider::mem_caches_size, "Return number of elements currently cached in-memory.") + .def("save_mem_caches", static_cast(&ExampleProvider::save_mem_caches),arg("file_name"),"Write current in-memory cache state to specified file.") + .def("load_mem_caches", static_cast(&ExampleProvider::load_mem_caches),arg("file_name"),"Read specified file into in-memory cache.") .def("reset", &ExampleProvider::reset, "Reset iterator to beginning") .def("__iter__", +[](object self) { return self;}) .def("__next__", +[](ExampleProvider& self) -> std::vector { diff --git a/python/torch_bindings.py b/python/torch_bindings.py index 3632422..fd60596 100644 --- a/python/torch_bindings.py +++ b/python/torch_bindings.py @@ -1,6 +1,11 @@ +import math import torch +from torch.distributed import get_rank, get_world_size +from torch.utils.data import DistributedSampler import molgrid 
as mg import types +from itertools import islice + def tensor_as_grid(t): '''Return a Grid view of tensor t''' gname = 'Grid' @@ -157,9 +162,12 @@ def extra_repr(self): self.gmaker.get_resolution(), self.gmaker.get_dimension(), self.center[0], self.center[1], self.center[2]) -class MolDataset(torch.utils.data.Dataset): +class MolMapDataset(torch.utils.data.Dataset): '''A pytorch mappable dataset for molgrid training files.''' - def __init__(self, *args, **kwargs): + def __init__(self, *args, + random_translation: float=0.0, + random_rotation: bool=False, + **kwargs): '''Initialize mappable MolGridDataset. :param input(s): File name(s) of training example files :param typers: A tuple of AtomTypers to use @@ -173,9 +181,9 @@ def __init__(self, *args, **kwargs): :param ligmolcache: precalculated molcache2 file for ligand; if doesn't exist, will look in data_root ''' + self._random_translation, self._random_rotation = random_translation, random_rotation if 'typers' in kwargs: - typers = kwargs['typers'] - del kwargs['typers'] + typers = kwargs.pop('typers') self.examples = mg.ExampleDataset(*typers,**kwargs) self.typers = typers else: @@ -184,39 +192,42 @@ def __init__(self, *args, **kwargs): self.types_files = list(args) self.examples.populate(self.types_files) - self.num_labels = self.examples.num_labels() - - def __len__(self): return len(self.examples) def __getitem__(self, idx): ex = self.examples[idx] - center = torch.tensor([i for i in ex.coord_sets[-1].center()]) + center = torch.tensor(list(ex.coord_sets[-1].center())) coordinates = ex.merge_coordinates() + if self._random_translation > 0 or self._random_rotation: + mg.Transform(ex.coord_sets[-1].center(), self._random_translation, self._random_rotation).forward(coordinates, coordinates) if coordinates.has_vector_types() and coordinates.size() > 0: atomtypes = torch.tensor(coordinates.type_vector.tonumpy(),dtype=torch.long).type('torch.FloatTensor') else: atomtypes = 
torch.tensor(coordinates.type_index.tonumpy(),dtype=torch.long).type('torch.FloatTensor') coords = torch.tensor(coordinates.coords.tonumpy()) + length = len(coords) radii = torch.tensor(coordinates.radii.tonumpy()) - labels = [ex.labels[lab] for lab in range(self.num_labels)] - return center, coords, atomtypes, radii, labels + labels = torch.tensor(ex.labels) + return length, center, coords, atomtypes, radii, labels + def __getstate__(self): - settings = self.examples.settings() + settings = self.examples.settings() keyword_dict = {sett: getattr(settings, sett) for sett in dir(settings) if not sett.startswith('__')} if self.typers is not None: ## This will fail if self.typers is not none, need a way to pickle AtomTypers - raise NotImplementedError('MolDataset does not support pickling when not using the default Gnina atom typers, this uses %s'.format(str(self.typers))) + raise NotImplementedError('MolMapDataset does not support pickling when not using the default Gnina atom typers, this uses %s'.format(str(self.typers))) keyword_dict['typers'] = self.typers + keyword_dict['random_translation'] = self._random_translation + keyword_dict['random_rotation'] = self._random_rotation return keyword_dict, self.types_files def __setstate__(self,state): kwargs=state[0] - + self._random_translation = kwargs.pop('random_translation') + self._random_rotation = kwargs.pop('random_rotation') if 'typers' in kwargs: - typers = kwargs['typers'] - del kwargs['typers'] + typers = kwargs.pop('typers') self.examples = mg.ExampleDataset(*typers, **kwargs) self.typers = typers else: @@ -225,33 +236,137 @@ def __setstate__(self,state): self.types_files = list(state[1]) self.examples.populate(self.types_files) - self.num_labels = self.examples.num_labels() @staticmethod def collateMolDataset(batch): - '''collate_fn for use in torch.utils.data.Dataloader when using the MolDataset. + '''collate_fn for use in torch.utils.data.Dataloader when using the MolMapDataset. 
Returns lengths, centers, coords, types, radii, labels all padded to fit maximum size of batch''' - lens = [] - centers = [] - lcoords = [] - ltypes = [] - lradii = [] - labels = [] - for center,coords,types,radii,label in batch: - lens.append(coords.shape[0]) - centers.append(center) - lcoords.append(coords) - ltypes.append(types) - lradii.append(radii) - labels.append(torch.tensor(label)) + batch_list = list(zip(*batch)) + lengths = torch.tensor(batch_list[0]) + centers = torch.stack(batch_list[1], dim=0) + coords = torch.nn.utils.rnn.pad_sequence(batch_list[2], batch_first=True) + types = torch.nn.utils.rnn.pad_sequence(batch_list[3], batch_first=True) + radii = torch.nn.utils.rnn.pad_sequence(batch_list[4], batch_first=True) + labels = torch.stack(batch_list[5], dim=0) + + return lengths, centers, coords, types, radii, labels + +class MolIterDataset(torch.utils.data.IterableDataset): + '''A pytorch iterable dataset for molgrid training files. Use with a DataLoader(batch_size=None) for best results.''' + def __init__(self, *args, + random_translation: float=0.0, + random_rotation: bool=False, + **kwargs): + '''Initialize mappable MolGridDataset. 
+ :param input(s): File name(s) of training example files + :param typers: A tuple of AtomTypers to use + :type typers: tuple + :param cache_structs: retain coordinates in memory for faster training + :param add_hydrogens: protonate molecules read using openbabel + :param duplicate_first: clone the first coordinate set to be paired with each of the remaining (receptor-ligand pairs) + :param make_vector_types: convert index types into one-hot encoded vector types + :param data_root: prefix for data files + :param recmolcache: precalculated molcache2 file for receptor (first molecule); if doesn't exist, will look in data _root + :param ligmolcache: precalculated molcache2 file for ligand; if doesn't exist, will look in data_root + ''' + + # molgrid.set_random_seed(kwargs['random_seed']) + self._random_translation, self._random_rotation = random_translation, random_rotation + if 'typers' in kwargs: + typers = kwargs.pop('typers') + self.examples = mg.ExampleProvider(*typers,**kwargs) + self.typers = typers + else: + self.examples = mg.ExampleProvider(**kwargs) + self.typers = None + self.types_files = list(args) + self.examples.populate(self.types_files) + + self._num_labels = self.examples.num_labels() + + def generate(self): + for batch in self.examples: + yield self.batch_to_tensors(batch) + + def batch_to_tensors(self, batch): + batch_lengths = torch.zeros(len(batch), dtype=torch.int64) + batch_centers = torch.zeros((len(batch), 3), dtype=torch.float32) + batch_coords = [] + batch_atomtypes = [] + batch_radii = [] + batch_labels = torch.zeros((len(batch),self._num_labels), dtype=torch.float32) + for idx, ex in enumerate(batch): + length, center, coords, atomtypes, radii, labels = self.example_to_tensor(ex) + batch_lengths[idx] = length + batch_centers[idx,:] = center + batch_coords.append(coords) + batch_atomtypes.append(atomtypes) + batch_radii.append(radii) + batch_labels[idx,:] = labels + pad_coords = torch.nn.utils.rnn.pad_sequence(batch_coords, 
batch_first=True) + pad_atomtypes = torch.nn.utils.rnn.pad_sequence(batch_atomtypes, batch_first=True) + pad_radii = torch.nn.utils.rnn.pad_sequence(batch_radii, batch_first=True) + return batch_lengths, batch_centers, pad_coords, pad_atomtypes, pad_radii, batch_labels - lengths = torch.tensor(lens) - lcoords = torch.nn.utils.rnn.pad_sequence(lcoords, batch_first=True) - ltypes = torch.nn.utils.rnn.pad_sequence(ltypes, batch_first=True) - lradii = torch.nn.utils.rnn.pad_sequence(lradii, batch_first=True) + def example_to_tensor(self, ex): + center = torch.tensor(list(ex.coord_sets[-1].center())) + coordinates = ex.merge_coordinates() + if self._random_translation > 0 or self._random_rotation: + mg.Transform(ex.coord_sets[-1].center(), self._random_translation, self._random_rotation).forward(coordinates, coordinates) + if coordinates.has_vector_types() and coordinates.size() > 0: + atomtypes = torch.tensor(coordinates.type_vector.tonumpy(),dtype=torch.long).type('torch.FloatTensor') + else: + atomtypes = torch.tensor(coordinates.type_index.tonumpy(),dtype=torch.long).type('torch.FloatTensor') + coords = torch.tensor(coordinates.coords.tonumpy()) + length = len(coords) + radii = torch.tensor(coordinates.radii.tonumpy()) + labels = torch.tensor(ex.labels) + return length, center, coords, atomtypes, radii, labels + + def __iter__(self): + worker_info = torch.utils.data.get_worker_info() + worker_id = worker_info.id if worker_info is not None else 0 + n_workers = worker_info.num_workers if worker_info is not None else 1 + + world_size = get_world_size() if torch.distributed.is_initialized() else 1 + if world_size == 1: + return islice(self.generate(), worker_id, None, n_workers) + process_rank = get_rank() + + return islice(self.generate(), process_rank * n_workers + worker_id, None, n_workers * world_size) + + def __len__(self): + settings = self.examples.settings() + batch_size = settings.default_batch_size + if settings.iteration_scheme == 
mg.IterationScheme.SmallEpoch: + return self.examples.small_epoch_size() // batch_size + elif settings.iteration_scheme == mg.IterationScheme.LargeEpoch: + return math.ceil(self.examples.large_epoch_size() / batch_size) + else: + NotImplementedError('Iteration scheme %s not supported'.format(itr_scheme)) - centers = torch.stack(centers,dim=0) - labels = torch.stack(labels,dim=0) - return lengths, centers, lcoords, ltypes, lradii, labels + def __getstate__(self): + settings = self.examples.settings() + keyword_dict = {sett: getattr(settings, sett) for sett in dir(settings) if not sett.startswith('__')} + if self.typers is not None: ## This will fail if self.typers is not none, need a way to pickle AtomTypers + raise NotImplementedError('MolIterDataset does not support pickling when not using the default Gnina atom typers, this uses %s'.format(str(self.typers))) + keyword_dict['typers'] = self.typers + keyword_dict['random_translation'] = self._random_translation + keyword_dict['random_rotation'] = self._random_rotation + return keyword_dict, self.types_files + + def __setstate__(self,state): + kwargs=state[0] + self._random_translation = kwargs.pop('random_translation') + self._random_rotation = kwargs.pop('random_rotation') + if 'typers' in kwargs: + typers = kwargs.pop('typers') + self.examples = mg.ExampleProvider(*typers, **kwargs) + self.typers = typers + else: + self.examples = mg.ExampleProvider(**kwargs) + self.typers = None + self.types_files = list(state[1]) + self.examples.populate(self.types_files) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index dc047e6..8aa92d6 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -18,6 +18,7 @@ set( LIBMOLGRID_SOURCES grid_interpolater.cpp grid_interpolater.cu cartesian_grid.cpp + string_cache.cpp ) set( LIBMOLGRID_HEADERS @@ -39,6 +40,7 @@ set( LIBMOLGRID_HEADERS ../include/libmolgrid/common.h ../include/libmolgrid/grid_io.h ../include/libmolgrid/cartesian_grid.h + ../include/libmolgrid/string_cache.h 
) set ( LIBMOLGRID_HEADERS ${LIBMOLGRID_HEADERS} PARENT_SCOPE) diff --git a/src/coord_cache.cpp b/src/coord_cache.cpp index e1bd36f..d285ce2 100644 --- a/src/coord_cache.cpp +++ b/src/coord_cache.cpp @@ -6,13 +6,13 @@ */ #include "libmolgrid/coord_cache.h" +#include "libmolgrid/string_cache.h" #include -#include #include -#include +#include #include - +#include namespace libmolgrid { @@ -21,135 +21,139 @@ using namespace OpenBabel; unsigned do_not_optimize_away; -//read in molcache if present -CoordCache::CoordCache(std::shared_ptr t, const ExampleProviderSettings& settings, - const std::string& mc): typer(t), data_root(settings.data_root), molcache(mc), - use_cache(settings.cache_structs), addh(settings.add_hydrogens), make_vector_types(settings.make_vector_types) { - if(molcache.length() > 0) { +// read in molcache if present +CoordCache::CoordCache(std::shared_ptr t, + const ExampleProviderSettings &settings, + const std::string &mc) + : typer(t), data_root(settings.data_root), molcache(mc), + use_cache(settings.cache_structs), addh(settings.add_hydrogens), + make_vector_types(settings.make_vector_types) { + if (molcache.length() > 0) { static_assert(sizeof(size_t) == 8, "size_t must be 8 bytes"); - if(!boost::filesystem::exists(molcache)) { - //try looking in dataroot + if (!boost::filesystem::exists(molcache)) { + // try looking in dataroot molcache = (boost::filesystem::path(data_root) / molcache).string(); } ifstream mcache(molcache.c_str()); - if(!mcache) throw invalid_argument("Could not open file: "+molcache); + if (!mcache) + throw invalid_argument("Could not open file: " + molcache); int version = 0; - mcache.read((char*)&version,sizeof(int)); - if(version != -1) { - throw invalid_argument(molcache+" is not a valid molcache2 file"); + mcache.read((char *)&version, sizeof(int)); + if (version != -1) { + throw invalid_argument(molcache + " is not a valid molcache2 file"); } size_t start = 0; - mcache.read((char*)&start, sizeof(size_t)); + 
mcache.read((char *)&start, sizeof(size_t)); mcache.seekg(start); - //open memory map for moldata - cache_map.open(molcache.c_str(),start); - if(!cache_map.is_open()) throw logic_error("Could not memory map "+molcache); + // open memory map for moldata + cache_map.open(molcache.c_str(), start); + if (!cache_map.is_open()) + throw logic_error("Could not memory map " + molcache); - //read in name + offset + // read in name + offset unsigned char len = 0; char molname[256]; size_t offset = 0; - while(mcache.read((char*)&len, 1)) { - //read name then offset + while (mcache.read((char *)&len, 1)) { + // read name then offset mcache.read(molname, len); molname[len] = 0; - mcache.read((char*)&offset, sizeof(size_t)); + mcache.read((char *)&offset, sizeof(size_t)); string mname(molname); offsets[string_cache.get(mname)] = offset; } - //prefetch into file cache + // prefetch into file cache unsigned sum = 0; - for(unsigned i = 0, n = cache_map.size(); i < n; i += 1024) { + for (unsigned i = 0, n = cache_map.size(); i < n; i += 1024) { sum += cache_map.data()[i]; } do_not_optimize_away = sum; } - } -//set coords using the cache -void CoordCache::set_coords(const char *fname, CoordinateSet& coord) { +// set coords using the cache +void CoordCache::set_coords(const char *fname, CoordinateSet &coord) { struct info { - float x,y,z; + float x, y, z; int type; }; - if(offsets.count(fname)) { + if (offsets.count(fname)) { size_t off = offsets[fname]; - const char *data = cache_map.data()+off; - unsigned natoms = *(unsigned*)data; - info *atoms = (info*)(data+sizeof(unsigned)); + const char *data = cache_map.data() + off; + unsigned natoms = *(unsigned *)data; + info *atoms = (info *)(data + sizeof(unsigned)); - if(typer->is_vector_typer()) + if (typer->is_vector_typer()) throw invalid_argument("Vector typer used with molcache files"); - vector c; c.reserve(natoms); - vector r; r.reserve(natoms); - vector t; t.reserve(natoms); - for(unsigned i = 0; i < natoms; i++) - { - info& atom 
= atoms[i]; + vector c; + c.reserve(natoms); + vector r; + r.reserve(natoms); + vector t; + t.reserve(natoms); + for (unsigned i = 0; i < natoms; i++) { + info &atom = atoms[i]; auto t_r = typer->get_int_type(atom.type); - if(t_r.first >= 0) { //ignore neg + if (t_r.first >= 0) { // ignore neg t.push_back(t_r.first); r.push_back(t_r.second); - c.push_back(make_float3(atom.x,atom.y,atom.z)); + c.push_back(make_float3(atom.x, atom.y, atom.z)); } } coord = CoordinateSet(c, t, r, typer->num_types()); coord.src = fname; - } - else if(memcache.count(fname)) { - coord = memcache[fname].clone(); //always copy out of cache + } else if (memcache.count(fname)) { + coord = memcache[fname].clone(); // always copy out of cache } else { std::string fullname = fname; - if(data_root.length()) { - boost::filesystem::path p = boost::filesystem::path(data_root) / boost::filesystem::path(fname); + if (data_root.length()) { + boost::filesystem::path p = + boost::filesystem::path(data_root) / boost::filesystem::path(fname); fullname = p.string(); } - //check for custom gninatypes file - if(boost::algorithm::ends_with(fname,".gninatypes")) - { - if(typer->is_vector_typer()) + // check for custom gninatypes file + if (boost::algorithm::ends_with(fname, ".gninatypes")) { + if (typer->is_vector_typer()) throw invalid_argument("Vector typer used with gninatypes files"); ifstream in(fullname.c_str()); - if(!in) throw invalid_argument("Could not read "+fullname); + if (!in) + throw invalid_argument("Could not read " + fullname); vector c; vector r; vector t; info atom; - while(in.read((char*)&atom, sizeof(atom))) - { + while (in.read((char *)&atom, sizeof(atom))) { auto t_r = typer->get_int_type(atom.type); - if(t_r.first >= 0) { //ignore neg + if (t_r.first >= 0) { // ignore neg t.push_back(t_r.first); r.push_back(t_r.second); - c.push_back(make_float3(atom.x,atom.y,atom.z)); + c.push_back(make_float3(atom.x, atom.y, atom.z)); } } coord = CoordinateSet(c, t, r, typer->num_types()); coord.src = 
fname; - } - else if(!boost::algorithm::ends_with(fname,"none")) //reserved word + } else if (!boost::algorithm::ends_with(fname, "none")) // reserved word { - //read mol from file and set mol info (atom coords and grid positions) + // read mol from file and set mol info (atom coords and grid positions) OBConversion conv; OBMol mol; - if(!conv.ReadFile(&mol, fullname.c_str())) + if (!conv.ReadFile(&mol, fullname.c_str())) throw invalid_argument("Could not read " + fullname); - if(addh) { + if (addh) { mol.AddHydrogens(); } @@ -157,18 +161,38 @@ void CoordCache::set_coords(const char *fname, CoordinateSet& coord) { coord.src = fname; } else { coord = CoordinateSet(); - coord.max_type = typer->num_types(); //empty, but include type size + coord.max_type = typer->num_types(); // empty, but include type size } - AtomIndexTyper *ityper = dynamic_cast(typer.get()); - if(make_vector_types && ityper) { + AtomIndexTyper *ityper = dynamic_cast(typer.get()); + if (make_vector_types && ityper) { coord.make_vector_types(false, ityper->get_type_radii()); } - if(use_cache) { //save coord - memcache[fname] = coord.clone(); //save a copy in case the returned set is modified + if (use_cache) { // save coord + memcache[fname] = + coord.clone(); // save a copy in case the returned set is modified } } } +void CoordCache::save_mem_cache(std::ostream &out) const { + boost::archive::binary_oarchive oarch(out); + + // must convert to std::string as storing pointers makes no sense + std::unordered_map tmp; + tmp.insert(memcache.begin(), memcache.end()); + oarch << tmp; +} + + +void CoordCache::load_mem_cache(std::istream &in) { + boost::archive::binary_iarchive iarch(in); + std::unordered_map tmp; + iarch >> tmp; + for (const auto& kv : tmp) { + const char *fname = string_cache.get(kv.first); + memcache[fname] = kv.second; + } +} } /* namespace libmolgrid */ diff --git a/src/coordinateset.cpp b/src/coordinateset.cpp index a9525a4..af1a867 100644 --- a/src/coordinateset.cpp +++ 
b/src/coordinateset.cpp @@ -274,7 +274,8 @@ void CoordinateSet::mergeInto(const CoordinateSet& rec, const CoordinateSet& lig if(rec.type_vector.dimension(1) != lig.type_vector.dimension(1)) { throw std::invalid_argument("Type vectors are incompatible sizes"); } - if(rec.has_vector_types() != lig.has_vector_types() || rec.has_indexed_types() != lig.has_indexed_types()) { + if((rec.has_vector_types() != lig.has_vector_types()) && (rec.has_indexed_types() != lig.has_indexed_types())) { + //empty coordsets are true for both, so and throw std::invalid_argument("Incompatible types when combining coodinate sets"); } if(rec.has_indexed_types()) { diff --git a/src/example.cpp b/src/example.cpp index 30fb648..4ef2850 100644 --- a/src/example.cpp +++ b/src/example.cpp @@ -17,8 +17,6 @@ namespace libmolgrid { using namespace std; -StringCache string_cache; - size_t Example::num_coordinates() const { unsigned N = 0; for(unsigned i = 0, n = sets.size(); i < n; i++) { diff --git a/src/example_extractor.cpp b/src/example_extractor.cpp index 527c5ba..dc1c134 100644 --- a/src/example_extractor.cpp +++ b/src/example_extractor.cpp @@ -10,59 +10,63 @@ #include #include -#include #include +#include namespace libmolgrid { using namespace std; using namespace OpenBabel; -void ExampleExtractor::extract(const ExampleRef& ref, Example& ex) { - ex.labels = ref.labels; //the easy part +void ExampleExtractor::extract(const ExampleRef &ref, Example &ex) { + ex.labels = ref.labels; // the easy part ex.group = ref.group; ex.seqcont = ref.seqcont; - //for each file in ref, get a coordinate set using the matching typer + // for each file in ref, get a coordinate set using the matching typer ex.sets.clear(); - if(!duplicate_poses || ref.files.size() < 3) { + if (!duplicate_poses || ref.files.size() < 3) { ex.sets.resize(ref.files.size()); - for(unsigned i = 0, n = ref.files.size(); i < n; i++) { - const char* fname = ref.files[i]; + for (unsigned i = 0, n = ref.files.size(); i < n; i++) { + const 
char *fname = ref.files[i]; unsigned t = i; - if(t >= coord_caches.size()) t = coord_caches.size()-1; //repeat last typer if necessary + if (t >= coord_caches.size()) + t = coord_caches.size() - 1; // repeat last typer if necessary coord_caches[t].set_coords(fname, ex.sets[i]); } - } else { //duplicate first pose (receptor) to match each of the remaining poses + } else { // duplicate first pose (receptor) to match each of the remaining + // poses unsigned N = ref.files.size() - 1; - ex.sets.resize(N*2); + ex.sets.resize(N * 2); coord_caches[0].set_coords(ref.files[0], ex.sets[0]); - for(unsigned i = 1, n = ref.files.size(); i < n; i++) { - const char* fname = ref.files[i]; + for (unsigned i = 1, n = ref.files.size(); i < n; i++) { + const char *fname = ref.files[i]; unsigned t = i; - if(t >= coord_caches.size()) t = coord_caches.size()-1; //repeat last typer if necessary - coord_caches[t].set_coords(fname, ex.sets[2*(i-1)+1]); + if (t >= coord_caches.size()) + t = coord_caches.size() - 1; // repeat last typer if necessary + coord_caches[t].set_coords(fname, ex.sets[2 * (i - 1) + 1]); - //duplicate receptor by copying - if(i > 1) ex.sets[2*(i-1)] = ex.sets[0]; + // duplicate receptor by copying + if (i > 1) + ex.sets[2 * (i - 1)] = ex.sets[0]; } } - } -//assume there are n files, return number oftypes +// assume there are n files, return number oftypes size_t ExampleExtractor::count_types(unsigned n) const { size_t ret = 0; - for(unsigned i = 0; i < n; i++) { + for (unsigned i = 0; i < n; i++) { unsigned t = i; - if(t >= coord_caches.size()) t = coord_caches.size()-1; + if (t >= coord_caches.size()) + t = coord_caches.size() - 1; ret += coord_caches[t].num_types(); } - if(duplicate_poses && coord_caches.size() > 2) { + if (duplicate_poses && coord_caches.size() > 2) { size_t rsize = coord_caches[0].num_types(); size_t dups = coord_caches.size() - 2; - ret += rsize*dups; + ret += rsize * dups; } return ret; } @@ -70,19 +74,39 @@ size_t ExampleExtractor::num_types() 
const { return count_types(coord_caches.size()); } -size_t ExampleExtractor::num_types(const ExampleRef& ref) const { +size_t ExampleExtractor::num_types(const ExampleRef &ref) const { return count_types(ref.files.size()); } std::vector ExampleExtractor::get_type_names() const { vector ret; - for(unsigned i = 0, n = coord_caches.size(); i < n; i++) { + for (unsigned i = 0, n = coord_caches.size(); i < n; i++) { vector names = coord_caches[i].get_type_names(); - for(auto name : names) { - ret.push_back(itoa(i)+"_"+name); + for (auto name : names) { + ret.push_back(itoa(i) + "_" + name); } } return ret; } +void ExampleExtractor::save_mem_caches(std::ostream &out) const { + for (const auto &c : coord_caches) { + c.save_mem_cache(out); + } +} + +void ExampleExtractor::load_mem_caches(std::istream &in) { + for (auto &c : coord_caches) { + c.load_mem_cache(in); + } +} + +size_t ExampleExtractor::mem_caches_size() const { + size_t num = 0; + for (const auto &c : coord_caches) { + num += c.mem_cache_size(); + } + return num; +} + } /* namespace libmolgrid */ diff --git a/src/string_cache.cpp b/src/string_cache.cpp new file mode 100644 index 0000000..007fefc --- /dev/null +++ b/src/string_cache.cpp @@ -0,0 +1,7 @@ +#include "libmolgrid/string_cache.h" + +namespace libmolgrid { + +StringCache string_cache; + +} \ No newline at end of file diff --git a/test/test_example_provider.py b/test/test_example_provider.py index 414abb4..4f7fe02 100644 --- a/test/test_example_provider.py +++ b/test/test_example_provider.py @@ -3,6 +3,7 @@ import numpy as np import os import torch +import tempfile from pytest import approx from numpy import around @@ -37,6 +38,28 @@ def test_mol_example_provider(capsys): assert (6.0000, 3.8697, -6.6990, -4.3010, -9.0000, 3.3747, 6.0000, 3.8697, -6.6990, -4.3010) == approx(tuple(l1)) +def test_mem_caches_example_provider(capsys): + fname = datadir + "/smallmol.types" + e = molgrid.ExampleProvider(data_root=datadir + "/structs") + e2 = 
molgrid.ExampleProvider(data_root=datadir + "/structs") + + e.populate(fname) + e2.populate(fname) + assert e.mem_caches_size() == 0 + with capsys.disabled(): # bunch openbabel garbage + ex = e.next() + assert e.mem_caches_size() == 2 + with tempfile.NamedTemporaryFile() as tmp: + e.save_mem_caches(tmp.name) + e.load_mem_caches(tmp.name) + e2.load_mem_caches(tmp.name) + assert e.mem_caches_size() == 2 + assert e2.mem_caches_size() == 2 + with capsys.disabled(): # bunch openbabel garbage + ex = e.next() + ex = e2.next() + assert e.mem_caches_size() == 4 + assert e2.mem_caches_size() == 2 def test_custom_typer_example_provider(): fname = datadir + "/small.types" @@ -331,19 +354,19 @@ def test_example_provider_iterator_interface(): break -def test_pytorch_dataset(): +def test_pytorch_mapdataset(): fname = datadir + "/small.types" e = molgrid.ExampleProvider(data_root=datadir + "/structs") e.populate(fname) - m = molgrid.MolDataset(fname, data_root=datadir + "/structs") + m = molgrid.MolMapDataset(fname, data_root=datadir + "/structs") assert len(m) == 1000 ex = e.next() coordinates = ex.merge_coordinates() - center, coords, types, radii, labels = m[0] + lengths, center, coords, types, radii, labels = m[0] assert list(center.shape) == [3] np.testing.assert_allclose(coords, coordinates.coords.tonumpy()) @@ -355,13 +378,13 @@ def test_pytorch_dataset(): np.testing.assert_allclose(labels[1], 6.05) np.testing.assert_allclose(labels[-1], 0.162643) - center, coords, types, radii, labels = m[-1] + lengths, center, coords, types, radii, labels = m[-1] assert labels[0] == 0 np.testing.assert_allclose(labels[1], -10.3) '''Testing out the collate_fn when used with torch.utils.data.DataLoader''' torch_loader = torch.utils.data.DataLoader( - m, batch_size=8, collate_fn=molgrid.MolDataset.collateMolDataset) + m, batch_size=8, collate_fn=molgrid.MolMapDataset.collateMolDataset) iterator = iter(torch_loader) next(iterator) lengths, center, coords, types, radii, labels = next(iterator) 
@@ -373,7 +396,7 @@ def test_pytorch_dataset(): assert radii.shape[0] == 8 assert labels.shape[0] == 8 - mcenter, mcoords, mtypes, mradii, mlabels = m[10] + mlengths, mcenter, mcoords, mtypes, mradii, mlabels = m[10] np.testing.assert_allclose(center[2], mcenter) np.testing.assert_allclose(coords[2][:lengths[2]], mcoords) np.testing.assert_allclose(types[2][:lengths[2]], mtypes) @@ -399,6 +422,73 @@ def test_pytorch_dataset(): singlegrid = molgrid.MGrid4f(*shape) gmaker.forward(ex, singlegrid.cpu()) np.testing.assert_allclose(mgrid[2].tonumpy(),singlegrid.tonumpy(),atol=1e-5) + +def test_pytorch_iterdataset(): + fname = datadir + "/small.types" + + BSIZE = 25 + e = molgrid.ExampleProvider(data_root=datadir + "/structs", default_batch_size=BSIZE) + e.populate(fname) + m = molgrid.MolIterDataset(fname, data_root=datadir + "/structs", default_batch_size=BSIZE) + m_iter = iter(m) + + ex = e.next() + coordinates = ex.merge_coordinates() + + lengths, centers, coords, types, radii, labels = next(m_iter) + + assert list(centers.shape) == [BSIZE,3] + np.testing.assert_allclose(coords[0,:lengths[0],:], coordinates.coords.tonumpy()) + np.testing.assert_allclose(types[0,:lengths[0]], coordinates.type_index.tonumpy()) + np.testing.assert_allclose(radii[0,:lengths[0]], coordinates.radii.tonumpy()) + + assert len(labels) == BSIZE + assert len(labels[0]) == 3 + assert labels[0,0] == 1 + np.testing.assert_allclose(labels[0,1], 6.05) + np.testing.assert_allclose(labels[0,-1], 0.162643) + + # ensure it works with more than 1 worker + m.examples.reset() + torch_loader = torch.utils.data.DataLoader( + m, batch_size=None, num_workers=2) + iterator = iter(torch_loader) + next(iterator) + lengths, center, coords, types, radii, labels = next(iterator) + assert len(lengths) == BSIZE + assert center.shape[0] == BSIZE + assert coords.shape[0] == BSIZE + assert types.shape[0] == BSIZE + assert radii.shape[0] == BSIZE + assert labels.shape[0] == BSIZE + + e.reset() + e.next_batch() + ex = 
e.next_batch() + coordinates = ex[2].merge_coordinates() + np.testing.assert_allclose(center[2], np.array(list(ex[2].coord_sets[-1].center()))) + np.testing.assert_allclose(coords[2,:lengths[2]], coordinates.coords.tonumpy()) + np.testing.assert_allclose(types[2,:lengths[2]], coordinates.type_index.tonumpy()) + np.testing.assert_allclose(radii[2,:lengths[2]], coordinates.radii.tonumpy()) + assert len(labels[2]) == e.num_labels() + assert labels[2,0] == ex[2].labels[0] + assert labels[2,1] == ex[2].labels[1] + + gmaker = molgrid.GridMaker() + shape = gmaker.grid_dimensions(e.num_types()) + mgrid = molgrid.MGrid5f(BSIZE,*shape) + + gmaker.forward(center, coords, types, radii, mgrid.cpu()) + + mgridg = molgrid.MGrid5f(BSIZE,*shape) + gmaker.forward(center.cuda(), coords.cuda(), types.cuda(), radii.cuda(), mgridg.gpu()) + + np.testing.assert_allclose(mgrid.tonumpy(),mgridg.tonumpy(),atol=1e-5) + + #compare against standard provider + egrid = molgrid.MGrid5f(BSIZE,*shape) + gmaker.forward(ex, egrid.cpu()) + np.testing.assert_allclose(mgridg.tonumpy(),egrid.tonumpy(),atol=1e-5) def test_duplicated_examples(): diff --git a/test/test_mgrid.cpp b/test/test_mgrid.cpp index 15a856c..014d892 100644 --- a/test/test_mgrid.cpp +++ b/test/test_mgrid.cpp @@ -7,9 +7,6 @@ #define BOOST_TEST_MODULE grid_test #include -#include -#include - #include "libmolgrid/managed_grid.h" using namespace libmolgrid; @@ -88,43 +85,6 @@ BOOST_AUTO_TEST_CASE( indirect_indexing ) } -BOOST_AUTO_TEST_CASE( grid_conversion ) -{ - MGrid3f g3(7,13,11); - MGrid1f g1(100); - - for(unsigned i = 0; i < 7; i++) - for(unsigned j = 0; j < 13; j++) - for(unsigned k = 0; k < 11; k++) { - g3[i][j][k] = i+j+k; - } - - for(unsigned i = 0; i < 100; i++) { - g1(i) = i; - } - - Grid3f cpu3(g3); - Grid1f cpu1 = g1.cpu(); - - float sum3 = thrust::reduce(thrust::host, cpu3.data(), cpu3.data()+cpu3.size()); - BOOST_CHECK_EQUAL(sum3,14014); - - float sum1 = thrust::reduce(thrust::host, cpu1.data(), cpu1.data()+cpu1.size()); - 
BOOST_CHECK_EQUAL(sum1,4950); - - MGrid6d g6(3,4,5,2,1,10); - g6[2][2][2][0][0][5] = 3.14; - Grid6d cpu6 = (Grid6d)g6; //cast conversion - BOOST_CHECK_EQUAL(cpu6.size(),1200); - BOOST_CHECK_EQUAL(cpu6(2,2,2,0,0,5), 3.14); - - Grid6dCUDA gpu6 = (Grid6dCUDA)g6; - double *cudaptr = gpu6.address(2,2,2,0,0,5); - double val = 0; - cudaMemcpy(&val, cudaptr, sizeof(double), cudaMemcpyDeviceToHost); - BOOST_CHECK_EQUAL(val, 3.14); - -} BOOST_AUTO_TEST_CASE( blank_mgrid ) diff --git a/test/test_mgrid.cu b/test/test_mgrid.cu index b4b1a42..71764c6 100644 --- a/test/test_mgrid.cu +++ b/test/test_mgrid.cu @@ -60,3 +60,41 @@ BOOST_AUTO_TEST_CASE( grid_conversion ) cudaError_t error = cudaGetLastError(); BOOST_CHECK_EQUAL(error,cudaSuccess); } + +BOOST_AUTO_TEST_CASE( grid_conversion2 ) +{ + MGrid3f g3(7,13,11); + MGrid1f g1(100); + + for(unsigned i = 0; i < 7; i++) + for(unsigned j = 0; j < 13; j++) + for(unsigned k = 0; k < 11; k++) { + g3[i][j][k] = i+j+k; + } + + for(unsigned i = 0; i < 100; i++) { + g1(i) = i; + } + + Grid3f cpu3(g3); + Grid1f cpu1 = g1.cpu(); + + float sum3 = thrust::reduce(thrust::host, cpu3.data(), cpu3.data()+cpu3.size()); + BOOST_CHECK_EQUAL(sum3,14014); + + float sum1 = thrust::reduce(thrust::host, cpu1.data(), cpu1.data()+cpu1.size()); + BOOST_CHECK_EQUAL(sum1,4950); + + MGrid6d g6(3,4,5,2,1,10); + g6[2][2][2][0][0][5] = 3.14; + Grid6d cpu6 = (Grid6d)g6; //cast conversion + BOOST_CHECK_EQUAL(cpu6.size(),1200); + BOOST_CHECK_EQUAL(cpu6(2,2,2,0,0,5), 3.14); + + Grid6dCUDA gpu6 = (Grid6dCUDA)g6; + double *cudaptr = gpu6.address(2,2,2,0,0,5); + double val = 0; + cudaMemcpy(&val, cudaptr, sizeof(double), cudaMemcpyDeviceToHost); + BOOST_CHECK_EQUAL(val, 3.14); + +} diff --git a/test/test_transform.cpp b/test/test_transform.cpp index d528684..bc63f64 100644 --- a/test/test_transform.cpp +++ b/test/test_transform.cpp @@ -7,9 +7,7 @@ #define BOOST_TEST_MODULE transform_test #include -#include -#include -#include +#include #include "libmolgrid/libmolgrid.h" 
#include "libmolgrid/transform.h"