timaeus-research
diff --git a/README.md b/README.md
Lines changed: 2 additions & 5 deletions
diff --git a/docs/index.md b/docs/index.md
diff --git a/docs/tutorial.md b/docs/tutorial.md
diff --git a/examples/diagnostics.ipynb b/examples/diagnostics.ipynb
Lines changed: 391 additions & 272 deletions
diff --git a/examples/dlns.ipynb b/examples/dlns.ipynb
Lines changed: 28 additions & 17 deletions
@@ -22,9 +22,10 @@ DevInterp is a python library for conducting research on developmental interpret
 
 from devinterp.slt import sample, LLCEstimator
 from devinterp.optim import SGLD
+from devinterp.utils import optimal_temperature
 
 # Assuming you have a PyTorch Module and DataLoader
-llc_estimator = LLCEstimator(...)
+llc_estimator = LLCEstimator(..., temperature=optimal_temperature(trainloader))
 sample(model, trainloader, ..., callbacks = [llc_estimator])
 
 llc_mean = llc_estimator.sample()["llc/mean"]
@@ -46,10 +47,6 @@ For papers that either inspired or used the DevInterp package, [click here](http
 
 ## Known Issues
 
-- We currently calculate the LLC taking the initial loss to be the loss after one sampling step. This is slightly wrong (it should be the loss before sampling), and there are a bunch of other reasonable and similarly compute-friendly alternative choices that can be made.
-
-- Similarly, we now sample using minibatches that are passed along from the dataloader to sample(). This choice is obscured by the repo, and we should offer alternatives.
-
 - The current implementation does not work with transformers out of the box. This can be fixed by adding a wrapper to your model, for example by passing `Unpack(model)` to `sample()`, where `Unpack` is defined by:
 ```python
 class Unpack(nn.Module):
 
@@ -148,7 +148,6 @@
             "from devinterp.optim.sgld import SGLD\n",
             "\n",
             "\n",
-            "\n",
             "class DLN(nn.Module):\n",
             "    \"\"\"\n",
             "    A deep linear network with `L` layers with dimensions `dims`.\n",
@@ -180,7 +179,9 @@
             "        return f\"DLN({self.dims})\"\n",
             "\n",
             "    @classmethod\n",
-            "    def make_rectangular(cls, input_dim: int, output_dim: int, L: int, w: int, gamma: float):\n",
+            "    def make_rectangular(\n",
+            "        cls, input_dim: int, output_dim: int, L: int, w: int, gamma: float\n",
+            "    ):\n",
             "        \"\"\"\n",
             "        Make a rectangular DLN with `L` layers and constant hidden width `w`.\n",
             "\n",
@@ -189,7 +190,9 @@
             "        The weights are initialized from a normal distribution with variance`w ** (-gamma)`.\n",
             "        \"\"\"\n",
             "        init_variance = w ** (-gamma)\n",
-            "        return cls([input_dim] + [w] * (L - 1) + [output_dim], init_variance=init_variance)\n",
+            "        return cls(\n",
+            "            [input_dim] + [w] * (L - 1) + [output_dim], init_variance=init_variance\n",
+            "        )\n",
             "\n",
             "    def to_matrix(self):\n",
             "        \"\"\"Return the collapsed matrix representation of the DLN.\"\"\"\n",
@@ -212,7 +215,10 @@
             "\n",
             "    def ranks(self, **kwargs):\n",
             "        \"\"\"Return the ranks of the individual layers of the DLN.\"\"\"\n",
-            "        return [torch.linalg.matrix_rank(l.weight.data.to(\"cpu\"), **kwargs) for l in self.linears]\n",
+            "        return [\n",
+            "            torch.linalg.matrix_rank(l.weight.data.to(\"cpu\"), **kwargs)\n",
+            "            for l in self.linears\n",
+            "        ]\n",
             "\n",
             "    def norm(self, p: Union[int, float, str] = 2):\n",
             "        \"\"\"Return the nuclear norm of the DLN.\"\"\"\n",
@@ -247,6 +253,7 @@
             "    def device(self):\n",
             "        return next(self.parameters()).device\n",
             "\n",
+            "\n",
             "class DLNDataset(Dataset):\n",
             "    teacher: DLN\n",
             "\n",
@@ -323,11 +330,11 @@
             "\n",
             "DEVICE = os.environ.get(\n",
             "    \"DEVICE\",\n",
-            "    \"cuda:0\"\n",
-            "    if torch.cuda.is_available()\n",
-            "    else \"mps\"\n",
-            "    if torch.backends.mps.is_available()\n",
-            "    else \"cpu\",\n",
+            "    (\n",
+            "        \"cuda:0\"\n",
+            "        if torch.cuda.is_available()\n",
+            "        else \"mps\" if torch.backends.mps.is_available() else \"cpu\"\n",
+            "    ),\n",
             ")\n",
             "DEVICE = torch.device(DEVICE)\n",
             "NUM_CORES = int(os.environ.get(\"NUM_CORES\", 1))\n",
@@ -477,9 +484,7 @@
             "\n",
             "    def eval_rlct(model: DLN):\n",
             "        model.to(\"cpu\")\n",
-            "        optimizer_kwargs = dict(\n",
-            "            lr=1e-4, temperature=\"adaptive\", num_samples=len(trainset), elasticity=1.0\n",
-            "        )\n",
+            "        optimizer_kwargs = dict(lr=1e-4, localization=1.0)\n",
             "        optimizer_kwargs.update(kwargs)\n",
             "        rlct = estimate_learning_coeff(\n",
             "            model,\n",
@@ -658,7 +663,9 @@
             "\n",
             "    # Train error\n",
             "    ax.plot(df.step, df[\"mse/test\"], label=\"Test error\", color=PRIMARY)\n",
-            "    ax.plot(df.step, df[\"mse/train\"], label=\"Train error\", color=PRIMARY_LIGHT, alpha=0.5)\n",
+            "    ax.plot(\n",
+            "        df.step, df[\"mse/train\"], label=\"Train error\", color=PRIMARY_LIGHT, alpha=0.5\n",
+            "    )\n",
             "    ax.set_yscale(\"log\")\n",
             "    ax.set_ylabel(\"MSE\", color=PRIMARY)\n",
             "    ax.tick_params(axis=\"y\", labelcolor=PRIMARY)\n",
@@ -952,7 +959,7 @@
             "        seed=seed,\n",
             "    )\n",
             "    learner = config.create_learner(\n",
-            "        num_draws=10, num_chains=100, lr=1e-4, elasticity=1.0, repeats=5\n",
+            "        num_draws=10, num_chains=100, lr=1e-4, localization=1.0, repeats=5\n",
             "    )\n",
             "    df = train(learner)\n",
             "    dfs.append(df)\n",
@@ -1698,7 +1705,9 @@
             "    for noise_level in [0.0, 10.0]:\n",
             "        name = f\"rk{rk}_L4_w100_noise{noise_level}\"\n",
             "        results[name] = run_experiment(rk5_matrix, seed=SEED, **default_settings)\n",
-            "        plot_all(results[name], xlog=False, title=f\"r={rk}, L=4, w=100, noise={noise_level}\")\n",
+            "        plot_all(\n",
+            "            results[name], xlog=False, title=f\"r={rk}, L=4, w=100, noise={noise_level}\"\n",
+            "        )\n",
             "\n",
             "df = None\n",
             "\n",
@@ -2084,7 +2093,9 @@
             "for gamma in [0.75, 1.0, 1.5]:\n",
             "    # for w in [10, 100, 1000]:\n",
             "    for w in [10, 100]:\n",
-            "        results = run_experiment(rk5_matrix, seed=SEED, w=w, gamma=gamma, **fig5_settings)\n",
+            "        results = run_experiment(\n",
+            "            rk5_matrix, seed=SEED, w=w, gamma=gamma, **fig5_settings\n",
+            "        )\n",
             "        _df = pd.DataFrame(results)\n",
             "        _df[\"w\"] = w\n",
             "        _df[\"gamma\"] = gamma\n",
@@ -2498,7 +2509,7 @@
          "name": "python",
          "nbconvert_exporter": "python",
          "pygments_lexer": "ipython3",
-         "version": "3.8.10"
+         "version": "3.9.18"
       }
    },
    "nbformat": 4,