From 0f721a82bcd51996ff38a51e21e4ad758d027609 Mon Sep 17 00:00:00 2001
From: Andrew Brown <andrew.brown@intel.com>
Date: Fri, 19 Jul 2024 21:16:37 -0700
Subject: [PATCH 1/2] Replace `load` with `load-by-name`

This change removes the `load` function, which generated a `graph` using
some opaque bytes, a `graph-encoding` enum, and an `execution-target`.
This mechanism allowed WebAssembly guest code (i.e., running within the
WebAssembly sandbox) to control _when_ a model is loaded, but by doing
so, exposed details that users will likely not need. In FaaS use cases,
for example, user code simply does not have the time to retrieve and
load a model for every HTTP request.

This PR proposes instead that users _always_ set up models outside the
sandbox and then load them inside it by a host-specified name. This is a
proposal intended for discussion, not a foregone conclusion, so please
provide feedback! If you have a use case that relies directly on users
being able to load models via buffers, that would undermine the
assumption of this PR (that no one will use wasi-nn in this way).

But consider the downsides of the current approach: wasi-nn must keep
track of an ever-growing list of graph encodings and users must somehow
"see through" wasi-nn to set up the model buffers. Switching to
`load-by-name`--now called `load`--would resolve these issues, moving
any model and framework details into the host configuration, where they
already exist anyway.
---
 wit/wasi-nn.wit | 43 ++++++-------------------------------------
 1 file changed, 6 insertions(+), 37 deletions(-)
diff --git a/wit/wasi-nn.wit b/wit/wasi-nn.wit
index 872e8cd..2555a46 100644
--- a/wit/wasi-nn.wit
+++ b/wit/wasi-nn.wit
@@ -62,48 +62,19 @@ interface tensor {
 /// framework (e.g., TensorFlow):
 interface graph {
     use errors.{error};
-    use tensor.{tensor};
     use inference.{graph-execution-context};
 
-    /// An execution graph for performing inference (i.e., a model).
-    resource graph {
-        init-execution-context: func() -> result<graph-execution-context, error>;
-    }
-
-    /// Describes the encoding of the graph. This allows the API to be implemented by various
-    /// backends that encode (i.e., serialize) their graph IR with different formats.
-    enum graph-encoding {
-        openvino,
-        onnx,
-        tensorflow,
-        pytorch,
-        tensorflowlite,
-        ggml,
-        autodetect,
-    }
-
-    /// Define where the graph should be executed.
-    enum execution-target {
-        cpu,
-        gpu,
-        tpu
-    }
-
-    /// The graph initialization data.
-    ///
-    /// This gets bundled up into an array of buffers because implementing backends may encode their
-    /// graph IR in parts (e.g., OpenVINO stores its IR and weights separately).
-    type graph-builder = list<u8>;
-
-    /// Load a `graph` from an opaque sequence of bytes to use for inference.
-    load: func(builder: list<graph-builder>, encoding: graph-encoding, target: execution-target) -> result<graph, error>;
-
     /// Load a `graph` by name.
     ///
     /// How the host expects the names to be passed and how it stores the graphs for retrieval via
     /// this function is **implementation-specific**. This allows hosts to choose name schemes that
     /// range from simple to complex (e.g., URLs?) and caching mechanisms of various kinds.
-    load-by-name: func(name: string) -> result<graph, error>;
+    load: func(name: string) -> result<graph, error>;
+
+    /// An execution graph for performing inference (i.e., a model).
+    resource graph {
+        init-execution-context: func() -> result<graph-execution-context, error>;
+    }
 }
 
 /// An inference "session" is encapsulated by a `graph-execution-context`. This structure binds a
@@ -137,8 +108,6 @@ interface errors {
     enum error-code {
         // Caller module passed an invalid argument.
         invalid-argument,
-        // Invalid encoding.
-        invalid-encoding,
         // The operation timed out.
         timeout,
         // Runtime Error.

From 2fc6261be785cadd4e83dec66f57cffba5fc98f0 Mon Sep 17 00:00:00 2001
From: Andrew Brown <andrew.brown@intel.com>
Date: Fri, 19 Jul 2024 21:29:23 -0700
Subject: [PATCH 2/2] Fixup `ml.md`

---
 ml.md | 64 +++++++++--------------------------------------------------
 1 file changed, 9 insertions(+), 55 deletions(-)

diff --git a/ml.md b/ml.md
index bce656f..7100772 100644
--- a/ml.md
+++ b/ml.md
@@ -102,10 +102,6 @@ containing a single value, use <code>[1]</code> for the tensor dimensions.</p>
 <p>Caller module passed an invalid argument.
 </li>
 <li>
-<p><a name="error_code.invalid_encoding"></a><code>invalid-encoding</code></p>
-<p>Invalid encoding.
-</li>
-<li>
 <p><a name="error_code.timeout"></a><code>timeout</code></p>
 <p>The operation timed out.
 </li>
@@ -231,73 +227,31 @@ framework (e.g., TensorFlow):</p>
 <h4><a name="error"></a><code>type error</code></h4>
 <p><a href="#error"><a href="#error"><code>error</code></a></a></p>
 <p>
-#### <a name="tensor"></a>`type tensor`
-[`tensor`](#tensor)
-<p>
 #### <a name="graph_execution_context"></a>`type graph-execution-context`
 [`graph-execution-context`](#graph_execution_context)
 <p>
 #### <a name="graph"></a>`resource graph`
-<p>An execution graph for performing inference (i.e., a model).</p>
-<h4><a name="graph_encoding"></a><code>enum graph-encoding</code></h4>
-<p>Describes the encoding of the graph. This allows the API to be implemented by various
-backends that encode (i.e., serialize) their graph IR with different formats.</p>
-<h5>Enum Cases</h5>
-<ul>
-<li><a name="graph_encoding.openvino"></a><code>openvino</code></li>
-<li><a name="graph_encoding.onnx"></a><code>onnx</code></li>
-<li><a name="graph_encoding.tensorflow"></a><code>tensorflow</code></li>
-<li><a name="graph_encoding.pytorch"></a><code>pytorch</code></li>
-<li><a name="graph_encoding.tensorflowlite"></a><code>tensorflowlite</code></li>
-<li><a name="graph_encoding.ggml"></a><code>ggml</code></li>
-<li><a name="graph_encoding.autodetect"></a><code>autodetect</code></li>
-</ul>
-<h4><a name="execution_target"></a><code>enum execution-target</code></h4>
-<p>Define where the graph should be executed.</p>
-<h5>Enum Cases</h5>
-<ul>
-<li><a name="execution_target.cpu"></a><code>cpu</code></li>
-<li><a name="execution_target.gpu"></a><code>gpu</code></li>
-<li><a name="execution_target.tpu"></a><code>tpu</code></li>
-</ul>
-<h4><a name="graph_builder"></a><code>type graph-builder</code></h4>
-<p><a href="#graph_builder"><a href="#graph_builder"><code>graph-builder</code></a></a></p>
-<p>The graph initialization data.
-<p>This gets bundled up into an array of buffers because implementing backends may encode their
-graph IR in parts (e.g., OpenVINO stores its IR and weights separately).</p>
-<hr />
+<p>An execution graph for performing inference (i.e., a model).</p>
 <h3>Functions</h3>
-<h4><a name="method_graph_init_execution_context"></a><code>[method]graph.init-execution-context: func</code></h4>
-<h5>Params</h5>
-<ul>
-<li><a name="method_graph_init_execution_context.self"></a><code>self</code>: borrow&lt;<a href="#graph"><a href="#graph"><code>graph</code></a></a>&gt;</li>
-</ul>
-<h5>Return values</h5>
-<ul>
-<li><a name="method_graph_init_execution_context.0"></a> result&lt;own&lt;<a href="#graph_execution_context"><a href="#graph_execution_context"><code>graph-execution-context</code></a></a>&gt;, own&lt;<a href="#error"><a href="#error"><code>error</code></a></a>&gt;&gt;</li>
-</ul>
 <h4><a name="load"></a><code>load: func</code></h4>
-<p>Load a <a href="#graph"><code>graph</code></a> from an opaque sequence of bytes to use for inference.</p>
+<p>Load a <a href="#graph"><code>graph</code></a> by name.</p>
+<p>How the host expects the names to be passed and how it stores the graphs for retrieval via
+this function is <strong>implementation-specific</strong>. This allows hosts to choose name schemes that
+range from simple to complex (e.g., URLs?) and caching mechanisms of various kinds.</p>
 <h5>Params</h5>
 <ul>
-<li><a name="load.builder"></a><code>builder</code>: list&lt;<a href="#graph_builder"><a href="#graph_builder"><code>graph-builder</code></a></a>&gt;</li>
-<li><a name="load.encoding"></a><code>encoding</code>: <a href="#graph_encoding"><a href="#graph_encoding"><code>graph-encoding</code></a></a></li>
-<li><a name="load.target"></a><code>target</code>: <a href="#execution_target"><a href="#execution_target"><code>execution-target</code></a></a></li>
+<li><a name="load.name"></a><code>name</code>: <code>string</code></li>
 </ul>
 <h5>Return values</h5>
 <ul>
 <li><a name="load.0"></a> result&lt;own&lt;<a href="#graph"><a href="#graph"><code>graph</code></a></a>&gt;, own&lt;<a href="#error"><a href="#error"><code>error</code></a></a>&gt;&gt;</li>
 </ul>
-<h4><a name="load_by_name"></a><code>load-by-name: func</code></h4>
-<p>Load a <a href="#graph"><code>graph</code></a> by name.</p>
-<p>How the host expects the names to be passed and how it stores the graphs for retrieval via
-this function is <strong>implementation-specific</strong>. This allows hosts to choose name schemes that
-range from simple to complex (e.g., URLs?) and caching mechanisms of various kinds.</p>
+<h4><a name="method_graph_init_execution_context"></a><code>[method]graph.init-execution-context: func</code></h4>
 <h5>Params</h5>
 <ul>
-<li><a name="load_by_name.name"></a><code>name</code>: <code>string</code></li>
+<li><a name="method_graph_init_execution_context.self"></a><code>self</code>: borrow&lt;<a href="#graph"><a href="#graph"><code>graph</code></a></a>&gt;</li>
 </ul>
 <h5>Return values</h5>
 <ul>
-<li><a name="load_by_name.0"></a> result&lt;own&lt;<a href="#graph"><a href="#graph"><code>graph</code></a></a>&gt;, own&lt;<a href="#error"><a href="#error"><code>error</code></a></a>&gt;&gt;</li>
+<li><a name="method_graph_init_execution_context.0"></a> result&lt;own&lt;<a href="#graph_execution_context"><a href="#graph_execution_context"><code>graph-execution-context</code></a></a>&gt;, own&lt;<a href="#error"><a href="#error"><code>error</code></a></a>&gt;&gt;</li>
 </ul>