diff --git a/sdk/python/sample/MLMD-Cases.ipynb b/sdk/python/sample/MLMD-Cases.ipynb new file mode 100644 index 000000000..69ee6856a --- /dev/null +++ b/sdk/python/sample/MLMD-Cases.ipynb @@ -0,0 +1,426 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "outputs": [], + "source": [ + "### Install the _Kubeflow-metadata_ library (Load prereqs)\n", + "_**Note:** Make sure to have run:_\n", + "\n", + "```bash\n", + "kubectl port-forward --namespace kubeflow $(kubectl get pod --namespace kubeflow --selector=\"component=grpc-server,kustomize.component=metadata\" --output jsonpath='{.items[0].metadata.name}') 8080:8080\n", + "```" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "outputs": [], + "source": [ + "# To use the latest published `kubeflow-metadata` library, you can run:\n", + "!pip install kubeflow-metadata --user\n", + "# Install other packages:\n", + "!pip install pandas --user\n", + "# Then restart the Notebook kernel." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas\n", + "from kubeflow.metadata import metadata\n", + "from datetime import datetime\n", + "from uuid import uuid4\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "outputs": [], + "source": [ + "### Load all test cases\n", + "- All 5 columns\n", + "- 7 Columns worth of data\n", + "- Partials:\n", + " - Active Execution\n", + " - 3 columns\n", + " - 4 columns\n", + "- Multiples\n", + " - Multi-Input\n", + " - Multi-Output\n", + " - Multi-Execution" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "cases = [\n", + " \"All 5 columns\",\n", + " \"7 Columns worth of data\",\n", + " \"Active Execution\",\n", + " \"3 columns\",\n", + " \"4 columns\"\n", + "]\n", + "\n", + "ws = [\n", + " metadata.Workspace(\n", + " store=metadata.Store(grpc_host=\"localhost\", grpc_port=8080),\n", + " 
name=\"test_case_{}\".format(i),\n", + " description=x,\n", + " labels={\"n1\": \"v1\"}) for i, x in enumerate(cases)]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "runs = [\n", + " metadata.Run(\n", + " workspace=w,\n", + " name=\"run-\" + datetime.utcnow().isoformat(\"T\") ,\n", + " description=\"a run in ws_{}\".format(i),\n", + " ) for i, w in enumerate(ws)\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "execs = [\n", + " metadata.Execution(\n", + " name = \"execution-\" + datetime.utcnow().isoformat(\"T\") ,\n", + " workspace=w,\n", + " run=runs[i],\n", + " description=cases[i],\n", + " ) for i, w in enumerate(ws)]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": "Created executions: [61, 62, 63, 64, 65]\n" + } + ], + "source": [ + "print('Created executions:', list(map(lambda x: x.id, execs)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "outputs": [], + "source": [ + "### Let's create fake data sources that can be shared by our executions" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": "Data sets: [\"{id: 95, version: 'data_set_version_980a618d-0b0f-431a-b58d-d31e6f792ba7'}\"]\nData sets: [\"{id: 95, version: 'data_set_version_980a618d-0b0f-431a-b58d-d31e6f792ba7'}\"]\nData sets: [\"{id: 95, version: 'data_set_version_980a618d-0b0f-431a-b58d-d31e6f792ba7'}\", \"{id: 96, version: 'data_set_version_7cbfcb57-101f-4a7b-9d37-72547e7314ca'}\"]\nData sets: [\"{id: 95, version: 'data_set_version_980a618d-0b0f-431a-b58d-d31e6f792ba7'}\"]\nData sets: [\"{id: 95, version: 'data_set_version_980a618d-0b0f-431a-b58d-d31e6f792ba7'}\", \"{id: 96, version: 
'data_set_version_7cbfcb57-101f-4a7b-9d37-72547e7314ca'}\"]\n" + } + ], + "source": [ + "get_date_set_version = lambda: \"data_set_version_\" + str(uuid4())\n", + "fileSources = [\n", + " metadata.DataSet(\n", + " description=\"Sample file set 1\",\n", + " name=\"table-dump\",\n", + " owner=\"ap@kubeflow.org\",\n", + " uri=\"file://datasets/dump1\",\n", + " version=get_date_set_version(),\n", + " query=\"SELECT * FROM mytable\"),\n", + " metadata.DataSet(\n", + " description=\"Sample file set 2\",\n", + " name=\"cloud-table\",\n", + " owner=\"ap@kubeflow.org\",\n", + " uri=\"gs://cloud/table.csv\",\n", + " version=get_date_set_version(),\n", + " query=\"SELECT * FROM mytable\"),\n", + "]\n", + "\n", + "how_many_sources = np.random.choice(len(fileSources), len(execs))\n", + "data_sets = []\n", + "\n", + "for i, src_count in enumerate(how_many_sources):\n", + " exec = execs[i]\n", + " ds = fileSources[0:src_count+1]\n", + " ds = list(map(lambda x: exec.log_input(x), ds))\n", + " print(\"Data sets:\", [\"{{id: {0.id}, version: '{0.version}'}}\".format(d) for d in ds])\n", + " data_sets.append(ds)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "outputs": [], + "source": [ + "### Log a model" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": "Models: [\"{id: 11, version: 'model_version_0'}\"]\nModels: [\"{id: 12, version: 'model_version_1'}\", \"{id: 30, version: 'model_version_1'}\"]\nModels: [\"{id: 13, version: 'model_version_2'}\"]\nModels: [\"{id: 15, version: 'model_version_3'}\", \"{id: 74, version: 'model_version_3'}\"]\nModels: [\"{id: 16, version: 'model_version_4'}\"]\n" + } + ], + "source": [ + "models = []\n", + "for i, exec in enumerate(execs):\n", + " model_version = \"model_version_{}\".format(i)\n", + " l = []\n", + " l.append(exec.log_output(\n", + " metadata.Model(\n", + " name=\"MNIST\",\n", + " description=\"model to recognize 
handwritten digits\",\n", + " owner=\"someone@kubeflow.org\",\n", + " uri=\"gcs://my-bucket/mnist\",\n", + " model_type=\"neural network\",\n", + " training_framework={\n", + " \"name\": \"tensorflow\",\n", + " \"version\": \"v1.0\"\n", + " },\n", + " hyperparameters={\n", + " \"learning_rate\": 0.5,\n", + " \"layers\": [10, 3, 1],\n", + " \"early_stop\": True\n", + " },\n", + " version=model_version,\n", + " labels={\"mylabel\": \"l1\"})))\n", + " if np.random.choice(2, 1, p=[.6, .4]) == 1:\n", + " l.append(exec.log_output(\n", + " metadata.Model(\n", + " name=\"SVHN\",\n", + " description=\"model to recognize house numbers on map images\",\n", + " owner=\"ap@kubeflow.org\",\n", + " uri=\"gcs://my-bucket/svhn\",\n", + " model_type=\"neural network\",\n", + " training_framework={\n", + " \"name\": \"pytorch\",\n", + " \"version\": \"v1.0\"\n", + " },\n", + " hyperparameters={\n", + " \"learning_rate\": 0.0001,\n", + " \"layers\": [10, 3, 1],\n", + " \"early_stop\": True\n", + " },\n", + " version=model_version,\n", + " labels={\"mylabel\": \"l1\"})))\n", + " print(\"Models:\", [\"{{id: {0.id}, version: '{0.version}'}}\".format(d) for d in l])\n", + " models.append(l)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "outputs": [], + "source": [ + "### Log an evaluation of a model" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": "Metrics created!\n" + } + ], + "source": [ + "for i, exec in enumerate(execs):\n", + " for model in models[i]:\n", + " for data_set in data_sets[i]:\n", + " metrics = exec.log_output(\n", + " metadata.Metrics(\n", + " name=\"{}-evaluation\".format(model.name),\n", + " description=\"validating the {0.name} model to {0.description}\".format(model),\n", + " owner=model.owner,\n", + " uri=\"gcs://my-bucket/{}-eval.csv\".format(model.name.lower()),\n", + " data_set_id=str(data_set.id),\n", + " model_id=str(model.id),\n", + " 
metrics_type=metadata.Metrics.VALIDATION,\n", + " values={\"accuracy\": np.random.uniform(low=.6)},\n", + " labels={\"mylabel\": \"l1\"}))\n", + "print('Metrics created!')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "outputs": [], + "source": [ + "### Add Metadata for serving the model" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "for i, w in enumerate(ws):\n", + " serving_application = None\n", + " if i in [2,3]:\n", + " print('Retrain for', i)\n", + " serving_application = metadata.Execution(\n", + " name=\"Retrain step\",\n", + " workspace=w,\n", + " run=runs[i],\n", + " description=\"retrain model to be more accurate on a scoped problem\",\n", + " )\n", + " else:\n", + " serving_application = metadata.Execution(\n", + " name=\"serving model\",\n", + " workspace=w,\n", + " description=\"an execution to represent model serving component\",\n", + " )\n", + " for model in models[i]:\n", + " # Note: the model name, version, and uri are used to uniquely identify an existing model.\n", + " served_model = metadata.Model(\n", + " name=\"MNIST\",\n", + " uri=\"gcs://my-bucket/mnist\",\n", + " version=model.version,\n", + " )\n", + " m=serving_application.log_input(served_model)\n", + " if i in [2,3]:\n", + " print('Attaching new model', i)\n", + " o_model = metadata.Model(\n", + " name=\"Retrained MNIST\",\n", + " description=\"better recognition of slanted digits\",\n", + " owner=\"ap@kubeflow.org\",\n", + " uri=\"gcs://my-bucket/mnist-slanted\",\n", + " model_type=\"neural network\",\n", + " training_framework={\n", + " \"name\": \"pytorch\",\n", + " \"version\": \"v1.0\"\n", + " },\n", + " hyperparameters={\n", + " \"learning_rate\": 0.01,\n", + " \"layers\": [5, 3, 1],\n", + " \"early_stop\": True\n", + " },\n", + " version=model_version,\n", + " labels={\"mylabel\": \"l2\"}\n", + " )\n", + " serving_application.log_output(o_model)\n", + " exec = metadata.Execution(\n", + " name=\"serving 
model\",\n", + " workspace=w,\n", + " run=runs[i],\n", + " description=\"an execution to represent model serving component\",\n", + " )\n", + " exec.log_input(o_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "outputs": [], + "source": [ + "### List all models in the workspace" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": "
| \n | id | \nworkspace | \nrun | \nversion | \nowner | \ndescription | \nname | \nmodel_type | \ncreate_time | \nuri | \ntraining_framework | \nhyperparameters | \nlabels | \nkwargs | \n
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n13 | \ntest_case_2 | \nrun-2019-12-13T23:05:35.516946 | \nmodel_version_2 | \nsomeone@kubeflow.org | \nmodel to recognize handwritten digits | \nMNIST | \nneural network | \n2019-12-13T23:05:38.064051Z | \ngcs://my-bucket/mnist | \n{'name': 'tensorflow', 'version': 'v1.0'} | \n{'learning_rate': 0.5, 'layers': [10, 3, 1], '... | \n{'mylabel': 'l1'} | \n{} | \n
| 1 | \n14 | \ntest_case_2 | \nrun-2019-12-13T23:05:35.516946 | \nmodel_version_2 | \nap@kubeflow.org | \nmodel to recognize house numbers on map images | \nSVHN | \nneural network | \n2019-12-13T23:05:38.641097Z | \ngcs://my-bucket/svhn | \n{'name': 'pytorch', 'version': 'v1.0'} | \n{'learning_rate': 0.0001, 'layers': [10, 3, 1]... | \n{'mylabel': 'l1'} | \n{} | \n