diff --git a/sdk/python/sample/MLMD-Cases.ipynb b/sdk/python/sample/MLMD-Cases.ipynb
new file mode 100644
index 000000000..69ee6856a
--- /dev/null
+++ b/sdk/python/sample/MLMD-Cases.ipynb
@@ -0,0 +1,421 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Install the _Kubeflow-metadata_ library (Load prereqs)\n",
+    "_**Note:** Make sure you have run:_\n",
+    "\n",
+    "```bash\n",
+    "kubectl port-forward --namespace kubeflow $(kubectl get pod --namespace kubeflow --selector=\"component=grpc-server,kustomize.component=metadata\" --output jsonpath='{.items[0].metadata.name}') 8080:8080\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# To use the latest published `kubeflow-metadata` library, run:\n",
+    "!pip install kubeflow-metadata --user\n",
+    "# Install other packages:\n",
+    "!pip install pandas --user\n",
+    "# Then restart the Notebook kernel."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas\n",
+    "from kubeflow.metadata import metadata\n",
+    "from datetime import datetime\n",
+    "from uuid import uuid4\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Load all test cases\n",
+    "- All 5 columns\n",
+    "- 7 Columns worth of data\n",
+    "- Partials:\n",
+    "  - Active Execution\n",
+    "  - 3 columns\n",
+    "  - 4 columns\n",
+    "- Multiples:\n",
+    "  - Multi-Input\n",
+    "  - Multi-Output\n",
+    "  - Multi-Execution"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cases = [\n",
+    "    \"All 5 columns\",\n",
+    "    \"7 Columns worth of data\",\n",
+    "    \"Active Execution\",\n",
+    "    \"3 columns\",\n",
+    "    \"4 columns\"\n",
+    "]\n",
+    "\n",
+    "ws = [\n",
+    "    metadata.Workspace(\n",
+    "        store=metadata.Store(grpc_host=\"localhost\", grpc_port=8080),\n",
+    "        name=\"test_case_{}\".format(i),\n",
+    "        description=x,\n",
+    "        labels={\"n1\": \"v1\"}) for i, x in enumerate(cases)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "runs = [\n",
+    "    metadata.Run(\n",
+    "        workspace=w,\n",
+    "        name=\"run-\" + datetime.utcnow().isoformat(\"T\"),\n",
+    "        description=\"a run in ws_{}\".format(i),\n",
+    "    ) for i, w in enumerate(ws)\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "execs = [\n",
+    "    metadata.Execution(\n",
+    "        name=\"execution-\" + datetime.utcnow().isoformat(\"T\"),\n",
+    "        workspace=w,\n",
+    "        run=runs[i],\n",
+    "        description=cases[i],\n",
+    "    ) for i, w in enumerate(ws)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": "Created executions: [61, 62, 63, 64, 65]\n"
+    }
+   ],
+   "source": [
+    "print('Created executions:', list(map(lambda x: x.id, execs)))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Let's create fake data sources that can be shared across executions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": "Data sets: [\"{id: 95, version: 'data_set_version_980a618d-0b0f-431a-b58d-d31e6f792ba7'}\"]\nData sets: [\"{id: 95, version: 'data_set_version_980a618d-0b0f-431a-b58d-d31e6f792ba7'}\"]\nData sets: [\"{id: 95, version: 'data_set_version_980a618d-0b0f-431a-b58d-d31e6f792ba7'}\", \"{id: 96, version: 'data_set_version_7cbfcb57-101f-4a7b-9d37-72547e7314ca'}\"]\nData sets: [\"{id: 95, version: 'data_set_version_980a618d-0b0f-431a-b58d-d31e6f792ba7'}\"]\nData sets: [\"{id: 95, version: 'data_set_version_980a618d-0b0f-431a-b58d-d31e6f792ba7'}\", \"{id: 96, version: 'data_set_version_7cbfcb57-101f-4a7b-9d37-72547e7314ca'}\"]\n"
+    }
+   ],
[\"{id: 95, version: 'data_set_version_980a618d-0b0f-431a-b58d-d31e6f792ba7'}\", \"{id: 96, version: 'data_set_version_7cbfcb57-101f-4a7b-9d37-72547e7314ca'}\"]\nData sets: [\"{id: 95, version: 'data_set_version_980a618d-0b0f-431a-b58d-d31e6f792ba7'}\"]\nData sets: [\"{id: 95, version: 'data_set_version_980a618d-0b0f-431a-b58d-d31e6f792ba7'}\", \"{id: 96, version: 'data_set_version_7cbfcb57-101f-4a7b-9d37-72547e7314ca'}\"]\n" + } + ], + "source": [ + "get_date_set_version = lambda: \"data_set_version_\" + str(uuid4())\n", + "fileSources = [\n", + " metadata.DataSet(\n", + " description=\"Sample file set 1\",\n", + " name=\"table-dump\",\n", + " owner=\"ap@kubeflow.org\",\n", + " uri=\"file://datasets/dump1\",\n", + " version=get_date_set_version(),\n", + " query=\"SELECT * FROM mytable\"),\n", + " metadata.DataSet(\n", + " description=\"Sample file set 2\",\n", + " name=\"cloud-table\",\n", + " owner=\"ap@kubeflow.org\",\n", + " uri=\"gs://cloud/table.csv\",\n", + " version=get_date_set_version(),\n", + " query=\"SELECT * FROM mytable\"),\n", + "]\n", + "\n", + "how_many_sources = np.random.choice(len(fileSources), len(execs))\n", + "data_sets = []\n", + "\n", + "for i, src_count in enumerate(how_many_sources):\n", + " exec = execs[i]\n", + " ds = fileSources[0:src_count+1]\n", + " ds = list(map(lambda x: exec.log_input(x), ds))\n", + " print(\"Data sets:\", [\"{{id: {0.id}, version: '{0.version}'}}\".format(d) for d in ds])\n", + " data_sets.append(ds)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "outputs": [], + "source": [ + "### Log a model" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": "Models: [\"{id: 11, version: 'model_version_0'}\"]\nModels: [\"{id: 12, version: 'model_version_1'}\", \"{id: 30, version: 'model_version_1'}\"]\nModels: [\"{id: 13, version: 'model_version_2'}\"]\nModels: [\"{id: 15, version: 'model_version_3'}\", \"{id: 74, version: 'model_version_3'}\"]\nModels: [\"{id: 16, version: 'model_version_4'}\"]\n" + } + ], + "source": [ + "models = []\n", + "for i, exec in enumerate(execs):\n", + " model_version = \"model_version_{}\".format(i)\n", + " l = []\n", + " l.append(exec.log_output(\n", + " metadata.Model(\n", + " name=\"MNIST\",\n", + " description=\"model to recognize handwritten digits\",\n", + " owner=\"someone@kubeflow.org\",\n", + " uri=\"gcs://my-bucket/mnist\",\n", + " model_type=\"neural network\",\n", + " training_framework={\n", + " \"name\": \"tensorflow\",\n", + " \"version\": \"v1.0\"\n", + " },\n", + " hyperparameters={\n", + " \"learning_rate\": 0.5,\n", + " \"layers\": [10, 3, 1],\n", + " \"early_stop\": True\n", + " },\n", + " version=model_version,\n", + " labels={\"mylabel\": \"l1\"})))\n", + " if np.random.choice(2, 1, p=[.6, .4]) == 1:\n", + " l.append(exec.log_output(\n", + " metadata.Model(\n", + " name=\"SVHN\",\n", + " description=\"model to recognize house numbers on map images\",\n", + " owner=\"ap@kubeflow.org\",\n", + " uri=\"gcs://my-bucket/svhn\",\n", + " model_type=\"neural network\",\n", + " training_framework={\n", + " \"name\": \"pytorch\",\n", + " \"version\": \"v1.0\"\n", + " },\n", + " hyperparameters={\n", + " \"learning_rate\": 0.0001,\n", + " \"layers\": [10, 3, 1],\n", + " \"early_stop\": True\n", + " },\n", + " version=model_version,\n", + " labels={\"mylabel\": \"l1\"})))\n", + " print(\"Models:\", [\"{{id: {0.id}, version: '{0.version}'}}\".format(d) for d in l])\n", + " models.append(l)" + ] + 
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Log an evaluation of a model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": "Metrics created!\n"
+    }
+   ],
+   "source": [
+    "for i, exec in enumerate(execs):\n",
+    "    for model in models[i]:\n",
+    "        for data_set in data_sets[i]:\n",
+    "            metrics = exec.log_output(\n",
+    "                metadata.Metrics(\n",
+    "                    name=\"{}-evaluation\".format(model.name),\n",
+    "                    description=\"validating the {0.name} model to {0.description}\".format(model),\n",
+    "                    owner=model.owner,\n",
+    "                    uri=\"gcs://my-bucket/{}-eval.csv\".format(model.name.lower()),\n",
+    "                    data_set_id=str(data_set.id),\n",
+    "                    model_id=str(model.id),\n",
+    "                    metrics_type=metadata.Metrics.VALIDATION,\n",
+    "                    values={\"accuracy\": np.random.uniform(low=.6)},\n",
+    "                    labels={\"mylabel\": \"l1\"}))\n",
+    "print('Metrics created!')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Add Metadata for serving the model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for i, w in enumerate(ws):\n",
+    "    serving_application = None\n",
+    "    if i in [2, 3]:\n",
+    "        print('Retrain for', i)\n",
+    "        serving_application = metadata.Execution(\n",
+    "            name=\"Retrain step\",\n",
+    "            workspace=w,\n",
+    "            run=runs[i],\n",
+    "            description=\"retrain model to be more accurate on a scoped problem\",\n",
+    "        )\n",
+    "    else:\n",
+    "        serving_application = metadata.Execution(\n",
+    "            name=\"serving model\",\n",
+    "            workspace=w,\n",
+    "            description=\"an execution to represent model serving component\",\n",
+    "        )\n",
+    "    for model in models[i]:\n",
+    "        # Notice that we use the model name, version, and uri to uniquely identify an existing model.\n",
+    "        served_model = metadata.Model(\n",
+    "            name=model.name,\n",
+    "            uri=model.uri,\n",
+    "            version=model.version,\n",
+    "        )\n",
+    "        m = serving_application.log_input(served_model)\n",
+    "        if i in [2, 3]:\n",
+    "            print('Attaching new model', i)\n",
+    "            o_model = metadata.Model(\n",
+    "                name=\"Retrained MNIST\",\n",
+    "                description=\"better recognition of slanted digits\",\n",
+    "                owner=\"ap@kubeflow.org\",\n",
+    "                uri=\"gcs://my-bucket/mnist-slanted\",\n",
+    "                model_type=\"neural network\",\n",
+    "                training_framework={\n",
+    "                    \"name\": \"pytorch\",\n",
+    "                    \"version\": \"v1.0\"\n",
+    "                },\n",
+    "                hyperparameters={\n",
+    "                    \"learning_rate\": 0.01,\n",
+    "                    \"layers\": [5, 3, 1],\n",
+    "                    \"early_stop\": True\n",
+    "                },\n",
+    "                version=model.version,\n",
+    "                labels={\"mylabel\": \"l2\"}\n",
+    "            )\n",
+    "            serving_application.log_output(o_model)\n",
+    "            exec = metadata.Execution(\n",
+    "                name=\"serving model\",\n",
+    "                workspace=w,\n",
+    "                run=runs[i],\n",
+    "                description=\"an execution to represent model serving component\",\n",
+    "            )\n",
+    "            exec.log_input(o_model)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### List all models in the workspace"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": "<div>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>id</th>\n      <th>workspace</th>\n      <th>run</th>\n      <th>version</th>\n      <th>owner</th>\n      <th>description</th>\n      <th>name</th>\n      <th>model_type</th>\n      <th>create_time</th>\n      <th>uri</th>\n      <th>training_framework</th>\n      <th>hyperparameters</th>\n      <th>labels</th>\n      <th>kwargs</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>13</td>\n      <td>test_case_2</td>\n      <td>run-2019-12-13T23:05:35.516946</td>\n      <td>model_version_2</td>\n      <td>someone@kubeflow.org</td>\n      <td>model to recognize handwritten digits</td>\n      <td>MNIST</td>\n      <td>neural network</td>\n      <td>2019-12-13T23:05:38.064051Z</td>\n      <td>gcs://my-bucket/mnist</td>\n      <td>{'name': 'tensorflow', 'version': 'v1.0'}</td>\n      <td>{'learning_rate': 0.5, 'layers': [10, 3, 1], '...</td>\n      <td>{'mylabel': 'l1'}</td>\n      <td>{}</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>14</td>\n      <td>test_case_2</td>\n      <td>run-2019-12-13T23:05:35.516946</td>\n      <td>model_version_2</td>\n      <td>ap@kubeflow.org</td>\n      <td>model to recognize house numbers on map images</td>\n      <td>SVHN</td>\n      <td>neural network</td>\n      <td>2019-12-13T23:05:38.641097Z</td>\n      <td>gcs://my-bucket/svhn</td>\n      <td>{'name': 'pytorch', 'version': 'v1.0'}</td>\n      <td>{'learning_rate': 0.0001, 'layers': [10, 3, 1]...</td>\n      <td>{'mylabel': 'l1'}</td>\n      <td>{}</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
", + "text/plain": " id workspace run version \\\n0 13 test_case_2 run-2019-12-13T23:05:35.516946 model_version_2 \n1 14 test_case_2 run-2019-12-13T23:05:35.516946 model_version_2 \n\n owner description \\\n0 someone@kubeflow.org model to recognize handwritten digits \n1 ap@kubeflow.org model to recognize house numbers on map images \n\n name model_type create_time uri \\\n0 MNIST neural network 2019-12-13T23:05:38.064051Z gcs://my-bucket/mnist \n1 SVHN neural network 2019-12-13T23:05:38.641097Z gcs://my-bucket/svhn \n\n training_framework \\\n0 {'name': 'tensorflow', 'version': 'v1.0'} \n1 {'name': 'pytorch', 'version': 'v1.0'} \n\n hyperparameters labels kwargs \n0 {'learning_rate': 0.5, 'layers': [10, 3, 1], '... {'mylabel': 'l1'} {} \n1 {'learning_rate': 0.0001, 'layers': [10, 3, 1]... {'mylabel': 'l1'} {} " + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pandas.DataFrame.from_dict(ws[2].list(metadata.Model.ARTIFACT_TYPE_NAME))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "outputs": [], + "source": [ + "### Basic Lineage Tracking" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "outputs": [], + "source": [ + "# print(\"model id is %s\\n\" % model.id)\n", + " \n", + "# model_events = ws1.store.get_events_by_artifact_ids([model.id])\n", + "\n", + "# execution_ids = set(e.execution_id for e in model_events)\n", + "# print(\"All executions related to the model are {}\".format(execution_ids))\n", + "# # assert execution_ids == set([serving_application.id, exec.id])\n", + "\n", + "# trainer_events = ws1.store.get_events_by_execution_ids([exec.id])\n", + "# artifact_ids = set(e.artifact_id for e in trainer_events)\n", + "# print(\"All artifacts related to the training event are {}\".format(artifact_ids))# assert artifact_ids == set([model.id, metrics.id, data_set.id])" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file