diff --git a/_src/notebooks.html b/_src/notebooks.html
new file mode 100644
index 000000000..e14da4382
--- /dev/null
+++ b/_src/notebooks.html
@@ -0,0 +1,13358 @@
+---
+layout: page
+title: Notebooks
+description: Apache SystemML Notebooks
+group: nav-right
+---
+
+ {{ site.data.project.name }} Notebooks
+
+#@title Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+The Feature Engineering Component of TensorFlow Extended (TFX)
+This colab notebook provides a very simple example of how TensorFlow Transform (tf.Transform) can be used to preprocess data using exactly the same code for both training a model and serving inferences in production.
+TensorFlow Transform is a library for preprocessing input data for TensorFlow, including creating features that require a full pass over the training dataset. For example, using TensorFlow Transform you could:
+
+- Normalize an input value by using the mean and standard deviation
+- Convert strings to integers by generating a vocabulary over all of the input values
+- Convert floats to integers by assigning them to buckets, based on the observed data distribution
+TensorFlow has built-in support for manipulations on a single example or a batch of examples. tf.Transform extends these capabilities to support full passes over the entire training dataset.
+The output of tf.Transform is exported as a TensorFlow graph which you can use for both training and serving. Using the same graph for both training and serving can prevent skew, since the same transformations are applied in both stages.
First, we'll make sure that we're using Python 3. Then, we'll go ahead and install and import the stuff we need.
+import sys
+
+# Confirm that we're using Python 3
+assert sys.version_info.major == 3, 'Oops, not running Python 3. Use Runtime > Change runtime type'
+import argparse
+import os
+import pprint
+import tempfile
+import urllib.request
+import zipfile
+
+print("Installing dependencies for Colab environment")
+!pip install -Uq grpcio==1.26.0
+
+import tensorflow as tf
+
+print('Installing Apache Beam')
+!pip install -Uq apache_beam==2.16.0
+import apache_beam as beam
+
+print('Installing TensorFlow Transform')
+!pip install -Uq tensorflow-transform==0.15.0
+import tensorflow_transform as tft
+
+import apache_beam.io.iobase
+import tensorflow_transform.beam as tft_beam
+from tensorflow_transform.tf_metadata import dataset_metadata
+from tensorflow_transform.tf_metadata import dataset_schema
+We'll create some simple dummy data for our example:
+- raw_data is the initial raw data that we're going to preprocess
+- raw_data_metadata contains the schema that tells us the types of each of the columns in raw_data. In this case, it's very simple.
+
+raw_data = [
+ {'x': 1, 'y': 1, 's': 'hello'},
+ {'x': 2, 'y': 2, 's': 'world'},
+ {'x': 3, 'y': 3, 's': 'hello'}
+ ]
+
+raw_data_metadata = dataset_metadata.DatasetMetadata(
+ dataset_schema.from_feature_spec({
+ 'y': tf.io.FixedLenFeature([], tf.float32),
+ 'x': tf.io.FixedLenFeature([], tf.float32),
+ 's': tf.io.FixedLenFeature([], tf.string),
+ }))
+The preprocessing function is the most important concept of tf.Transform. A preprocessing function is where the transformation of the dataset really happens. It accepts and returns a dictionary of tensors, where a tensor means a Tensor or SparseTensor. There are two main groups of API calls that typically form the heart of a preprocessing function:
+- TensorFlow Ops: Any function that accepts and returns tensors. These add TensorFlow operations to the graph that transform raw data into transformed data one feature vector at a time. They run for every example, during both training and serving.
+- TensorFlow Transform Analyzers: Any of the analyzers provided by tf.Transform. Analyzers also accept and return tensors, but unlike TensorFlow ops they only run once, during training, and typically make a full pass over the entire training dataset. They create tensor constants, which are added to your graph. For example, tft.min computes the minimum of a tensor over the training dataset. tf.Transform provides a fixed set of analyzers, but this will be extended in future versions.
+
+Caution: When you apply your preprocessing function to serving inferences, the constants that were created by analyzers during training do not change. If your data has trend or seasonality components, plan accordingly.
+def preprocessing_fn(inputs):
+ """Preprocess input columns into transformed columns."""
+ x = inputs['x']
+ y = inputs['y']
+ s = inputs['s']
+ x_centered = x - tft.mean(x)
+ y_normalized = tft.scale_to_0_1(y)
+ s_integerized = tft.compute_and_apply_vocabulary(s)
+ x_centered_times_y_normalized = (x_centered * y_normalized)
+ return {
+ 'x_centered': x_centered,
+ 'y_normalized': y_normalized,
+ 's_integerized': s_integerized,
+ 'x_centered_times_y_normalized': x_centered_times_y_normalized,
+ }
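+To make the op/analyzer distinction above concrete, here is a small illustrative sketch (not part of the example pipeline; illustrate_fn is a hypothetical name):
+
+def illustrate_fn(inputs):
+    """Contrast an instance-level op with a full-pass analyzer."""
+    x = inputs['x']
+    # tf.abs is a plain TensorFlow op: it runs row by row, at both
+    # training and serving time.
+    x_abs = tf.abs(x)
+    # tft.min is an analyzer: it makes one full pass over the training
+    # dataset, and its result is baked into the graph as a constant.
+    x_shifted = x - tft.min(x)
+    return {'x_abs': x_abs, 'x_shifted': x_shifted}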
+Now we're ready to transform our data. We'll use Apache Beam with a direct runner, and supply three inputs:
+- raw_data - The raw input data that we created above
+- raw_data_metadata - The schema for the raw data
+- preprocessing_fn - The function that we created to do our transformation
+
+def main():
+ # Ignore the warnings
+ with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
+ transformed_dataset, transform_fn = ( # pylint: disable=unused-variable
+ (raw_data, raw_data_metadata) | tft_beam.AnalyzeAndTransformDataset(
+ preprocessing_fn))
+
+ transformed_data, transformed_metadata = transformed_dataset # pylint: disable=unused-variable
+
+ print('\nRaw data:\n{}\n'.format(pprint.pformat(raw_data)))
+ print('Transformed data:\n{}'.format(pprint.pformat(transformed_data)))
+
+if __name__ == '__main__':
+ main()
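+The transform_fn returned above is the exported TensorFlow graph mentioned earlier. Here is a minimal sketch (the output path is an assumption, chosen for illustration) of how it could be persisted and reloaded, so that exactly the same transformations run at training and at serving time:
+
+# Inside the Beam context, after AnalyzeAndTransformDataset:
+# _ = transform_fn | tft_beam.WriteTransformFn('/tmp/transform_output')
+
+# Later, at training or serving time:
+# tft_output = tft.TFTransformOutput('/tmp/transform_output')
+# transformed = tft_output.transform_raw_features(raw_features)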
+Previously, we used tf.Transform to do this:
+x_centered = x - tft.mean(x)
+y_normalized = tft.scale_to_0_1(y)
+s_integerized = tft.compute_and_apply_vocabulary(s)
+x_centered_times_y_normalized = (x_centered * y_normalized)
+With input of [1, 2, 3] the mean of x is 2, and we subtract it from x to center our x values at 0. So our result of [-1.0, 0.0, 1.0] is correct.
+We wanted to scale our y values between 0 and 1. Our input was [1, 2, 3] so our result of [0.0, 0.5, 1.0] is correct.
+We wanted to map our strings to indexes in a vocabulary, and there were only 2 words in our vocabulary ("hello" and "world"). So with input of ["hello", "world", "hello"] our result of [0, 1, 0] is correct.
+We wanted to create a new feature by crossing x_centered and y_normalized using multiplication. Note that this multiplies the results, not the original values, and our new result of [-0.0, 0.0, 1.0] is correct.
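+Putting those together, the transformed data printed by main() should look roughly like this (values taken from the discussion above; the exact formatting may differ):
+
+# [{'s_integerized': 0,
+#   'x_centered': -1.0,
+#   'x_centered_times_y_normalized': -0.0,
+#   'y_normalized': 0.0},
+#  {'s_integerized': 1,
+#   'x_centered': 0.0,
+#   'x_centered_times_y_normalized': 0.0,
+#   'y_normalized': 0.5},
+#  {'s_integerized': 0,
+#   'x_centered': 1.0,
+#   'x_centered_times_y_normalized': 1.0,
+#   'y_normalized': 1.0}]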
+Compressed Linear Algebra for Large-Scale Machine Learning
+Ahmed Elgohary, Matthias Boehm, Peter J. Haas, Frederick R. Reiss, Berthold Reinwald
+VLDB Journal, 2018
+
+On Optimizing Operator Fusion Plans for Large-Scale Machine Learning in SystemML
+Matthias Boehm, Berthold Reinwald, Dylan Hutchison, Alexandre V. Evfimievski, Prithviraj Sen
+CoRR, abs/1801.00829, 2018
+
+Scaling Machine Learning via Compressed Linear Algebra
+Ahmed Elgohary, Matthias Boehm, Peter J. Haas, Frederick R. Reiss, Berthold Reinwald
+SIGMOD Record 46(1), 2017
+
+SPOOF: Sum-Product Optimization and Operator Fusion for Large-Scale Machine Learning
+Tarek Elgamal, Shangyu Luo, Matthias Boehm, Alexandre V. Evfimievski, Shirish Tatikonda, Berthold Reinwald, Prithviraj Sen
+CIDR, 2017
+
+Compressed Linear Algebra for Large-Scale Machine Learning
+Ahmed Elgohary, Matthias Boehm, Peter J. Haas, Frederick R. Reiss, Berthold Reinwald
+PVLDB 9(12), 2016
+
+Declarative Machine Learning - A Classification of Basic Properties and Types
+Matthias Boehm, Alexandre V. Evfimievski, Niketan Pansare, Berthold Reinwald
+CoRR, abs/1605.05826, 2016
+
+SystemML: Declarative Machine Learning on Spark (Industrial)
+Matthias Boehm, Michael W. Dusenberry, Deron Eriksson, Alexandre V. Evfimievski, Faraz Makari Manshadi, Niketan Pansare, Berthold Reinwald, Frederick R. Reiss, Prithviraj Sen, Arvind C. Surve, Shirish Tatikonda
+PVLDB 9(13), 2016
+
+On Optimizing Machine Learning Workloads via Kernel Fusion
+Arash Ashari, Shirish Tatikonda, Matthias Boehm, Berthold Reinwald, Keith Campbell, John Keenleyside, P. Sadayappan
+PPoPP, 2015
+
+Costing Generated Runtime Execution Plans for Large-Scale Machine Learning Programs
+Matthias Boehm
+CoRR, abs/1503.06384, 2015
+
+Resource Elasticity for Large-Scale Machine Learning
+Botong Huang, Matthias Boehm, Yuanyuan Tian, Berthold Reinwald, Shirish Tatikonda, Frederick R. Reiss
+SIGMOD, 2015
+
+Large Scale Discriminative Metric Learning
+Peter D. Kirchner, Matthias Boehm, Berthold Reinwald, Daby M. Sow, Michael Schmidt, Deepak S. Turaga, Alain Biem
+ParLearning, 2014
+
+Hybrid Parallelization Strategies for Large-Scale Machine Learning in SystemML
+Matthias Boehm, Shirish Tatikonda, Berthold Reinwald, Prithviraj Sen, Yuanyuan Tian, Douglas Burdick, Shivakumar Vaithyanathan
+PVLDB 7(7), 2014
+
+SystemML's Optimizer: Plan Generation for Large-Scale Machine Learning Programs
+Matthias Boehm, Douglas R. Burdick, Alexandre V. Evfimievski, Berthold Reinwald, Frederick R. Reiss, Prithviraj Sen, Shirish Tatikonda, Yuanyuan Tian
+IEEE Data Eng. Bull. 37(3), 2014
+
+Compiling Machine Learning Algorithms with SystemML (Poster)
+Matthias Boehm, Douglas Burdick, Alexandre V. Evfimievski, Berthold Reinwald, Prithviraj Sen, Shirish Tatikonda, Yuanyuan Tian
+SOCC, 2013
+
+Scalable and Numerically Stable Descriptive Statistics in SystemML
+Y. Tian, S. Tatikonda, B. Reinwald
+ICDE, 2012, pp. 1351-1359
+
+SystemML: Declarative Machine Learning on MapReduce
+A. Ghoting, R. Krishnamurthy, E. Pednault, B. Reinwald, V. Sindhwani, S. Tatikonda, Y. Tian, S. Vaithyanathan
+ICDE, 2011, pp. 231-242
+