From 3fe96f4474ea5829a24e32f59c7041746af3699c Mon Sep 17 00:00:00 2001
From: jasminthiele
Date: Tue, 27 Jan 2026 12:27:33 +0100
Subject: [PATCH 1/3] Add movie recommender tutorial

---
 .../docs/source/guide/movie_recommender.rst   | 493 +++++++++++++++++
 src/main/python/docs/source/index.rst         |   1 +
 .../tutorials/movie_recommender_system.py     | 508 ++++++++++++++++++
 3 files changed, 1002 insertions(+)
 create mode 100644 src/main/python/docs/source/guide/movie_recommender.rst
 create mode 100644 src/main/python/systemds/examples/tutorials/movie_recommender_system.py

diff --git a/src/main/python/docs/source/guide/movie_recommender.rst b/src/main/python/docs/source/guide/movie_recommender.rst
new file mode 100644
index 00000000000..329507ccb62
--- /dev/null
+++ b/src/main/python/docs/source/guide/movie_recommender.rst
@@ -0,0 +1,493 @@
.. -------------------------------------------------------------
..
.. Licensed to the Apache Software Foundation (ASF) under one
.. or more contributor license agreements. See the NOTICE file
.. distributed with this work for additional information
.. regarding copyright ownership. The ASF licenses this file
.. to you under the Apache License, Version 2.0 (the
.. "License"); you may not use this file except in compliance
.. with the License. You may obtain a copy of the License at
..
.. http://www.apache.org/licenses/LICENSE-2.0
..
.. Unless required by applicable law or agreed to in writing,
.. software distributed under the License is distributed on an
.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
.. KIND, either express or implied. See the License for the
.. specific language governing permissions and limitations
.. under the License.
..
.. -------------------------------------------------------------


Building a Movie Recommender System
===================================

Have you ever wondered how Netflix, Disney+, and other streaming
platforms know exactly which movies or TV shows to recommend to you? In
this tutorial, we will explore how recommender systems work and show how
to implement them using SystemDS, as well as NumPy and PyTorch for
comparison. The goal of this tutorial is to showcase different features
of the SystemDS framework that can be accessed with the Python API.

In this tutorial, we will explore the implementation of a recommender
system using three distinct mathematical and machine learning
approaches:

- **Cosine Similarity**: A geometric approach to measure the similarity
  between users or items based on the angle between their preference
  vectors.

- **Matrix Factorization**: A technique often used in latent factor
  models (like ALS) to decompose the user-item interaction matrix into
  lower-dimensional representations.

- **Linear Regression**: A supervised learning approach used to predict
  specific ratings by modeling the relationship between user/item
  features and the target rating.

This tutorial shows only snippets of the code; the complete script can
be found `here `__.

To start with the tutorial, you first have to install SystemDS: :doc:`/getting_started/install`.


Dataset
~~~~~~~

As a dataset we chose the `MovieLens 100K
Dataset `__. It consists
of 100,000 movie ratings from 943 different users on 1682 movies from
the late 1990s. In the following, we will often refer to movies as
items. The data is stored in different files:
- u.data (contains
user_id, item_id (movie), rating and timestamp),
- u.user (contains
user_id, age, gender, occupation and zip code),
- u.item (contains
item_id, movie name, release date, IMDb link, genre in a one-hot
format).
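If you have not worked with MovieLens before, the archive must first be
downloaded and extracted. A minimal sketch of this step is shown below;
it assumes the standard GroupLens download URL for the 100k archive and
the ``movie_data`` folder used throughout this tutorial:

.. code:: python

    import io
    import urllib.request
    import zipfile

    # Download the MovieLens 100k archive and extract it to ./movie_data
    url = "https://files.grouplens.org/datasets/movielens/ml-100k.zip"
    with urllib.request.urlopen(url) as response:
        zipfile.ZipFile(io.BytesIO(response.read())).extractall("movie_data")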
Preprocessing for Cosine Similarity and Matrix Factorization
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

To prepare our data for Cosine Similarity and Matrix Factorization, we
must convert the raw ratings into a User-Item Interaction Matrix. In
this structure, each row represents a unique user, and each column
represents a specific movie. The intersection of a row and column
contains the user’s rating for that movie.

Because the average user has only seen a small percentage of the
thousands of movies available, this matrix is extremely sparse. Most
cells will contain missing values (NaN), which we must handle
differently depending on the algorithm we choose.

First, we load the MovieLens 100k dataset:

.. code:: python

    # Load the MovieLens 100k dataset
    header = ['user_id', 'item_id', 'rating', 'timestamp']
    ratings_df = pd.read_csv('movie_data/ml-100k/u.data', sep='\t', names=header)

We then use the Pandas ``.pivot()`` function to transform the data. This
gives us the User-Item table.

.. code:: python

    pivot_df = ratings_df.pivot(index='user_id', columns='item_id', values='rating')

The resulting matrix provides a high-level view of our dataset’s
interaction patterns:

====== ====== ====== ======
User   Item 1 Item 2 Item 3
====== ====== ====== ======
user 1 5      3
user 2 4             2
user 3        1      5
====== ====== ====== ======

Cosine Similarity
~~~~~~~~~~~~~~~~~

Collaborative Filtering is an umbrella term for algorithms that generate
recommendations by identifying patterns in user ratings. One of the most
common techniques is User-Based Collaborative Filtering, where we
calculate the similarity between users based on their rating history. To
do this, we treat each user as a vector in a high-dimensional space and
measure the “distance” between them using `Cosine Similarity `__.

To calculate the cosine similarity between all users (rows) in a matrix
:math:`X`, we normalize this matrix and then multiply it with its
transpose.

If :math:`\hat{X}` is the row-normalized version of :math:`X` such that
each row :math:`i` is defined as
:math:`\hat{x}_i = \frac{x_i}{\|x_i\|}`, then the entire Cosine
Similarity matrix :math:`S` is calculated via the Gram matrix:

.. math:: S = \hat{X}\hat{X}^T

Using NumPy, we perform these operations using vectorization for
efficiency:

.. code:: python

    # L2 norm of each user vector
    norms = np.linalg.norm(X, axis=1, keepdims=True)

    # Normalize user vectors
    X_norm = X / norms

    # Cosine similarity = dot product of normalized vectors
    user_similarity = X_norm @ X_norm.T

In SystemDS, we follow a similar logic. First, we import and initialize
the SystemDSContext.

.. code:: python

    from systemds.context import SystemDSContext

    with SystemDSContext() as sds:

Inside this context, we load the data into SystemDS and perform our
calculations using simple matrix
functions: :doc:`/api/operator/node/matrix`.

.. code:: python

    # Load into SystemDS
    X = sds.from_numpy(X_np)

    # Compute L2 norms
    row_sums = (X * X).sum(axis=1)
    norms = row_sums.sqrt()

    # Normalize user vectors
    X_norm = X / norms

    # Cosine similarity = dot product of normalized vectors
    user_similarity = X_norm @ X_norm.t()

In SystemDS, the line ``user_similarity = X_norm @ X_norm.t()`` does
not execute any math. Instead, it creates an execution plan. The actual
computation only occurs when we call ``.compute()``, allowing SystemDS
to optimize the entire operation.

.. code:: python

    user_similarity = user_similarity.compute()

In both cases, ``user_similarity`` is a symmetric matrix that contains
the similarity for every user-user pair, with ones on the diagonal.

While both methods produce the same results, SystemDS takes slightly
longer for this specific dataset.

=============== ===== ========
Method          NumPy SystemDS
=============== ===== ========
Time in seconds 0.02  0.47
=============== ===== ========
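The similarity matrix alone is not yet a recommendation. One common way
to use it is to score the movies a user has not seen by the ratings of
the most similar users. The helper below is a minimal sketch of this
idea and is not part of the tutorial script; it assumes ``X`` is the
zero-filled user-item matrix and ``user_similarity`` the NumPy result
from above:

.. code:: python

    def top_movies_for(user, X, user_similarity, k=10, n=5):
        # Indices of the k most similar users (position 0 is the user itself)
        neighbors = np.argsort(user_similarity[user])[::-1][1:k + 1]

        # Average the neighbors' ratings, weighted by their similarity
        weights = user_similarity[user, neighbors]
        scores = weights @ X[neighbors] / weights.sum()

        # Mask movies the user has already rated, then return the
        # column indices of the n highest-scoring unseen movies
        scores[X[user] > 0] = -np.inf
        return np.argsort(scores)[::-1][:n]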
Matrix Factorization
~~~~~~~~~~~~~~~~~~~~

Another powerful method for generating movie recommendations is Matrix
Factorization. Instead of looking at surface-level data, this technique
uncovers latent factors, the hidden patterns that represent a user’s
specific tastes (like a preference for 90s rom-coms) and a movie’s
unique characteristics (like its level of whimsy).

In a real-world scenario, our user-item interaction matrix :math:`R`
is incredibly sparse because most users have only rated a tiny fraction
of the available movies. Matrix factorization solves this by decomposing
:math:`R` into two much smaller, lower-dimensional matrices:

- :math:`P`: Representing user preferences.
- :math:`Q`: Representing item characteristics.

By multiplying these two matrices back together, we can estimate the
missing values in our original matrix:

.. math:: R \approx P \cdot Q^T

To find :math:`P` and :math:`Q`, we use the optimization algorithm
called
`Alternating Least Squares (ALS) `__.

In NumPy, we manually iterate through users and items, solving a
regularized least-squares problem for each. This gives us full control
but can be computationally expensive as the dataset grows. We can
compute :math:`\hat{R} = P \cdot Q^T` like this
`(cf. CodeSignal) `__:

.. code:: python

    # Random initialization of user and item factors
    P = np.random.rand(num_users, rank) * 0.01
    Q = np.random.rand(num_items, rank) * 0.01

    for iteration in range(maxi):

        # Update user factors
        for u in range(num_users):

            # Get only items user 'u' actually rated
            user_mask = mask[u, :]
            Q_u = Q[user_mask, :]
            R_u = R[u, user_mask]

            if Q_u.shape[0] > 0:
                P[u, :] = np.linalg.solve(np.dot(Q_u.T, Q_u) + reg * np.eye(rank), np.dot(Q_u.T, R_u))

        # Update item factors
        for i in range(num_items):

            # Get only users who actually rated item 'i'
            item_mask = mask[:, i]
            P_i = P[item_mask, :]
            R_i = R[item_mask, i]

            if P_i.shape[0] > 0:
                Q[i, :] = np.linalg.solve(np.dot(P_i.T, P_i) + reg * np.eye(rank), np.dot(P_i.T, R_i))

    R_hat = P @ Q.T

SystemDS allows us to execute the same logic using high-level
script-like functions that are internally optimized. It offers a wide
variety of built-in algorithms :doc:`/api/operator/algorithms`, including ALS.
First, we import our algorithm.

.. code:: python

    from systemds.operator.algorithm import als

Then, we initialize the SystemDS context:

.. code:: python

    with SystemDSContext() as sds:

To tune the model for our specific dataset, we configure the following
hyperparameters:

- ``rank = 20`` The number of latent factors (hidden features) used to
  describe users and movies. A higher rank allows for more complexity
  but increases the risk of overfitting.
- ``reg = 1.0`` The regularization parameter. This prevents the model
  from becoming too complex by penalizing large weights, helping it
  generalize better to unseen data.
- ``maxi = 20`` The maximum number of iterations, since ALS is an
  iterative process.

Then we can run the computation. Note that, unlike our NumPy version,
the ``als`` builtin already returns the item factor matrix in
transposed form, so we reconstruct the ratings with ``P @ Q`` rather
than ``P @ Q.T``.

.. code:: python

    # Load data into SystemDS
    R = sds.from_numpy(pivot_df.fillna(0).values)

    # Approximate factorization of R into two matrices P and Q using ALS
    P, Q = als(R, rank=20, reg=1.0, maxi=20).compute()

    R_hat = P @ Q

To test how well our models generalize to new data, we performed an
80/20 split, using the first 80,000 ratings for training and the
remainder for testing. We compared both approaches based on execution
time and Root Mean Squared Error (RMSE).

=============== ===== ========
Method          NumPy SystemDS
=============== ===== ========
Time in seconds 1.09  2.48
Train RMSE      0.67  0.67
Test RMSE       1.03  1.01
=============== ===== ========

Both implementations are mathematically consistent, and SystemDS
achieved a slightly better Test RMSE.
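With the reconstructed matrix in hand, producing actual recommendations
is straightforward. The sketch below mirrors the DataFrame wrapping and
the clipping to the valid rating range used in the accompanying
tutorial script; the helper ``recommend_movies`` itself is hypothetical:

.. code:: python

    # Wrap the dense prediction matrix with the original user/item labels
    ratings_hat_df = pd.DataFrame(R_hat, index=pivot_df.index, columns=pivot_df.columns)

    def recommend_movies(user_id, n=5):
        # Force predictions to stay between 0.5 and 5.0
        predicted = ratings_hat_df.loc[user_id].clip(0.5, 5.0)

        # Keep only movies the user has not rated yet
        unseen = pivot_df.loc[user_id].isna()
        return predicted[unseen].nlargest(n)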
Linear Regression
~~~~~~~~~~~~~~~~~

Unlike Matrix Factorization, which relies purely on interaction
patterns, Linear Regression allows us to incorporate “side information”
about users and items. By using features like user demographics and
movie genres, we can build a Content-Based Filtering model that predicts
ratings based on specific attributes.

Preprocessing
^^^^^^^^^^^^^

For Linear Regression and Neural Networks, our data must be strictly
numerical and properly scaled. We begin by loading the MovieLens
datasets:

.. code:: python

    ratings_df = pd.read_csv('movie_data/ml-100k/u.data', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
    user_df = pd.read_csv('movie_data/ml-100k/u.user', sep='|', names=['user_id', 'age', 'gender', 'occupation', 'zip_code'])
    item_df = pd.read_csv('movie_data/ml-100k/u.item', sep='|', names=[
        'item_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation',
        "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
        'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'], encoding='latin-1')

Libraries like NumPy and SystemDS cannot process strings (e.g.,
“Student” or “Female”). We must transform these into numerical
representations:

.. code:: python

    # Turn categorical data into numerical
    user_df['gender'] = user_df['gender'].apply(lambda x: 0 if x == 'F' else 1)
    user_df = pd.get_dummies(user_df, columns=['occupation'])
    item_df['release_date'] = pd.to_datetime(item_df['release_date'], errors='raise', format='%d-%b-%Y')
    item_df['release_year'] = item_df['release_date'].dt.year

Features like ``age`` and ``release_year`` have different scales. If
left unscaled, the model might incorrectly give more “weight” to the
larger year values. We normalize them to a 0–1 range to ensure equal
influence.

.. code:: python

    # Normalize data
    user_df['age'] = (user_df['age'] - user_df['age'].min()) / (user_df['age'].max() - user_df['age'].min())
    item_df['release_year'] = (item_df['release_year'] - item_df['release_year'].min()) / (item_df['release_year'].max() - item_df['release_year'].min())

Finally, we merge these datasets into a single table. Each row
represents a specific rating, enriched with all available user and movie
features. After merging, we drop non-numerical columns (like ``title``
or ``IMDb_URL``), remove rows with NaN values and split the data into
Training and Testing sets, as sketched below.
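The following condensed sketch of these remaining steps is adapted from
the accompanying tutorial script:

.. code:: python

    # Merge ratings with user and item features
    merged_df = ratings_df.merge(user_df, on='user_id').merge(item_df, on='item_id')

    # Drop identifiers and non-numerical columns
    merged_df = merged_df.drop(['user_id', 'item_id', 'timestamp', 'zip_code', 'title',
                                'release_date', 'video_release_date', 'IMDb_URL', 'unknown'], axis=1)

    # Columns created by get_dummies are boolean; cast them to integers
    bool_cols = merged_df.select_dtypes(include=['bool']).columns
    merged_df[bool_cols] = merged_df[bool_cols].astype(int)

    # Remove rows with missing values
    merged_df = merged_df.dropna()

    # Separate the target from the features, then split 80/20
    ratings = merged_df.pop('rating')
    train_size = int(0.8 * len(ratings))
    X_train = merged_df[:train_size].to_numpy()
    y_train = ratings[:train_size].to_numpy()
    X_test = merged_df[train_size:].to_numpy()
    y_test = ratings[train_size:].to_numpy()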
Linear Regression with PyTorch
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

PyTorch is a popular deep learning framework that approaches linear
regression as an iterative optimization problem. We use Gradient Descent
to minimize the Mean Squared Error (MSE) by repeatedly updating the
model’s weights based on calculated gradients.

Data must be converted to ``torch.Tensor`` format.

.. code:: python

    X_train_tensor = torch.from_numpy(X_train).float()
    y_train_tensor = torch.from_numpy(y_train).float().reshape(-1, 1)
    X_test_tensor = torch.from_numpy(X_test).float()
    y_test_tensor = torch.from_numpy(y_test).float().reshape(-1, 1)

We define a model class and an optimizer (SGD). The learning rate
(``lr``) determines the step size for each update.

.. code:: python

    n_features = X_train.shape[1]

    class linearRegression(torch.nn.Module):
        def __init__(self):
            super(linearRegression, self).__init__()
            # input size: n_features, output size: 1
            self.linear = torch.nn.Linear(n_features, 1)

        def forward(self, x):
            out = self.linear(x)
            return out

    lr_model = linearRegression()
    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.SGD(lr_model.parameters(), lr=0.01)

The model iterates through the dataset for a set number of epochs. In
each iteration, it performs a forward pass, calculates the loss, and
backpropagates the gradients to update the weights.

.. code:: python

    for epoch in range(1000):

        # Forward pass and loss
        pred_y = lr_model(X_train_tensor)
        loss = criterion(pred_y, y_train_tensor)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

We use ``.eval()`` and ``torch.no_grad()`` to disable gradient tracking
during inference.

.. code:: python

    lr_model.eval()
    with torch.no_grad():
        y_pred_test = lr_model(X_test_tensor)

Then, we can calculate the RMSE.

.. code:: python

    y_pred = y_pred_test.numpy().flatten()
    y_true = y_test_tensor.numpy().flatten()
    rmse = np.sqrt(np.mean((y_pred - y_true) ** 2))

Linear Regression with SystemDS
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Following the same pattern as ALS, SystemDS
provides a highly optimized, built-in algorithm for linear regression.
This implementation is designed to handle large-scale data by
automatically deciding between direct solvers and conjugate gradient
methods based on the data’s characteristics.

First, we import the ``lm`` training algorithm and the ``lmPredict``
function for inference.

.. code:: python

    from systemds.operator.algorithm import lm, lmPredict

We transfer our NumPy arrays into the SystemDS context.

.. code:: python

    X_ds = sds.from_numpy(X_train)
    y_ds = sds.from_numpy(y_train)
    X_test_ds = sds.from_numpy(X_test)

We call the ``lm`` function to train our model.

.. code:: python

    model = lm(X=X_ds, y=y_ds)

To generate predictions for the test set, we use ``lmPredict``. Because
SystemDS uses Lazy Evaluation, the actual computation is only triggered
when we call ``.compute()``.

.. code:: python

    predictions = lmPredict(X_test_ds, model).compute()

Finally, we calculate the RMSE to compare the performance against our
PyTorch implementation.
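As in the accompanying tutorial script, this is plain NumPy applied to
the returned predictions:

.. code:: python

    # Compute RMSE on the test set
    y_pred = predictions.flatten()
    y_true = y_test.flatten()
    rmse = np.sqrt(np.mean((y_pred - y_true) ** 2))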
Comparison
^^^^^^^^^^

=============== ======= ========
Method          PyTorch SystemDS
=============== ======= ========
Time in seconds 1.77    0.87
Test RMSE       1.13    1.08
=============== ======= ========

Using linear regression, SystemDS ran considerably faster than our
PyTorch approach and achieved a lower test RMSE.
diff --git a/src/main/python/docs/source/index.rst b/src/main/python/docs/source/index.rst
index 09de5494df5..98dd9393447 100644
--- a/src/main/python/docs/source/index.rst
+++ b/src/main/python/docs/source/index.rst
@@ -54,6 +54,7 @@ tensors (multi-dimensional arrays) whose first dimension may have a heterogeneou
    guide/federated.rst
    guide/algorithms_basics.rst
    guide/python_end_to_end_tut.rst
+   guide/movie_recommender.rst

 .. toctree::
    :maxdepth: 1
diff --git a/src/main/python/systemds/examples/tutorials/movie_recommender_system.py b/src/main/python/systemds/examples/tutorials/movie_recommender_system.py
new file mode 100644
index 00000000000..c25c7d9c9f1
--- /dev/null
+++ b/src/main/python/systemds/examples/tutorials/movie_recommender_system.py
@@ -0,0 +1,508 @@
#!/usr/bin/env python3
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

import time
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from systemds.context import SystemDSContext
from systemds.operator.algorithm import als, lm, lmPredict

# To run this code, first download the MovieLens 100k dataset from
# https://grouplens.org/datasets/movielens/100k/ and extract it to the specified folder.

data_folder = '/movie_data/ml-100k/'

def read_movie_data(n_rows: int=10000) -> pd.DataFrame:
    """
    Reads the MovieLens 100k dataset and returns a DataFrame with the following columns: user_id, item_id, rating.

    :param n_rows: Number of rows to read from the dataset.
    :return: DataFrame containing the movie ratings data.
+ """ + + # Load the MovieLens 100k dataset + header = ['user_id', 'item_id', 'rating', 'timestamp'] + ratings_df = pd.read_csv(data_folder + 'u.data', sep='\t', names=header) + + # Drop timestamp column + ratings_df = ratings_df.drop('timestamp', axis=1) + + # Only check first n_rows rows to speed up processing + ratings_df = ratings_df.head(n_rows) + + return ratings_df + + +def create_pivot_table(ratings_df: pd.DataFrame) -> pd.DataFrame: + """ + Creates a pivot table from the ratings DataFrame where rows are users, columns are items, and values are ratings. + + :param ratings_df: DataFrame containing the movie ratings data with columns user_id, item_id, rating. + :return: Pivot table with users as rows, items as columns, and ratings as values. + """ + + return ratings_df.pivot(index='user_id', columns='item_id', values='rating') + + +### Cosine Similarity Functions ### + +def numpy_cosine_similarity(pivot_df: pd.DataFrame) -> tuple[pd.DataFrame, float]: + """ + Calculates the cosine similarity between users using NumPy. + + :param pivot_df: DataFrame containing the pivot table of user-item ratings. + :return: DataFrame containing the cosine similarity between users and time taken. + """ + + # zeros = unrated items + X = pivot_df.fillna(0).values + + start = time.time() + + # L2 norm of each user vector + norms = np.linalg.norm(X, axis=1, keepdims=True) + + # Normalize user vectors + X_norm = X / norms + + # Cosine similarity = dot product of normalized vectors + user_similarity = X_norm @ X_norm.T + + end = time.time() + + # convert to DataFrame for better readability + user_similarity_df = pd.DataFrame( + user_similarity, + index=pivot_df.index, + columns=pivot_df.index + ) + + return user_similarity_df, end - start + + +def systemds_cosine_similarity(pivot_df: pd.DataFrame) -> tuple[pd.DataFrame, float]: + """ + Calculates the cosine similarity between users using SystemDS. + + :param pivot_df: DataFrame containing the pivot table of user-item ratings. + :return: DataFrame containing the cosine similarity between users and time taken. + """ + + # Zeros = unrated items + X_np = pivot_df.fillna(0).values + + with SystemDSContext() as sds: + + start = time.time() + + # Load into SystemDS + X = sds.from_numpy(X_np) + + # Compute L2 norms + row_sums = (X * X).sum(axis=1) + norms = row_sums.sqrt() + + # Normalize user vectors + X_norm = X / norms + + # Cosine similarity = dot product of normalized vectors + user_similarity = X_norm @ X_norm.t() + + # Compute result + user_similarity = user_similarity.compute() + + end = time.time() + + # Convert to DataFrame for better readability + user_similarity_df = pd.DataFrame( + user_similarity, + index=pivot_df.index, + columns=pivot_df.index + ) + + return user_similarity_df, end - start + + +def evaluate_cosine_similarity() -> None: + """ + Evaluates and compares the cosine similarity computations between NumPy and SystemDS. 
+ """ + + ratings = read_movie_data(100000) + pivot_df = create_pivot_table(ratings) + + numpy_df, numpy_time = numpy_cosine_similarity(pivot_df) + systemds_df, systemds_time = systemds_cosine_similarity(pivot_df) + + # Check if the results are approximately equal + if np.allclose(numpy_df.values, systemds_df.values, atol=1e-8): + print("Cosine similarity DataFrames are approximately equal.") + else: + print("Cosine similarity DataFrames are NOT equal.") + + print(f"Time taken for NumPy cosine similarity: {numpy_time}") + print(f"Time taken for SystemDS cosine similarity: {systemds_time}") + + +### Matrix Factorization Functions ### + +def numpy_als(pivot_df: pd.DataFrame, rank: int, reg: float, maxi: int) -> tuple[pd.DataFrame, float]: + """ + Calculates a matrix R_hat using Alternating Least Squares (ALS) matrix factorization in numpy. + + :param pivot_df: DataFrame containing the pivot table of user-item ratings. + :return: DataFrame containing the predicted ratings and time taken. + """ + + # Fill NaNs with zeros for computation + R = pivot_df.fillna(0).values + + start = time.time() + num_users, num_items = R.shape + mask = (R != 0) + + # Random initialization of user and item factors + P = np.random.rand(num_users, rank) * 0.01 + Q = np.random.rand(num_items, rank) * 0.01 + + for iteration in range(maxi): + + # Update user factors + for u in range(num_users): + + # Get only items user 'u' actually rated + user_mask = mask[u, :] + Q_u = Q[user_mask, :] + R_u = R[u, user_mask] + + if Q_u.shape[0] > 0: + P[u, :] = np.linalg.solve(np.dot(Q_u.T, Q_u) + reg * np.eye(rank), np.dot(Q_u.T, R_u)) + + # Update item factors + for i in range(num_items): + + # Get only users who actually rated item 'i' + item_mask = mask[:, i] + P_i = P[item_mask, :] + R_i = R[item_mask, i] + + if P_i.shape[0] > 0: + Q[i, :] = np.linalg.solve(np.dot(P_i.T, P_i) + reg * np.eye(rank), np.dot(P_i.T, R_i)) + + end = time.time() + + # Multiply P and Q to get the approximated ratings matrix + R_hat = P @ Q.T + + # Convert to DataFrame for better readability + ratings_hat_df = pd.DataFrame( + R_hat, + index=pivot_df.index, + columns=pivot_df.columns + ) + + return ratings_hat_df, end - start + + +def systemds_als(pivot_df: pd.DataFrame, rank: int, reg: float, maxi: int) -> tuple[pd.DataFrame, float]: + """ + Calculates a matrix R_hat using Alternating Least Squares (ALS) matrix factorization in SystemDS. + + :param pivot_df: DataFrame containing the pivot table of user-item ratings. + :return: DataFrame containing the predicted ratings and time taken. + """ + + start = time.time() + + with SystemDSContext() as sds: + + # Load data into SystemDS + R = sds.from_numpy(pivot_df.fillna(0).values) + + # Approximate factorization of R into two matrices P and Q using ALS + P, Q = als(R, rank=rank, reg=reg, maxi=maxi).compute() + end = time.time() + + # Multiply P and Q to get the approximated ratings matrix + R_hat = P @ Q + + # Convert to DataFrame for better readability + ratings_hat_df = pd.DataFrame( + R_hat, + index=pivot_df.index, + columns=pivot_df.columns + ) + + return ratings_hat_df, end - start + + +def evaluate_als(model: str = "systemds", rank: int = 10, reg: float = 1.0, maxi: int = 20) -> None: + """ + Evaluates and compares the ALS computations between NumPy and SystemDS. The data is split into training + and test sets with an 80/20 ratio. Then the RMSE is calculated for both sets. + + :param model: Model to use for ALS computation ("systemds" or "numpy"). + :param rank: Rank of the factorized matrices. 
+ :param reg: Regularization parameter. + :param maxi: Maximum number of iterations. + """ + + ratings = read_movie_data(100000) + pivot_df = create_pivot_table(ratings[:80000]) + + if model == "systemds": + ratings_hat_df, systemds_time = systemds_als(pivot_df, rank, reg, maxi) + else: + ratings_hat_df, numpy_time = numpy_als(pivot_df, rank, reg, maxi) + + # Print time taken + print(f"Time taken for {model} ALS: ", systemds_time if model == "systemds" else numpy_time) + + # Training error + mask = ~np.isnan(pivot_df.values) + train_rmse = np.sqrt(np.mean((ratings_hat_df.values[mask] - pivot_df.values[mask])**2)) + print(f"Train RMSE for model with {model}: {train_rmse}") + + # Test error + test_set = ratings[80000:] + stacked_series = ratings_hat_df.stack() + ratings_hat_long = stacked_series.reset_index() + ratings_hat_long.columns = ['user_id', 'item_id', 'rating'] + + merged_df = pd.merge( + test_set, + ratings_hat_long, + on=['user_id', 'item_id'], + how='inner', + suffixes=('_actual', '_predicted') + ) + + # Force predictions to stay between 0.5 and 5.0 + merged_df['rating_predicted'] = merged_df['rating_predicted'].clip(0.5, 5.0) + + # Calculate root mean squared error (RMSE) + squared_errors = (merged_df['rating_actual'] - merged_df['rating_predicted'])**2 + mse = np.mean(squared_errors) + test_rmse = np.sqrt(mse) + + print(f"Test RMSE for model with {model}: {test_rmse}") + + +### Linear Regression ### + +def preprocess_data() -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """ + This function reads and preprocesses the MovieLens 100k dataset for linear regression. It returns four + different numpy arrays: X_train, y_train, X_test, y_test. The preprocessing steps include: + - Reading the datasets + - Handling categorical variables + - Normalizing numerical features + - Merging datasets + - Dropping unnecessary columns + - Dropping rows with NaN values + - Splitting into training and testing sets. 
+ + :return: tuple of numpy arrays (X_train, y_train, X_test, y_test) + """ + + # Read datasets + ratings_df = pd.read_csv(data_folder + 'u.data', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp']) + user_df = pd.read_csv(data_folder + 'u.user', sep='|', names=['user_id', 'age', 'gender', 'occupation', 'zip_code']) + item_df = pd.read_csv(data_folder + 'u.item', sep='|', names=[ + 'item_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', + "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', + 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'], encoding='latin-1') + + # Turn categorical data into numerical + user_df['gender'] = user_df['gender'].apply(lambda x: 0 if x == 'F' else 1) + user_df = pd.get_dummies(user_df, columns=['occupation']) + item_df['release_date'] = pd.to_datetime(item_df['release_date'], errors='raise', format='%d-%b-%Y') + item_df['release_year'] = item_df['release_date'].dt.year + + # Normalize data + user_df['age'] = (user_df['age'] - user_df['age'].min()) / (user_df['age'].max() - user_df['age'].min()) + item_df['release_year'] = (item_df['release_year'] - item_df['release_year'].min()) / (item_df['release_year'].max() - item_df['release_year'].min()) + + # Merge datasets + merged_df = ratings_df.merge(user_df, on='user_id').merge(item_df, on='item_id') + + # Drop unnecessary columns + merged_df = merged_df.drop(['user_id', 'item_id', 'timestamp', 'zip_code', 'title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown'], axis=1) + + # Convert boolean columns to integers (important for NumPy and SystemDS) + bool_cols = merged_df.select_dtypes(include=['bool']).columns + merged_df[bool_cols] = merged_df[bool_cols].astype(int) + + # Drop rows with NaN values + merged_df = merged_df.dropna() + + ratings = merged_df.pop('rating') + features = merged_df + + # Split into train and test sets and convert to numpy arrays + train_size = int(0.8 * len(ratings)) + X_train = features[:train_size].to_numpy() + y_train = ratings[:train_size].to_numpy() + X_test = features[train_size:].to_numpy() + y_test = ratings[train_size:].to_numpy() + + print("NaNs in X:", np.isnan(X_train).any()) + print("NaNs in y:", np.isnan(y_train).any()) + + return X_train, y_train, X_test, y_test + + +def linear_regression_pytorch(X_train, y_train, X_test, y_test, num_epochs=1000) -> tuple[float, float]: + """ + Trains a linear regression model using PyTorch. 
+ + :param X_train, X_test: numpy arrays of shape (n_samples, n_features) + :param y_train, y_test: numpy arrays of shape (n_samples,) + :param num_epochs: number of training iterations + + :return rmse: RMSE on test set + :return time taken: time in seconds for training and prediction + """ + + start = time.time() + + # Convert to PyTorch tensors + X_train_tensor = torch.from_numpy(X_train).float() + y_train_tensor = torch.from_numpy(y_train).float().reshape(-1, 1) + X_test_tensor = torch.from_numpy(X_test).float() + y_test_tensor = torch.from_numpy(y_test).float().reshape(-1, 1) + + # Define model + n_features = X_train.shape[1] + class linearRegression(torch.nn.Module): + def __init__(self): + super(linearRegression, self).__init__() + # input size: n_features, output size: 1 + self.linear = torch.nn.Linear(n_features, 1) + def forward(self, x): + out = self.linear(x) + return out + lr_model = linearRegression() + + # Loss and optimizer + criterion = torch.nn.MSELoss() + optimizer = torch.optim.SGD(lr_model.parameters(), lr = 0.01) + + # Training loop + for epoch in range(num_epochs): + + # Forward pass and loss + pred_y = lr_model(X_train_tensor) + loss = criterion(pred_y, y_train_tensor) + + # Backward pass and optimization + optimizer.zero_grad() + loss.backward() + optimizer.step() + + if (epoch+1) % 100 == 0: + print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}') + + # Make predictions on test set + lr_model.eval() + with torch.no_grad(): + y_pred_test = lr_model(X_test_tensor) + + end = time.time() + + y_pred = y_pred_test.numpy().flatten() + y_true = y_test_tensor.numpy().flatten() + + # Compute RMSE + rmse = np.sqrt(np.mean((y_pred - y_true) ** 2)) + + return rmse, end - start + + +def linear_regression_systemds(X_train, y_train, X_test, y_test, num_epochs=1000) -> tuple[float, float]: + """ + Trains a linear regression model using SystemDS. + + :param X_train, X_test: numpy arrays of shape (n_samples, n_features) + :param y_train, y_test: numpy arrays of shape (n_samples,) + :param num_epochs: maximum number of training iterations + + :return rmse: RMSE on test set + :return time taken: time in seconds for training and prediction + """ + + with SystemDSContext() as sds: + + start = time.time() + + # Read data into SystemDS + X_ds = sds.from_numpy(X_train) + y_ds = sds.from_numpy(y_train) + X_test_ds = sds.from_numpy(X_test) + + # Train linear regression model with max iterations + model = lm(X=X_ds, y=y_ds, maxi=num_epochs) + # Make predictions on test set + predictions = lmPredict(X_test_ds, model).compute() + + end = time.time() + + y_pred = predictions.flatten() + y_true = y_test.flatten() + + # Compute RMSE + rmse = np.sqrt(np.mean((y_pred - y_true) ** 2)) + + return rmse, end - start + + +def evaluate_lr() -> None: + """ + Evaluates and compares the linear regression computations between PyTorch and SystemDS. The data is split into + training and test sets with an 80/20 ratio. Then the RMSE is calculated for both sets. 
+ """ + + print("Evaluating Linear Regression Models...") + + X_train, y_train, X_test, y_test = preprocess_data() + + pytorch_rmse, pytorch_time = linear_regression_pytorch(X_train, y_train, X_test, y_test, num_epochs=1000) + systemds_rmse, systemds_time = linear_regression_systemds(X_train, y_train, X_test, y_test, num_epochs=1000) + + print(f"PyTorch RMSE: {pytorch_rmse}, Time: {pytorch_time} seconds") + print(f"SystemDS RMSE: {systemds_rmse}, Time: {systemds_time} seconds") + + +if __name__ == "__main__": + + # Cosine Similarity + evaluate_cosine_similarity() + + # Matrix Factorization using ALS + evaluate_als("systemds") + evaluate_als("numpy") + + # Linear Regression + evaluate_lr() From f42c05fce7f39e33c997359a3d53d609de2ac429 Mon Sep 17 00:00:00 2001 From: jasminthiele Date: Tue, 27 Jan 2026 12:37:06 +0100 Subject: [PATCH 2/3] Fix formating issue --- .../python/docs/source/guide/movie_recommender.rst | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/main/python/docs/source/guide/movie_recommender.rst b/src/main/python/docs/source/guide/movie_recommender.rst index 329507ccb62..ed74b5c260b 100644 --- a/src/main/python/docs/source/guide/movie_recommender.rst +++ b/src/main/python/docs/source/guide/movie_recommender.rst @@ -61,13 +61,12 @@ Dataset `__. It consists of 100.000 movie ratings from 943 different users on 1682 movies from the late 1990s. In the following, we will often refer to movies as items. The data is stored in different files: -- u.data (contains -user_id, item_id (movie), rating and timestamp), -- u.user (contains -user_id, age, gender, occupation and zip code), -- u.item (contains -item_id, movie name, release date, ImDb link, genre in a hot-one -format). + +- **u.data** (contains user_id, item_id (movie), rating and timestamp), + +- **u.user** (contains user_id, age, gender, occupation and zip code), + +- **u.item** (contains item_id, movie name, release date, ImDb link, genre in a hot-one format). Preprocessing for Cosine Similarity and Matrix Factorization ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ From 32c797309d49c1f1d2f44337a140c4f487101ad0 Mon Sep 17 00:00:00 2001 From: jasminthiele Date: Tue, 27 Jan 2026 18:24:15 +0100 Subject: [PATCH 3/3] apply black formatting --- .../tutorials/movie_recommender_system.py | 228 ++++++++++++------ 1 file changed, 150 insertions(+), 78 deletions(-) diff --git a/src/main/python/systemds/examples/tutorials/movie_recommender_system.py b/src/main/python/systemds/examples/tutorials/movie_recommender_system.py index c25c7d9c9f1..66e486949a7 100644 --- a/src/main/python/systemds/examples/tutorials/movie_recommender_system.py +++ b/src/main/python/systemds/examples/tutorials/movie_recommender_system.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -#------------------------------------------------------------- +# ------------------------------------------------------------- # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file @@ -18,7 +18,7 @@ # specific language governing permissions and limitations # under the License. 
# -#------------------------------------------------------------- +# ------------------------------------------------------------- import time import pandas as pd @@ -29,25 +29,26 @@ from systemds.context import SystemDSContext from systemds.operator.algorithm import als, lm, lmPredict -# To run this code, first download the MovieLens 100k dataset from +# To run this code, first download the MovieLens 100k dataset from # https://grouplens.org/datasets/movielens/100k/ and extract it to the specified folder. -data_folder = '/movie_data/ml-100k/' +data_folder = "/movie_data/ml-100k/" -def read_movie_data(n_rows: int=10000) -> pd.DataFrame: + +def read_movie_data(n_rows: int = 10000) -> pd.DataFrame: """ Reads the MovieLens 100k dataset and returns a DataFrame with the following columns: user_id, item_id, rating. - + :param n_rows: Number of rows to read from the dataset. :return: DataFrame containing the movie ratings data. """ # Load the MovieLens 100k dataset - header = ['user_id', 'item_id', 'rating', 'timestamp'] - ratings_df = pd.read_csv(data_folder + 'u.data', sep='\t', names=header) + header = ["user_id", "item_id", "rating", "timestamp"] + ratings_df = pd.read_csv(data_folder + "u.data", sep="\t", names=header) # Drop timestamp column - ratings_df = ratings_df.drop('timestamp', axis=1) + ratings_df = ratings_df.drop("timestamp", axis=1) # Only check first n_rows rows to speed up processing ratings_df = ratings_df.head(n_rows) @@ -63,15 +64,16 @@ def create_pivot_table(ratings_df: pd.DataFrame) -> pd.DataFrame: :return: Pivot table with users as rows, items as columns, and ratings as values. """ - return ratings_df.pivot(index='user_id', columns='item_id', values='rating') + return ratings_df.pivot(index="user_id", columns="item_id", values="rating") ### Cosine Similarity Functions ### + def numpy_cosine_similarity(pivot_df: pd.DataFrame) -> tuple[pd.DataFrame, float]: """ Calculates the cosine similarity between users using NumPy. - + :param pivot_df: DataFrame containing the pivot table of user-item ratings. :return: DataFrame containing the cosine similarity between users and time taken. """ @@ -94,9 +96,7 @@ def numpy_cosine_similarity(pivot_df: pd.DataFrame) -> tuple[pd.DataFrame, float # convert to DataFrame for better readability user_similarity_df = pd.DataFrame( - user_similarity, - index=pivot_df.index, - columns=pivot_df.index + user_similarity, index=pivot_df.index, columns=pivot_df.index ) return user_similarity_df, end - start @@ -105,7 +105,7 @@ def numpy_cosine_similarity(pivot_df: pd.DataFrame) -> tuple[pd.DataFrame, float def systemds_cosine_similarity(pivot_df: pd.DataFrame) -> tuple[pd.DataFrame, float]: """ Calculates the cosine similarity between users using SystemDS. - + :param pivot_df: DataFrame containing the pivot table of user-item ratings. :return: DataFrame containing the cosine similarity between users and time taken. 
""" @@ -120,7 +120,7 @@ def systemds_cosine_similarity(pivot_df: pd.DataFrame) -> tuple[pd.DataFrame, fl # Load into SystemDS X = sds.from_numpy(X_np) - # Compute L2 norms + # Compute L2 norms row_sums = (X * X).sum(axis=1) norms = row_sums.sqrt() @@ -137,9 +137,7 @@ def systemds_cosine_similarity(pivot_df: pd.DataFrame) -> tuple[pd.DataFrame, fl # Convert to DataFrame for better readability user_similarity_df = pd.DataFrame( - user_similarity, - index=pivot_df.index, - columns=pivot_df.index + user_similarity, index=pivot_df.index, columns=pivot_df.index ) return user_similarity_df, end - start @@ -168,10 +166,13 @@ def evaluate_cosine_similarity() -> None: ### Matrix Factorization Functions ### -def numpy_als(pivot_df: pd.DataFrame, rank: int, reg: float, maxi: int) -> tuple[pd.DataFrame, float]: + +def numpy_als( + pivot_df: pd.DataFrame, rank: int, reg: float, maxi: int +) -> tuple[pd.DataFrame, float]: """ Calculates a matrix R_hat using Alternating Least Squares (ALS) matrix factorization in numpy. - + :param pivot_df: DataFrame containing the pivot table of user-item ratings. :return: DataFrame containing the predicted ratings and time taken. """ @@ -181,7 +182,7 @@ def numpy_als(pivot_df: pd.DataFrame, rank: int, reg: float, maxi: int) -> tuple start = time.time() num_users, num_items = R.shape - mask = (R != 0) + mask = R != 0 # Random initialization of user and item factors P = np.random.rand(num_users, rank) * 0.01 @@ -193,12 +194,14 @@ def numpy_als(pivot_df: pd.DataFrame, rank: int, reg: float, maxi: int) -> tuple for u in range(num_users): # Get only items user 'u' actually rated - user_mask = mask[u, :] + user_mask = mask[u, :] Q_u = Q[user_mask, :] R_u = R[u, user_mask] - + if Q_u.shape[0] > 0: - P[u, :] = np.linalg.solve(np.dot(Q_u.T, Q_u) + reg * np.eye(rank), np.dot(Q_u.T, R_u)) + P[u, :] = np.linalg.solve( + np.dot(Q_u.T, Q_u) + reg * np.eye(rank), np.dot(Q_u.T, R_u) + ) # Update item factors for i in range(num_items): @@ -207,9 +210,11 @@ def numpy_als(pivot_df: pd.DataFrame, rank: int, reg: float, maxi: int) -> tuple item_mask = mask[:, i] P_i = P[item_mask, :] R_i = R[item_mask, i] - + if P_i.shape[0] > 0: - Q[i, :] = np.linalg.solve(np.dot(P_i.T, P_i) + reg * np.eye(rank), np.dot(P_i.T, R_i)) + Q[i, :] = np.linalg.solve( + np.dot(P_i.T, P_i) + reg * np.eye(rank), np.dot(P_i.T, R_i) + ) end = time.time() @@ -217,19 +222,17 @@ def numpy_als(pivot_df: pd.DataFrame, rank: int, reg: float, maxi: int) -> tuple R_hat = P @ Q.T # Convert to DataFrame for better readability - ratings_hat_df = pd.DataFrame( - R_hat, - index=pivot_df.index, - columns=pivot_df.columns - ) + ratings_hat_df = pd.DataFrame(R_hat, index=pivot_df.index, columns=pivot_df.columns) return ratings_hat_df, end - start -def systemds_als(pivot_df: pd.DataFrame, rank: int, reg: float, maxi: int) -> tuple[pd.DataFrame, float]: +def systemds_als( + pivot_df: pd.DataFrame, rank: int, reg: float, maxi: int +) -> tuple[pd.DataFrame, float]: """ Calculates a matrix R_hat using Alternating Least Squares (ALS) matrix factorization in SystemDS. - + :param pivot_df: DataFrame containing the pivot table of user-item ratings. :return: DataFrame containing the predicted ratings and time taken. 
""" @@ -249,20 +252,18 @@ def systemds_als(pivot_df: pd.DataFrame, rank: int, reg: float, maxi: int) -> tu R_hat = P @ Q # Convert to DataFrame for better readability - ratings_hat_df = pd.DataFrame( - R_hat, - index=pivot_df.index, - columns=pivot_df.columns - ) + ratings_hat_df = pd.DataFrame(R_hat, index=pivot_df.index, columns=pivot_df.columns) return ratings_hat_df, end - start -def evaluate_als(model: str = "systemds", rank: int = 10, reg: float = 1.0, maxi: int = 20) -> None: +def evaluate_als( + model: str = "systemds", rank: int = 10, reg: float = 1.0, maxi: int = 20 +) -> None: """ - Evaluates and compares the ALS computations between NumPy and SystemDS. The data is split into training + Evaluates and compares the ALS computations between NumPy and SystemDS. The data is split into training and test sets with an 80/20 ratio. Then the RMSE is calculated for both sets. - + :param model: Model to use for ALS computation ("systemds" or "numpy"). :param rank: Rank of the factorized matrices. :param reg: Regularization parameter. @@ -278,32 +279,37 @@ def evaluate_als(model: str = "systemds", rank: int = 10, reg: float = 1.0, maxi ratings_hat_df, numpy_time = numpy_als(pivot_df, rank, reg, maxi) # Print time taken - print(f"Time taken for {model} ALS: ", systemds_time if model == "systemds" else numpy_time) + print( + f"Time taken for {model} ALS: ", + systemds_time if model == "systemds" else numpy_time, + ) # Training error mask = ~np.isnan(pivot_df.values) - train_rmse = np.sqrt(np.mean((ratings_hat_df.values[mask] - pivot_df.values[mask])**2)) + train_rmse = np.sqrt( + np.mean((ratings_hat_df.values[mask] - pivot_df.values[mask]) ** 2) + ) print(f"Train RMSE for model with {model}: {train_rmse}") # Test error test_set = ratings[80000:] stacked_series = ratings_hat_df.stack() ratings_hat_long = stacked_series.reset_index() - ratings_hat_long.columns = ['user_id', 'item_id', 'rating'] + ratings_hat_long.columns = ["user_id", "item_id", "rating"] merged_df = pd.merge( - test_set, - ratings_hat_long, - on=['user_id', 'item_id'], - how='inner', - suffixes=('_actual', '_predicted') + test_set, + ratings_hat_long, + on=["user_id", "item_id"], + how="inner", + suffixes=("_actual", "_predicted"), ) # Force predictions to stay between 0.5 and 5.0 - merged_df['rating_predicted'] = merged_df['rating_predicted'].clip(0.5, 5.0) + merged_df["rating_predicted"] = merged_df["rating_predicted"].clip(0.5, 5.0) # Calculate root mean squared error (RMSE) - squared_errors = (merged_df['rating_actual'] - merged_df['rating_predicted'])**2 + squared_errors = (merged_df["rating_actual"] - merged_df["rating_predicted"]) ** 2 mse = np.mean(squared_errors) test_rmse = np.sqrt(mse) @@ -312,6 +318,7 @@ def evaluate_als(model: str = "systemds", rank: int = 10, reg: float = 1.0, maxi ### Linear Regression ### + def preprocess_data() -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: """ This function reads and preprocesses the MovieLens 100k dataset for linear regression. 
It returns four @@ -328,37 +335,91 @@ def preprocess_data() -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: """ # Read datasets - ratings_df = pd.read_csv(data_folder + 'u.data', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp']) - user_df = pd.read_csv(data_folder + 'u.user', sep='|', names=['user_id', 'age', 'gender', 'occupation', 'zip_code']) - item_df = pd.read_csv(data_folder + 'u.item', sep='|', names=[ - 'item_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', - "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', - 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'], encoding='latin-1') - + ratings_df = pd.read_csv( + data_folder + "u.data", + sep="\t", + names=["user_id", "item_id", "rating", "timestamp"], + ) + user_df = pd.read_csv( + data_folder + "u.user", + sep="|", + names=["user_id", "age", "gender", "occupation", "zip_code"], + ) + item_df = pd.read_csv( + data_folder + "u.item", + sep="|", + names=[ + "item_id", + "title", + "release_date", + "video_release_date", + "IMDb_URL", + "unknown", + "Action", + "Adventure", + "Animation", + "Children's", + "Comedy", + "Crime", + "Documentary", + "Drama", + "Fantasy", + "Film-Noir", + "Horror", + "Musical", + "Mystery", + "Romance", + "Sci-Fi", + "Thriller", + "War", + "Western", + ], + encoding="latin-1", + ) + # Turn categorical data into numerical - user_df['gender'] = user_df['gender'].apply(lambda x: 0 if x == 'F' else 1) - user_df = pd.get_dummies(user_df, columns=['occupation']) - item_df['release_date'] = pd.to_datetime(item_df['release_date'], errors='raise', format='%d-%b-%Y') - item_df['release_year'] = item_df['release_date'].dt.year + user_df["gender"] = user_df["gender"].apply(lambda x: 0 if x == "F" else 1) + user_df = pd.get_dummies(user_df, columns=["occupation"]) + item_df["release_date"] = pd.to_datetime( + item_df["release_date"], errors="raise", format="%d-%b-%Y" + ) + item_df["release_year"] = item_df["release_date"].dt.year # Normalize data - user_df['age'] = (user_df['age'] - user_df['age'].min()) / (user_df['age'].max() - user_df['age'].min()) - item_df['release_year'] = (item_df['release_year'] - item_df['release_year'].min()) / (item_df['release_year'].max() - item_df['release_year'].min()) + user_df["age"] = (user_df["age"] - user_df["age"].min()) / ( + user_df["age"].max() - user_df["age"].min() + ) + item_df["release_year"] = ( + item_df["release_year"] - item_df["release_year"].min() + ) / (item_df["release_year"].max() - item_df["release_year"].min()) # Merge datasets - merged_df = ratings_df.merge(user_df, on='user_id').merge(item_df, on='item_id') + merged_df = ratings_df.merge(user_df, on="user_id").merge(item_df, on="item_id") # Drop unnecessary columns - merged_df = merged_df.drop(['user_id', 'item_id', 'timestamp', 'zip_code', 'title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown'], axis=1) + merged_df = merged_df.drop( + [ + "user_id", + "item_id", + "timestamp", + "zip_code", + "title", + "release_date", + "video_release_date", + "IMDb_URL", + "unknown", + ], + axis=1, + ) # Convert boolean columns to integers (important for NumPy and SystemDS) - bool_cols = merged_df.select_dtypes(include=['bool']).columns + bool_cols = merged_df.select_dtypes(include=["bool"]).columns merged_df[bool_cols] = merged_df[bool_cols].astype(int) # Drop rows with NaN values merged_df = merged_df.dropna() - ratings = merged_df.pop('rating') + ratings = 
merged_df.pop("rating") features = merged_df # Split into train and test sets and convert to numpy arrays @@ -374,7 +435,9 @@ def preprocess_data() -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: return X_train, y_train, X_test, y_test -def linear_regression_pytorch(X_train, y_train, X_test, y_test, num_epochs=1000) -> tuple[float, float]: +def linear_regression_pytorch( + X_train, y_train, X_test, y_test, num_epochs=1000 +) -> tuple[float, float]: """ Trains a linear regression model using PyTorch. @@ -396,19 +459,22 @@ def linear_regression_pytorch(X_train, y_train, X_test, y_test, num_epochs=1000) # Define model n_features = X_train.shape[1] + class linearRegression(torch.nn.Module): def __init__(self): super(linearRegression, self).__init__() # input size: n_features, output size: 1 self.linear = torch.nn.Linear(n_features, 1) + def forward(self, x): out = self.linear(x) return out + lr_model = linearRegression() # Loss and optimizer criterion = torch.nn.MSELoss() - optimizer = torch.optim.SGD(lr_model.parameters(), lr = 0.01) + optimizer = torch.optim.SGD(lr_model.parameters(), lr=0.01) # Training loop for epoch in range(num_epochs): @@ -422,8 +488,8 @@ def forward(self, x): loss.backward() optimizer.step() - if (epoch+1) % 100 == 0: - print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}') + if (epoch + 1) % 100 == 0: + print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}") # Make predictions on test set lr_model.eval() @@ -441,7 +507,9 @@ def forward(self, x): return rmse, end - start -def linear_regression_systemds(X_train, y_train, X_test, y_test, num_epochs=1000) -> tuple[float, float]: +def linear_regression_systemds( + X_train, y_train, X_test, y_test, num_epochs=1000 +) -> tuple[float, float]: """ Trains a linear regression model using SystemDS. @@ -480,7 +548,7 @@ def linear_regression_systemds(X_train, y_train, X_test, y_test, num_epochs=1000 def evaluate_lr() -> None: """ - Evaluates and compares the linear regression computations between PyTorch and SystemDS. The data is split into + Evaluates and compares the linear regression computations between PyTorch and SystemDS. The data is split into training and test sets with an 80/20 ratio. Then the RMSE is calculated for both sets. """ @@ -488,8 +556,12 @@ def evaluate_lr() -> None: X_train, y_train, X_test, y_test = preprocess_data() - pytorch_rmse, pytorch_time = linear_regression_pytorch(X_train, y_train, X_test, y_test, num_epochs=1000) - systemds_rmse, systemds_time = linear_regression_systemds(X_train, y_train, X_test, y_test, num_epochs=1000) + pytorch_rmse, pytorch_time = linear_regression_pytorch( + X_train, y_train, X_test, y_test, num_epochs=1000 + ) + systemds_rmse, systemds_time = linear_regression_systemds( + X_train, y_train, X_test, y_test, num_epochs=1000 + ) print(f"PyTorch RMSE: {pytorch_rmse}, Time: {pytorch_time} seconds") print(f"SystemDS RMSE: {systemds_rmse}, Time: {systemds_time} seconds") @@ -499,7 +571,7 @@ def evaluate_lr() -> None: # Cosine Similarity evaluate_cosine_similarity() - + # Matrix Factorization using ALS evaluate_als("systemds") evaluate_als("numpy")