From ad50f369033a65d0c73776e3d6921ce0f855ebc4 Mon Sep 17 00:00:00 2001 From: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> Date: Wed, 16 Jul 2025 17:25:15 +0200 Subject: [PATCH 1/3] fix: jupyterhub-pyspark-hdfs-anomaly-detection-taxi-data dev install --- stacks/jupyterhub-pyspark-hdfs/spark_connect.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/stacks/jupyterhub-pyspark-hdfs/spark_connect.yaml b/stacks/jupyterhub-pyspark-hdfs/spark_connect.yaml index 4e88bfcc..5fe372b2 100644 --- a/stacks/jupyterhub-pyspark-hdfs/spark_connect.yaml +++ b/stacks/jupyterhub-pyspark-hdfs/spark_connect.yaml @@ -53,8 +53,9 @@ spec: - name: hdfs-discovery-configmap configMap: name: hdfs - config: + roleConfig: listenerClass: external-unstable + config: resources: memory: limit: "2Gi" From 71b138426d9d1f19dd6256a2d111350173aa307b Mon Sep 17 00:00:00 2001 From: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> Date: Wed, 16 Jul 2025 18:06:23 +0200 Subject: [PATCH 2/3] docs: remove dead link --- .../jupyterhub-pyspark-hdfs-anomaly-detection-taxi-data.adoc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/modules/demos/pages/jupyterhub-pyspark-hdfs-anomaly-detection-taxi-data.adoc b/docs/modules/demos/pages/jupyterhub-pyspark-hdfs-anomaly-detection-taxi-data.adoc index 138dac25..592d2c34 100644 --- a/docs/modules/demos/pages/jupyterhub-pyspark-hdfs-anomaly-detection-taxi-data.adoc +++ b/docs/modules/demos/pages/jupyterhub-pyspark-hdfs-anomaly-detection-taxi-data.adoc @@ -3,7 +3,6 @@ :scikit-lib: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html :k8s-cpu: https://kubernetes.io/docs/tasks/debug/debug-cluster/resource-metrics-pipeline/#cpu :spark-pkg: https://spark.apache.org/docs/latest/api/python/user_guide/python_packaging.html -:forest-article: https://towardsdatascience.com/isolation-forest-and-spark-b88ade6c63ff :pyspark: 
https://spark.apache.org/docs/latest/api/python/getting_started/index.html :forest-algo: https://cs.nju.edu.cn/zhouzh/zhouzh.files/publication/icdm08b.pdf :nyc-taxi: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page @@ -133,7 +132,7 @@ In practice, clients of Spark Connect do not need a full-blown Spark installatio == Model details The job uses an implementation of the Isolation Forest {forest-algo}[algorithm] provided by the scikit-learn {scikit-lib}[library]: -the model is trained and then invoked by a user-defined function (see {forest-article}[this article] for how to call the sklearn library with a pyspark UDF), all of which is run using the Spark Connect executors. +the model is trained and then invoked by a user-defined function running on the Spark Connect executors. This type of model attempts to isolate each data point by continually partitioning the data. Data closely packed together will require more partitions to separate data points. In contrast, any outliers will require less: the number of partitions needed for a particular data point is thus inversely proportional to the anomaly "score". 
From 766e6c290e5597504c364e4b59401ef620b67a96 Mon Sep 17 00:00:00 2001 From: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> Date: Wed, 16 Jul 2025 20:52:14 +0200 Subject: [PATCH 3/3] fix: point notebook at the spark-connect-server service --- stacks/jupyterhub-pyspark-hdfs/notebook.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stacks/jupyterhub-pyspark-hdfs/notebook.ipynb b/stacks/jupyterhub-pyspark-hdfs/notebook.ipynb index acb9b431..3e81c879 100644 --- a/stacks/jupyterhub-pyspark-hdfs/notebook.ipynb +++ b/stacks/jupyterhub-pyspark-hdfs/notebook.ipynb @@ -27,14 +27,14 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "spark = (\n", " SparkSession\n", " .builder\n", - " .remote(\"sc://spark-connect-server-default:15002\")\n", + " .remote(\"sc://spark-connect-server:15002\")\n", " .appName(\"taxi-data-anomaly-detection\")\n", " .getOrCreate()\n", ")"