From b635db526c074cb563c526f6e818c0e8bd785555 Mon Sep 17 00:00:00 2001 From: Rich Scott Date: Mon, 17 Nov 2025 16:56:59 -0700 Subject: [PATCH 01/23] Start adding support for running against remote Armada cluster server. These changes are to allow running an armada-spark client against a remote (i.e. non-localhost) Armada cluster server. Signed-off-by: Rich Scott --- README.md | 12 +++++ scripts/dev-e2e.sh | 19 ++++--- scripts/init.sh | 5 ++ scripts/set-version.sh | 25 ++++++++-- .../deploy/armada/submit/ArmadaUtils.scala | 7 ++- .../deploy/armada/e2e/ArmadaOperations.scala | 5 +- .../deploy/armada/e2e/ArmadaSparkE2E.scala | 13 +++-- .../spark/deploy/armada/e2e/K8sClient.scala | 49 +++++++++++++++++-- 8 files changed, 115 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index f0d040c4..8d77ecfd 100644 --- a/README.md +++ b/README.md @@ -80,6 +80,18 @@ Run the following command to load the Armada Spark image into your local kind cl kind load docker-image $IMAGE_NAME --name armada ``` +### Running a remote Armada server using Armada Operator +The default Armada Operator setup allows only localhost access. You can quickly set up a local Armada server +configured to allow external access from other hosts, useful for client development and testing. For this +configuration, +- Copy the file `e2e/kind-config-external-access.yaml` in this repository to `hack/kind-config.yaml` +in your `armada-operator` repository. +- Edit the newly-copied `hack/kind-config.yaml` as noted in the beginning comments of that file. +- Run the armada-operator setup commands (usually `make kind-all`) to create and start your Armada instance. + +Then copy the `$HOME/.kube/config` and `$HOME/.armadctl.yaml` (that Armada Operator will generate) from the Armada +server host to your `$HOME` directory on the client (local) host. 
+ --- ## Development diff --git a/scripts/dev-e2e.sh b/scripts/dev-e2e.sh index 69d73c3f..6b40e7a6 100755 --- a/scripts/dev-e2e.sh +++ b/scripts/dev-e2e.sh @@ -161,16 +161,21 @@ run-test() { cd "$scripts/.." # Run the Scala E2E test suite - mvn scalatest:test -Dsuites="org.apache.spark.deploy.armada.e2e.ArmadaSparkE2E" \ + # env MAVEN_OPTS='-Dcom.sun.net.ssl.checkRevocation=false' + env KUBERNETES_TRUST_CERTIFICATES=true \ + mvn -e scalatest:test -Dsuites="org.apache.spark.deploy.armada.e2e.ArmadaSparkE2E" \ -Dcontainer.image="$IMAGE_NAME" \ -Dscala.version="$SCALA_VERSION" \ -Dscala.binary.version="$SCALA_BIN_VERSION" \ -Dspark.version="$SPARK_VERSION" \ -Darmada.queue="$ARMADA_QUEUE" \ - -Darmada.master="armada://localhost:30002" \ - -Darmada.lookout.url="http://localhost:30000" \ - -Darmadactl.path="$scripts/armadactl" 2>&1 | \ - tee e2e-test.log + -Darmada.master="armada://$ARMADA_MASTER" \ + -Darmada.lookout.url="$ARMADA_LOOKOUT_URL" \ + -Darmadactl.path="$scripts/armadactl" \ + -Dclient_cert_file="$CLIENT_CERT_FILE" \ + -Dclient_key_file="$CLIENT_KEY_FILE" \ + -Dcluster_ca_file="$CLUSTER_CA_FILE" \ + 2>&1 | tee e2e-test.log TEST_EXIT_CODE=${PIPESTATUS[0]} @@ -183,8 +188,8 @@ run-test() { } main() { - init-cluster + # init-cluster run-test } -main \ No newline at end of file +main diff --git a/scripts/init.sh b/scripts/init.sh index 91ab121c..68e0a1a4 100644 --- a/scripts/init.sh +++ b/scripts/init.sh @@ -59,6 +59,11 @@ export ARMADA_AUTH_TOKEN=${ARMADA_AUTH_TOKEN:-} export SCALA_CLASS="${SCALA_CLASS:-org.apache.spark.examples.SparkPi}" export RUNNING_E2E_TESTS="${RUNNING_E2E_TESTS:-false}" +# Client certificate for TLS +export CLIENT_CERT_FILE=${CLIENT_CERT_FILE} +export CLIENT_KEY_FILE=${CLIENT_KEY_FILE} +export CLUSTER_CA_FILE=${CLUSTER_CA_FILE} + if [ -z "${PYTHON_SCRIPT:-}" ]; then PYTHON_SCRIPT="/opt/spark/examples/src/main/python/pi.py" else diff --git a/scripts/set-version.sh b/scripts/set-version.sh index 283ea44a..916fa608 100755 --- 
a/scripts/set-version.sh +++ b/scripts/set-version.sh @@ -1,6 +1,24 @@ #!/bin/bash -root="$(cd "$(dirname "$0")/.."; pwd)" +root="$(cd "$(dirname "$0")/.." || exit; pwd)" +SED="sed" +OS=$(uname -s) + +# The sed that macOS ships does not understand all the regex patterns (which +# we use) that GNU sed does, so look for 'gsed' and use that, if available. +if [ "$OS" = 'Darwin' ]; then + sed_location=$(type -p $SED) + if [ "$sed_location" = '/usr/bin/sed' ]; then + type -p gsed > /dev/null + if [ $? -eq 0 ]; then + SED=gsed + else + echo "$0: the version of sed on this system ($sed_location) does not handle" > /dev/stderr + echo "all the GNU sed extensions needed. Please install 'gsed' and re-run this script" > /dev/stderr + exit 1 + fi + fi +fi if [ $# -eq 2 ] then @@ -33,7 +51,7 @@ then fi echo "setting spark=$spark and scala=$scala" - sed -i -E \ + $SED -i -E \ -e "s%^( )([^_]+)[_0-9.]+()$%\1\2_${scala_compat}\3%" \ -e "s%^( ).+()$%\1${scala_major}\2%" \ -e "s%^( ).+()$%\1${scala_minor}\2%" \ @@ -45,7 +63,8 @@ then -e "s%^( ).+()$%\1${jackson_version}\2%" \ "$root/pom.xml" else - echo "Provide the Spark and Scala version to set" + echo "Provide the Spark and Scala version to set; for example:" + echo " $0 3.5.5 2.13.5" exit 1 fi diff --git a/src/main/scala/org/apache/spark/deploy/armada/submit/ArmadaUtils.scala b/src/main/scala/org/apache/spark/deploy/armada/submit/ArmadaUtils.scala index c228826e..b6d36731 100644 --- a/src/main/scala/org/apache/spark/deploy/armada/submit/ArmadaUtils.scala +++ b/src/main/scala/org/apache/spark/deploy/armada/submit/ArmadaUtils.scala @@ -26,8 +26,13 @@ object ArmadaUtils { import ArmadaUtilsExceptions._ def parseMasterUrl(masterUrl: String): (String, Int) = { + val startString = if (masterUrl.startsWith("local")) { + "local://armada://" + } else { + "armada://" + } Some(masterUrl) - .map(_.substring("armada://".length).split(":").toSeq) + .map(_.substring(startString.length).split(":").toSeq) .filter(_.length == 2) .map { case 
Seq(host: String, portString: String) => (host, Try(portString.toInt).getOrElse(-1)) diff --git a/src/test/scala/org/apache/spark/deploy/armada/e2e/ArmadaOperations.scala b/src/test/scala/org/apache/spark/deploy/armada/e2e/ArmadaOperations.scala index 5d0c1e0e..fef33093 100644 --- a/src/test/scala/org/apache/spark/deploy/armada/e2e/ArmadaOperations.scala +++ b/src/test/scala/org/apache/spark/deploy/armada/e2e/ArmadaOperations.scala @@ -213,7 +213,10 @@ class ArmadaClient(armadaUrl: String = "localhost:30002") { val armadactlCmd = resolveArmadactlPath.getOrElse { throw new RuntimeException("armadactl not found in system properties or PATH") } - Seq(armadactlCmd) ++ subCommand.split(" ") ++ Seq("--armadaUrl", armadaUrl) + // armadactl command expects the server address to be of the form + // : with no pseudo-protocol prefix + var armadactlUrl = armadaUrl.replaceFirst("^armada://", "") + Seq(armadactlCmd) ++ subCommand.split(" ") ++ Seq("--armadaUrl", armadactlUrl) } /** Resolves the path to `armadactl`: diff --git a/src/test/scala/org/apache/spark/deploy/armada/e2e/ArmadaSparkE2E.scala b/src/test/scala/org/apache/spark/deploy/armada/e2e/ArmadaSparkE2E.scala index 59681883..491f25e0 100644 --- a/src/test/scala/org/apache/spark/deploy/armada/e2e/ArmadaSparkE2E.scala +++ b/src/test/scala/org/apache/spark/deploy/armada/e2e/ArmadaSparkE2E.scala @@ -41,9 +41,9 @@ class ArmadaSparkE2E private val baseQueueName = "e2e-template" - private lazy val armadaClient = new ArmadaClient() - private lazy val k8sClient = new K8sClient() - private lazy val orchestrator = new TestOrchestrator(armadaClient, k8sClient) + private var armadaClient: ArmadaClient = _ + private var k8sClient: K8sClient = _ + implicit private var orch: TestOrchestrator = _ private var baseConfig: TestConfig = _ @@ -52,6 +52,11 @@ class ArmadaSparkE2E val props = loadProperties() + val armadaApiUrl = props.getProperty("armada.master", "localhost:30002") + armadaClient = new ArmadaClient(armadaApiUrl) + k8sClient 
= new K8sClient(props) + orch = new TestOrchestrator(armadaClient, k8sClient) + // Get Scala binary version - either from system property or derive from full version // This should be "2.12" or "2.13", not the full version like "2.12.15" val scalaBinaryVersion = props.getProperty("scala.binary.version") match { @@ -135,8 +140,6 @@ class ArmadaSparkE2E ) } - implicit val orch: TestOrchestrator = orchestrator - test("Basic SparkPi job with gang scheduling", E2ETest) { E2ETestBuilder("basic-spark-pi-gang") .withBaseConfig(baseConfig) diff --git a/src/test/scala/org/apache/spark/deploy/armada/e2e/K8sClient.scala b/src/test/scala/org/apache/spark/deploy/armada/e2e/K8sClient.scala index cb53f68e..7a536d51 100644 --- a/src/test/scala/org/apache/spark/deploy/armada/e2e/K8sClient.scala +++ b/src/test/scala/org/apache/spark/deploy/armada/e2e/K8sClient.scala @@ -18,18 +18,61 @@ package org.apache.spark.deploy.armada.e2e import org.apache.spark.deploy.armada.Config -import io.fabric8.kubernetes.client.{DefaultKubernetesClient, KubernetesClient} +import org.apache.spark.deploy.armada.submit +import io.fabric8.kubernetes.client.{ + ConfigBuilder, + DefaultKubernetesClient, + KubernetesClient, + KubernetesClientBuilder +} import io.fabric8.kubernetes.api.model.{NamespaceBuilder, Pod} import io.fabric8.kubernetes.api.model.networking.v1.Ingress import java.util.concurrent.TimeoutException +import java.util.Properties import scala.jdk.CollectionConverters._ import scala.concurrent.duration._ import scala.concurrent.{ExecutionContext, Future} /** Kubernetes client implementation using fabric8 Kubernetes client library. 
*/ -class K8sClient { - private val client: KubernetesClient = new DefaultKubernetesClient() +class K8sClient(props: Properties) { + val armadaMaster: String = props.getProperty("armada.master") + val pattern = """armada://([^:]+):.*""".r + val k8sApiURL: String = pattern.replaceAllIn(armadaMaster, "https://$1:6443") + + println(s"************************************************************************************") + println(s"armadaMaster= $armadaMaster\n\n") + println(s"k8sApiURL = $k8sApiURL\n\n") + + val clientCertFile: String = props.getProperty("client_cert_file", "") + val clientKeyFile: String = props.getProperty("client_key_file", "") + val clusterCaFile: String = props.getProperty("cluster_ca_file", "") + + var cb: ConfigBuilder = new ConfigBuilder() + // .withMasterUrl("https://api.sandbox.x8i5.example.com:6443") + .withMasterUrl(k8sApiURL) + // .withOauthToken("sha256~secret") + .withNamespace("default") + + if (clusterCaFile.nonEmpty) { + cb = cb.withCaCertFile(clusterCaFile) + } + + if (clientCertFile.nonEmpty) { + cb = cb.withClientCertFile(clientCertFile) + } + + if (clientKeyFile.nonEmpty) { + cb = cb.withClientKeyFile(clientKeyFile) + } + + val cfg = cb + .withClientKeyAlgo("RSA") + .build() + + private val client: KubernetesClient = new KubernetesClientBuilder() + .withConfig(cfg) + .build() def createNamespace(name: String)(implicit ec: ExecutionContext): Future[Unit] = Future { val namespace = new NamespaceBuilder() From b14a8e884499659dee0b817bf9e9924e6f53daf0 Mon Sep 17 00:00:00 2001 From: Rich Scott Date: Wed, 19 Nov 2025 16:46:25 -0700 Subject: [PATCH 02/23] Add script for getting TLS cert files for remote server Also, fixes in TestOrchestrator for running against a remote Armada instance, and run the tests directly, instead of using a Docker container on the client. 
Signed-off-by: Rich Scott --- e2e/extract-kind-cert.sh | 22 ++++++++ e2e/kind-config-external-access.yaml | 52 +++++++++++++++++++ .../deploy/armada/e2e/TestOrchestrator.scala | 46 +++++++--------- 3 files changed, 92 insertions(+), 28 deletions(-) create mode 100755 e2e/extract-kind-cert.sh create mode 100644 e2e/kind-config-external-access.yaml diff --git a/e2e/extract-kind-cert.sh b/e2e/extract-kind-cert.sh new file mode 100755 index 00000000..2644e418 --- /dev/null +++ b/e2e/extract-kind-cert.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +CONTEXT="kind-armada" + +# What These Files Are +# - client.crt: Your user (client) certificate +# - client.key: The private key associated with the certificate +# - ca.crt: The CA certificate used by the Kubernetes API server (for verifying client and server certs) + + +# Extract the client certificate +kubectl config view --raw -o json | jq -r \ + ".users[] | select(.name == \"${CONTEXT}\") | .user.[\"client-certificate-data\"]" | base64 -d > client.crt + +# Extract the client key +kubectl config view --raw -o json | jq -r \ + ".users[] | select(.name == \"${CONTEXT}\") | .user.[\"client-key-data\"]" | base64 -d > client.key + +# Extract the cluster CA certificate +kubectl config view --raw -o json | jq -r \ + ".clusters[] | select(.name == \"${CONTEXT}\") | .cluster.[\"certificate-authority-data\"]" | base64 -d > ca.crt + diff --git a/e2e/kind-config-external-access.yaml b/e2e/kind-config-external-access.yaml new file mode 100644 index 00000000..131bc28e --- /dev/null +++ b/e2e/kind-config-external-access.yaml @@ -0,0 +1,52 @@ +# A kind configuration for running an Armada server that can be accessed +# outside the host system, for working/developing with remote clients, +# such as Armada-Spark clients. +# +# This configuration will allow you to run kubectl and armadactl +# against the Armada instance on this system. 
To use this: +# - Copy your $HOME/.kube/config on this system to the same directory +# on your remote client host, then modify that copied file so the +# IP address in there (0.0.0.0) is the address of the external interface +# mentioned below. +# - Copy your $HOME/.armadactl.yaml to your $HOME directory on the remote +# client host, in that copied file, change the value of the 'armadaUrl' +# field from 'localhost' to the hostname (or IP address) of this server, +# and below that line a new line (at same indent level), add the entry +# forceNoTls: true +# You should then be able to run `kubectl cluster-info` or +# `armadactl get queues` without errors on the remote client host. +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +nodes: +- role: control-plane + kubeadmConfigPatches: + - | + kind: ClusterConfiguration + apiServer: + certSANs: + - localhost + - 127.0.0.1 + # replace the following line with the IP address + # of the external interface on this system + - 192.168.12.135 + - 0.0.0.0 + extraPortMappings: + # Lookout UI + - containerPort: 30000 + hostPort: 30000 + protocol: TCP + # Armada Server REST API + - containerPort: 30001 + hostPort: 30001 + protocol: TCP + # Armada Server gRPC API + - containerPort: 30002 + hostPort: 30002 + protocol: TCP + # Kubernetes API + - containerPort: 6443 + hostPort: 6443 + protocol: TCP +- role: worker + labels: + armada-spark: true diff --git a/src/test/scala/org/apache/spark/deploy/armada/e2e/TestOrchestrator.scala b/src/test/scala/org/apache/spark/deploy/armada/e2e/TestOrchestrator.scala index 1bd415ca..5ae2f44c 100644 --- a/src/test/scala/org/apache/spark/deploy/armada/e2e/TestOrchestrator.scala +++ b/src/test/scala/org/apache/spark/deploy/armada/e2e/TestOrchestrator.scala @@ -191,6 +191,7 @@ class TestOrchestrator( println(s"Test ID: ${context.testId}") println(s"Namespace: ${context.namespace}") println(s"Queue: $queueName") + println(s"MasterURL: ${config.masterUrl}") val resultFuture = for { _ <- 
k8sClient.createNamespace(context.namespace) @@ -249,7 +250,6 @@ class TestOrchestrator( val appResource = config.pythonScript.getOrElse( s"local:///opt/spark/examples/jars/spark-examples_${config.scalaVersion}-${config.sparkVersion}.jar" ) - val volumeMounts = buildVolumeMounts() val contextLabelString = context.labels.iterator.map { case (k, v) => s"$k=$v" }.mkString(",") val mergedLabels = config.sparkConfs @@ -262,9 +262,8 @@ class TestOrchestrator( "spark.armada.scheduling.namespace" -> context.namespace ) - val dockerCommand = buildDockerCommand( + val runTestCommand = buildRunTestCommand( config.imageName, - volumeMounts, config.masterUrl, testName, queueName, @@ -275,7 +274,7 @@ class TestOrchestrator( config.pythonScript ) - println(s"\n[SUBMIT] Submitting Spark job via Docker:") + println(s"\n[SUBMIT] Submitting Spark job:") println(s"[SUBMIT] Queue: $queueName") println(s"[SUBMIT] JobSetId: $jobSetId") println(s"[SUBMIT] Namespace: ${context.namespace}") @@ -288,7 +287,7 @@ class TestOrchestrator( println(s"[SUBMIT] $key = $displayValue") } // Properly escape command for shell reproduction - val escapedCommand = dockerCommand.map { arg => + val escapedCommand = runTestCommand.map { arg => if (arg.contains(" ") || arg.contains("'") || arg.contains("\"")) { "'" + arg.replace("'", "'\\''") + "'" } else arg @@ -297,7 +296,7 @@ class TestOrchestrator( @tailrec def attemptSubmit(attempt: Int = 1): ProcessResult = { - val result = ProcessExecutor.executeWithResult(dockerCommand, jobSubmitTimeout) + val result = ProcessExecutor.executeWithResult(runTestCommand, jobSubmitTimeout) if (result.exitCode != 0) { val allOutput = result.stdout + "\n" + result.stderr @@ -463,9 +462,8 @@ class TestOrchestrator( TestResult(jobSetId, queueName, finalStatus, assertionResults) } - private def buildDockerCommand( + private def buildRunTestCommand( imageName: String, - volumeMounts: Seq[String], masterUrl: String, testName: String, queueName: String, @@ -475,15 +473,18 @@ class 
TestOrchestrator( lookoutUrl: String, pythonScript: Option[String] ): Seq[String] = { + val sparkRepoCopy = ".spark-3.5.5" + + val classPathEntries: Seq[String] = Seq( + ".", + s"${sparkRepoCopy}/assembly/target/scala-2.13/jars/*", + "./target/armada-cluster-manager_2.13-1.0.0-SNAPSHOT-all.jar" + ) + val baseCommand = Seq( - "docker", - "run", - "--rm", - "--network", - "host" - ) ++ volumeMounts ++ Seq( - imageName, - "/opt/spark/bin/spark-class", + s"${sparkRepoCopy}/bin/spark-class", + "-cp", + classPathEntries.mkString(":"), "org.apache.spark.deploy.ArmadaSparkSubmit", "--master", masterUrl, @@ -514,7 +515,7 @@ class TestOrchestrator( "spark.armada.executor.request.cores" -> "100m", "spark.armada.executor.request.memory" -> "510Mi", "spark.local.dir" -> "/tmp", - "spark.home" -> "/opt/spark", + "spark.home" -> sparkRepoCopy, "spark.driver.extraJavaOptions" -> "-XX:-UseContainerSupport", "spark.executor.extraJavaOptions" -> "-XX:-UseContainerSupport" ) @@ -526,15 +527,4 @@ class TestOrchestrator( commandWithApp ++ confArgs ++ Seq(appResource, "100") } - - private def buildVolumeMounts(): Seq[String] = { - val userDir = System.getProperty("user.dir") - val e2eDir = new File(s"$userDir/src/test/resources/e2e") - - if (e2eDir.exists() && e2eDir.isDirectory) { - Seq("-v", s"${e2eDir.getAbsolutePath}:/opt/spark/work-dir/src/test/resources/e2e:ro") - } else { - Seq.empty - } - } } From 7d3a4568265d9c55cb26687649988cbc2d4686d5 Mon Sep 17 00:00:00 2001 From: Rich Scott Date: Thu, 20 Nov 2025 11:34:25 -0700 Subject: [PATCH 03/23] Move K8sClient class from src/test into src/main It will soon be used by ArmadaClientApplication. 
Signed-off-by: Rich Scott --- .../scala/org/apache/spark/deploy/armada}/K8sClient.scala | 2 +- .../org/apache/spark/deploy/armada/e2e/ArmadaSparkE2E.scala | 2 ++ .../org/apache/spark/deploy/armada/e2e/TestAssertions.scala | 1 + .../org/apache/spark/deploy/armada/e2e/TestOrchestrator.scala | 1 + 4 files changed, 5 insertions(+), 1 deletion(-) rename src/{test/scala/org/apache/spark/deploy/armada/e2e => main/scala/org/apache/spark/deploy/armada}/K8sClient.scala (99%) diff --git a/src/test/scala/org/apache/spark/deploy/armada/e2e/K8sClient.scala b/src/main/scala/org/apache/spark/deploy/armada/K8sClient.scala similarity index 99% rename from src/test/scala/org/apache/spark/deploy/armada/e2e/K8sClient.scala rename to src/main/scala/org/apache/spark/deploy/armada/K8sClient.scala index 7a536d51..a7dc30cf 100644 --- a/src/test/scala/org/apache/spark/deploy/armada/e2e/K8sClient.scala +++ b/src/main/scala/org/apache/spark/deploy/armada/K8sClient.scala @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.spark.deploy.armada.e2e +package org.apache.spark.deploy.armada import org.apache.spark.deploy.armada.Config import org.apache.spark.deploy.armada.submit diff --git a/src/test/scala/org/apache/spark/deploy/armada/e2e/ArmadaSparkE2E.scala b/src/test/scala/org/apache/spark/deploy/armada/e2e/ArmadaSparkE2E.scala index 491f25e0..0b515c69 100644 --- a/src/test/scala/org/apache/spark/deploy/armada/e2e/ArmadaSparkE2E.scala +++ b/src/test/scala/org/apache/spark/deploy/armada/e2e/ArmadaSparkE2E.scala @@ -17,6 +17,8 @@ package org.apache.spark.deploy.armada.e2e +import org.apache.spark.deploy.armada.K8sClient + import org.scalatest.BeforeAndAfterAll import org.scalatest.funsuite.AnyFunSuite import org.scalatest.matchers.should.Matchers diff --git a/src/test/scala/org/apache/spark/deploy/armada/e2e/TestAssertions.scala b/src/test/scala/org/apache/spark/deploy/armada/e2e/TestAssertions.scala index 5fc6b008..a539445c 100644 --- a/src/test/scala/org/apache/spark/deploy/armada/e2e/TestAssertions.scala +++ b/src/test/scala/org/apache/spark/deploy/armada/e2e/TestAssertions.scala @@ -18,6 +18,7 @@ package org.apache.spark.deploy.armada.e2e import io.fabric8.kubernetes.api.model.Pod +import org.apache.spark.deploy.armada.K8sClient import scala.jdk.CollectionConverters._ import scala.concurrent.{ExecutionContext, Future} diff --git a/src/test/scala/org/apache/spark/deploy/armada/e2e/TestOrchestrator.scala b/src/test/scala/org/apache/spark/deploy/armada/e2e/TestOrchestrator.scala index 5ae2f44c..32d9b023 100644 --- a/src/test/scala/org/apache/spark/deploy/armada/e2e/TestOrchestrator.scala +++ b/src/test/scala/org/apache/spark/deploy/armada/e2e/TestOrchestrator.scala @@ -20,6 +20,7 @@ package org.apache.spark.deploy.armada.e2e import java.io.File import java.util.UUID import java.util.concurrent.TimeoutException +import org.apache.spark.deploy.armada.K8sClient import scala.concurrent.{ExecutionContext, Future} import scala.concurrent.duration._ import 
TestConstants._ From aa1e144f5e9ec6e3b7266fb20ce323787ab6b9a9 Mon Sep 17 00:00:00 2001 From: Rich Scott Date: Fri, 21 Nov 2025 13:08:53 -0700 Subject: [PATCH 04/23] Remove debugging messages; re-enable init-cluster on e2e script. Signed-off-by: Rich Scott --- scripts/dev-e2e.sh | 2 +- .../scala/org/apache/spark/deploy/armada/K8sClient.scala | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/scripts/dev-e2e.sh b/scripts/dev-e2e.sh index 6b40e7a6..8cfe9690 100755 --- a/scripts/dev-e2e.sh +++ b/scripts/dev-e2e.sh @@ -188,7 +188,7 @@ run-test() { } main() { - # init-cluster + init-cluster run-test } diff --git a/src/main/scala/org/apache/spark/deploy/armada/K8sClient.scala b/src/main/scala/org/apache/spark/deploy/armada/K8sClient.scala index a7dc30cf..fb0fc2e9 100644 --- a/src/main/scala/org/apache/spark/deploy/armada/K8sClient.scala +++ b/src/main/scala/org/apache/spark/deploy/armada/K8sClient.scala @@ -40,16 +40,11 @@ class K8sClient(props: Properties) { val pattern = """armada://([^:]+):.*""".r val k8sApiURL: String = pattern.replaceAllIn(armadaMaster, "https://$1:6443") - println(s"************************************************************************************") - println(s"armadaMaster= $armadaMaster\n\n") - println(s"k8sApiURL = $k8sApiURL\n\n") - val clientCertFile: String = props.getProperty("client_cert_file", "") val clientKeyFile: String = props.getProperty("client_key_file", "") val clusterCaFile: String = props.getProperty("cluster_ca_file", "") var cb: ConfigBuilder = new ConfigBuilder() - // .withMasterUrl("https://api.sandbox.x8i5.example.com:6443") .withMasterUrl(k8sApiURL) // .withOauthToken("sha256~secret") .withNamespace("default") From c538b1be72521c63519bbb1e27901698975736dc Mon Sep 17 00:00:00 2001 From: Rich Scott Date: Fri, 21 Nov 2025 14:04:40 -0700 Subject: [PATCH 05/23] In E2E script, if Armada server is not localhost, don't do certain startup tasks Signed-off-by: Rich Scott --- scripts/dev-e2e.sh | 15 
++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/scripts/dev-e2e.sh b/scripts/dev-e2e.sh index 8cfe9690..c3d0cab5 100755 --- a/scripts/dev-e2e.sh +++ b/scripts/dev-e2e.sh @@ -140,14 +140,19 @@ init-cluster() { mkdir -p "$scripts/.tmp" - TMPDIR="$scripts/.tmp" "$AOHOME/bin/tooling/kind" load docker-image "$IMAGE_NAME" --name armada 2>&1 \ - | log_group "Loading Docker image $IMAGE_NAME into Armada cluster"; + if [ "$ARMADA_MASTER" = "localhost" ] ; then + TMPDIR="$scripts/.tmp" "$AOHOME/bin/tooling/kind" load docker-image "$IMAGE_NAME" --name armada 2>&1 \ + | log_group "Loading Docker image $IMAGE_NAME into Armada cluster"; + fi # configure the defaults for the e2e test - cp $scripts/../e2e/spark-defaults.conf $scripts/../conf/spark-defaults.conf + cp "$scripts/../e2e/spark-defaults.conf" "$scripts/../conf/spark-defaults.conf" - log "Waiting 60 seconds for Armada to stabilize ..." - sleep 60 + # If using a remote Armada server, assume it is already running and ready + if [ "$ARMADA_MASTER" = "localhost" ] ; then + log "Waiting 60 seconds for Armada to stabilize ..." + sleep 60 + fi } run-test() { From 0c06f07542f68719525d21616a9cd3885ee3c668 Mon Sep 17 00:00:00 2001 From: Rich Scott Date: Fri, 21 Nov 2025 16:16:30 -0700 Subject: [PATCH 06/23] Add more README.md content on setting up for using a remote Armada server. Signed-off-by: Rich Scott --- README.md | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 8d77ecfd..4011b1b5 100644 --- a/README.md +++ b/README.md @@ -83,14 +83,47 @@ kind load docker-image $IMAGE_NAME --name armada ### Running a remote Armada server using Armada Operator The default Armada Operator setup allows only localhost access. You can quickly set up a local Armada server configured to allow external access from other hosts, useful for client development and testing. 
For this -configuration, +configuration: + - Copy the file `e2e/kind-config-external-access.yaml` in this repository to `hack/kind-config.yaml` in your `armada-operator` repository. + - Edit the newly-copied `hack/kind-config.yaml` as noted in the beginning comments of that file. + - Run the armada-operator setup commands (usually `make kind-all`) to create and start your Armada instance. -Then copy the `$HOME/.kube/config` and `$HOME/.armadctl.yaml` (that Armada Operator will generate) from the Armada -server host to your `$HOME` directory on the client (local) host. +- Copy the `$HOME/.kube/config` and `$HOME/.armadactl.yaml` (that Armada Operator will generate) from the Armada +server host to your `$HOME` directory on the client (local) host. Then edit the local `.kube/config` and on +the line that has `server: https://0.0.0.0:6443`, change the `0.0.0.0` address to the IP address or hostname +of the remote Armada server system. + +- Generate a copy of the client TLS key, cert, and CA-cert files: (1) go into the `e2e` subdirectory, and +run `./extract-kind-cert.sh` - it will generate `client.crt`, `client.key`, and `ca.crt`, from the output +of `kubectl config view`. These files can be left in this directory. + +- Copy the `$HOME/.armadactl.yaml` from the Armada server host to your home directory on your client system. + +- You should then be able to run `kubectl get pods -A` and see a list of the running pods on the remote +Armada server, as well as running `armadactl get queues`. + +- Verify the functionality of your setup by editing `scripts/config.sh` and changing the following line: +``` +ARMADA_MASTER=armada://192.168.12.135:30002 +``` +to the IP address or hostname of your Armada server. You should not need to change the port number.
+ +Also, set the location of the three TLS certificate files by adding/setting: +``` +CLIENT_CERT_FILE=e2e/client.crt +CLIENT_KEY_FILE=e2e/client.key +CLUSTER_CA_FILE=e2e/ca.crt +``` + +- You should be able to now verify the armada-spark configuration by running the E2E tests: +``` +$ ./scripts/dev-e2e.sh +``` +This will save its output to `e2e-test.log` for further debugging. --- From fee45527116b5858958d8ebd0fd8d811f813aa50 Mon Sep 17 00:00:00 2001 From: Rich Scott Date: Mon, 24 Nov 2025 08:58:27 -0700 Subject: [PATCH 07/23] Quote evaluated script vars, for CI checker Signed-off-by: Rich Scott --- scripts/init.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/init.sh b/scripts/init.sh index 835ae885..77bdde90 100644 --- a/scripts/init.sh +++ b/scripts/init.sh @@ -64,9 +64,9 @@ export SCALA_CLASS="${SCALA_CLASS:-org.apache.spark.examples.SparkPi}" export RUNNING_E2E_TESTS="${RUNNING_E2E_TESTS:-false}" # Client certificate for TLS -export CLIENT_CERT_FILE=${CLIENT_CERT_FILE} -export CLIENT_KEY_FILE=${CLIENT_KEY_FILE} -export CLUSTER_CA_FILE=${CLUSTER_CA_FILE} +export CLIENT_CERT_FILE="${CLIENT_CERT_FILE}" +export CLIENT_KEY_FILE="${CLIENT_KEY_FILE}" +export CLUSTER_CA_FILE="${CLUSTER_CA_FILE}" if [ -z "${PYTHON_SCRIPT:-}" ]; then PYTHON_SCRIPT="/opt/spark/examples/src/main/python/pi.py" From e4e4a83a5467b11fca5cb7409e2998356d780127 Mon Sep 17 00:00:00 2001 From: Rich Scott Date: Mon, 24 Nov 2025 11:19:29 -0700 Subject: [PATCH 08/23] Disable building/testing for Spark 3.5.5 config for now. 
Signed-off-by: Rich Scott --- .github/workflows/build.yaml | 14 +++++++------- .github/workflows/e2e.yaml | 12 ++++++------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index ef0fdc11..cb182424 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -13,16 +13,16 @@ jobs: fail-fast: false matrix: include: - - scala_version: "2.12.15" - spark_version: "3.3.4" - java_version: "11" + # - scala_version: "2.12.15" + # spark_version: "3.3.4" + # java_version: "11" - scala_version: "2.12.18" spark_version: "3.5.5" java_version: "17" - - scala_version: "2.13.8" - spark_version: "3.3.4" - java_version: "11" + # - scala_version: "2.13.8" + # spark_version: "3.3.4" + # java_version: "11" - scala_version: "2.13.8" spark_version: "3.5.5" java_version: "17" @@ -35,4 +35,4 @@ jobs: with: spark_version: ${{ matrix.spark_version }} scala_version: ${{ matrix.scala_version }} - java_version: ${{ matrix.java_version }} \ No newline at end of file + java_version: ${{ matrix.java_version }} diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index 4df404fc..81165cac 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -13,16 +13,16 @@ jobs: fail-fast: false matrix: include: - - scala_version: "2.12.15" - spark_version: "3.3.4" - java_version: "11" + # - scala_version: "2.12.15" + # spark_version: "3.3.4" + # java_version: "11" - scala_version: "2.12.18" spark_version: "3.5.5" java_version: "17" - - scala_version: "2.13.8" - spark_version: "3.3.4" - java_version: "11" + # - scala_version: "2.13.8" + # spark_version: "3.3.4" + # java_version: "11" - scala_version: "2.13.8" spark_version: "3.5.5" java_version: "17" From 7e49435fed70dc43f05813afcea423a6bf1cfaca Mon Sep 17 00:00:00 2001 From: Rich Scott Date: Mon, 24 Nov 2025 11:45:09 -0700 Subject: [PATCH 09/23] Scala linter fixes; Bash lint fixes Signed-off-by: Rich Scott --- scripts/init.sh | 5 
----- .../scala/org/apache/spark/deploy/armada/K8sClient.scala | 4 ++-- .../org/apache/spark/deploy/armada/e2e/ArmadaSparkE2E.scala | 6 +++--- 3 files changed, 5 insertions(+), 10 deletions(-) diff --git a/scripts/init.sh b/scripts/init.sh index 77bdde90..6d6f11b2 100644 --- a/scripts/init.sh +++ b/scripts/init.sh @@ -63,11 +63,6 @@ export ARMADA_AUTH_TOKEN=${ARMADA_AUTH_TOKEN:-} export SCALA_CLASS="${SCALA_CLASS:-org.apache.spark.examples.SparkPi}" export RUNNING_E2E_TESTS="${RUNNING_E2E_TESTS:-false}" -# Client certificate for TLS -export CLIENT_CERT_FILE="${CLIENT_CERT_FILE}" -export CLIENT_KEY_FILE="${CLIENT_KEY_FILE}" -export CLUSTER_CA_FILE="${CLUSTER_CA_FILE}" - if [ -z "${PYTHON_SCRIPT:-}" ]; then PYTHON_SCRIPT="/opt/spark/examples/src/main/python/pi.py" else diff --git a/src/main/scala/org/apache/spark/deploy/armada/K8sClient.scala b/src/main/scala/org/apache/spark/deploy/armada/K8sClient.scala index fb0fc2e9..2b07d2bb 100644 --- a/src/main/scala/org/apache/spark/deploy/armada/K8sClient.scala +++ b/src/main/scala/org/apache/spark/deploy/armada/K8sClient.scala @@ -37,8 +37,8 @@ import scala.concurrent.{ExecutionContext, Future} /** Kubernetes client implementation using fabric8 Kubernetes client library. 
*/ class K8sClient(props: Properties) { val armadaMaster: String = props.getProperty("armada.master") - val pattern = """armada://([^:]+):.*""".r - val k8sApiURL: String = pattern.replaceAllIn(armadaMaster, "https://$1:6443") + val pattern = """armada://([^:]+):.*""".r + val k8sApiURL: String = pattern.replaceAllIn(armadaMaster, "https://$1:6443") val clientCertFile: String = props.getProperty("client_cert_file", "") val clientKeyFile: String = props.getProperty("client_key_file", "") diff --git a/src/test/scala/org/apache/spark/deploy/armada/e2e/ArmadaSparkE2E.scala b/src/test/scala/org/apache/spark/deploy/armada/e2e/ArmadaSparkE2E.scala index 0b515c69..0cea0251 100644 --- a/src/test/scala/org/apache/spark/deploy/armada/e2e/ArmadaSparkE2E.scala +++ b/src/test/scala/org/apache/spark/deploy/armada/e2e/ArmadaSparkE2E.scala @@ -43,8 +43,8 @@ class ArmadaSparkE2E private val baseQueueName = "e2e-template" - private var armadaClient: ArmadaClient = _ - private var k8sClient: K8sClient = _ + private var armadaClient: ArmadaClient = _ + private var k8sClient: K8sClient = _ implicit private var orch: TestOrchestrator = _ private var baseConfig: TestConfig = _ @@ -56,7 +56,7 @@ class ArmadaSparkE2E val armadaApiUrl = props.getProperty("armada.master", "localhost:30002") armadaClient = new ArmadaClient(armadaApiUrl) - k8sClient = new K8sClient(props) + k8sClient = new K8sClient(props) orch = new TestOrchestrator(armadaClient, k8sClient) // Get Scala binary version - either from system property or derive from full version From 2b705aeefe6ebabcd8ea6d1c408d532dc3221a1d Mon Sep 17 00:00:00 2001 From: Rich Scott Date: Mon, 24 Nov 2025 12:18:44 -0700 Subject: [PATCH 10/23] Add ARMADA_LOOKOUT_URL to init.sh Signed-off-by: Rich Scott --- scripts/init.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/init.sh b/scripts/init.sh index 6d6f11b2..f6fca45f 100644 --- a/scripts/init.sh +++ b/scripts/init.sh @@ -58,6 +58,7 @@ export USE_KIND="${USE_KIND:-false}" export 
STATIC_MODE="${STATIC_MODE:-false}" export IMAGE_NAME="${IMAGE_NAME:-spark:armada}" export ARMADA_MASTER="${ARMADA_MASTER:-armada://localhost:30002}" +export ARMADA_LOOKOUT_URL="${ARMADA_LOOKOUT_URL:-https://localhost:30000}" export ARMADA_QUEUE="${ARMADA_QUEUE:-test}" export ARMADA_AUTH_TOKEN=${ARMADA_AUTH_TOKEN:-} export SCALA_CLASS="${SCALA_CLASS:-org.apache.spark.examples.SparkPi}" From cc7639a153adef031f1b30de2023cf5f8e2a56a6 Mon Sep 17 00:00:00 2001 From: Rich Scott Date: Mon, 24 Nov 2025 14:33:57 -0700 Subject: [PATCH 11/23] Check if TLS cert vars are defined before referencing Signed-off-by: Rich Scott --- scripts/init.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/scripts/init.sh b/scripts/init.sh index f6fca45f..6244fb6a 100644 --- a/scripts/init.sh +++ b/scripts/init.sh @@ -64,6 +64,16 @@ export ARMADA_AUTH_TOKEN=${ARMADA_AUTH_TOKEN:-} export SCALA_CLASS="${SCALA_CLASS:-org.apache.spark.examples.SparkPi}" export RUNNING_E2E_TESTS="${RUNNING_E2E_TESTS:-false}" +if [ -n "${CLIENT_CERT_FILE:-}" ]; then + export CLIENT_CERT_FILE="${CLIENT_CERT_FILE}" +fi +if [ -n "${CLIENT_CERT_KEY:-}" ]; then + export CLIENT_CERT_KEY="${CLIENT_CERT_KEY}" +fi +if [ -n "${CLUSTER_CA_FILE:-}" ]; then + export CLUSTER_CA_FILE="${CLUSTER_CA_FILE}" +fi + if [ -z "${PYTHON_SCRIPT:-}" ]; then PYTHON_SCRIPT="/opt/spark/examples/src/main/python/pi.py" else From b3d42655849e0a10749d2583f2a700373c5c1865 Mon Sep 17 00:00:00 2001 From: Rich Scott Date: Mon, 24 Nov 2025 15:07:17 -0700 Subject: [PATCH 12/23] Conditionally add TLS cert properties to test invocation Signed-off-by: Rich Scott --- scripts/dev-e2e.sh | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/scripts/dev-e2e.sh b/scripts/dev-e2e.sh index c3d0cab5..4e6d7f92 100755 --- a/scripts/dev-e2e.sh +++ b/scripts/dev-e2e.sh @@ -165,6 +165,11 @@ run-test() { # Change to armada-spark directory cd "$scripts/.." 
+ tls_args=() + test -n "${CLIENT_CERT_FILE:-}" && tls_args+=( -Dclient_cert_file="$CLIENT_CERT_FILE" ) + test -n "${CLIENT_KEY_FILE:-}" && tls_args+=( -Dclient_key_file="$CLIENT_KEY_FILE" ) + test -n "${CLUSTER_CA_FILE:-}" && tls_args+=( -Dcluster_ca_file="$CLUSTER_CA_FILE" ) + # Run the Scala E2E test suite # env MAVEN_OPTS='-Dcom.sun.net.ssl.checkRevocation=false' env KUBERNETES_TRUST_CERTIFICATES=true \ @@ -177,11 +182,7 @@ run-test() { -Darmada.master="armada://$ARMADA_MASTER" \ -Darmada.lookout.url="$ARMADA_LOOKOUT_URL" \ -Darmadactl.path="$scripts/armadactl" \ - -Dclient_cert_file="$CLIENT_CERT_FILE" \ - -Dclient_key_file="$CLIENT_KEY_FILE" \ - -Dcluster_ca_file="$CLUSTER_CA_FILE" \ - 2>&1 | tee e2e-test.log - + "${tls_args[@]}" 2>&1 | tee e2e-test.log TEST_EXIT_CODE=${PIPESTATUS[0]} if [ "$TEST_EXIT_CODE" -ne 0 ]; then From 616dcacccb2a9e7469a3b8384d4105c1564bc54d Mon Sep 17 00:00:00 2001 From: Rich Scott Date: Mon, 1 Dec 2025 10:40:49 -0700 Subject: [PATCH 13/23] Dynamically extract external IP addr for TLS cert setup Get the first external interface IP address and use in Kind configuration for allowing external K8S/Armada API access. Add protective quotes around TLS vars in dev-e2e.sh, per shellcheck. Use `realpath` for getting reliable full pathnames. 
Signed-off-by: Rich Scott --- e2e/extract-kind-cert.sh | 5 ++++- scripts/dev-e2e.sh | 33 +++++++++++++++++++++++++++++---- 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/e2e/extract-kind-cert.sh b/e2e/extract-kind-cert.sh index 2644e418..39caef06 100755 --- a/e2e/extract-kind-cert.sh +++ b/e2e/extract-kind-cert.sh @@ -2,12 +2,15 @@ CONTEXT="kind-armada" +E2E_DIR=$(realpath "$0" | xargs dirname) + +cd "$E2E_DIR" || (echo "Error: could not cd to $E2E_DIR"; exit 1) + # What These Files Are # - client.crt: Your user (client) certificate # - client.key: The private key associated with the certificate # - ca.crt: The CA certificate used by the Kubernetes API server (for verifying client and server certs) - # Extract the client certificate kubectl config view --raw -o json | jq -r \ ".users[] | select(.name == \"${CONTEXT}\") | .user.[\"client-certificate-data\"]" | base64 -d > client.crt diff --git a/scripts/dev-e2e.sh b/scripts/dev-e2e.sh index 4e6d7f92..07756c24 100755 --- a/scripts/dev-e2e.sh +++ b/scripts/dev-e2e.sh @@ -7,7 +7,7 @@ source "$scripts"/init.sh STATUSFILE="$(mktemp)" AOREPO='https://github.com/armadaproject/armada-operator.git' -AOHOME="$scripts/../../armada-operator" +AOHOME=$(realpath "$scripts/../../armada-operator") ARMADACTL_VERSION='0.19.1' GREEN='\033[0;32m' @@ -87,12 +87,37 @@ start-armada() { fi fi - echo "Running 'make kind-all' to install and start Armada; this may take up to 6 minutes" + kind_extern_cfg='e2e/kind-config-external-access.yaml' + if ! cp "$kind_extern_cfg" "$AOHOME/hack/kind-config.yaml"; then + err "There was an error copying $kind_extern_cfg to $AOHOME/hack/kind-config.yaml" + exit 1 + fi + + # Get IP address of first network interface that is not loopback or a K8S internal network interface + external_ip=$(ifconfig -a| grep -w 'inet' | grep -v 'inet 127\.0\.0' | grep -v 'inet 172\.' 
| awk '{print $2}' | sed -ne '1p') + if [ "$(uname -s)" = 'Darwin' ]; then + sed_opt='-I .bak' + else + sed_opt='-i.bak' + fi + + if ! sed "$sed_opt" -e "s/192.168.12.135/$external_ip/" "$AOHOME/hack/kind-config.yaml"; then + err "There was an error modifying $AOHOME/hack/kind-config.yaml" + exit 1 + fi + + echo "Running 'make kind-all' to install and start Armada; this may take up to 6 minutes" if ! (cd "$AOHOME"; make kind-all 2>&1) | tee armada-start.txt; then echo "" err "There was a problem starting Armada; exiting now" exit 1 fi + + echo "Extracting TLS client certificate files from Kind cluster" + if ! e2e/extract-kind-cert.sh; then + err "There was a problem extracting the certificates" + exit 1 + fi } init-cluster() { @@ -182,12 +207,12 @@ run-test() { -Darmada.master="armada://$ARMADA_MASTER" \ -Darmada.lookout.url="$ARMADA_LOOKOUT_URL" \ -Darmadactl.path="$scripts/armadactl" \ - "${tls_args[@]}" 2>&1 | tee e2e-test.log + "${tls_args[@]:-}" 2>&1 | tee e2e-test.log TEST_EXIT_CODE=${PIPESTATUS[0]} if [ "$TEST_EXIT_CODE" -ne 0 ]; then err "E2E tests failed with exit code $TEST_EXIT_CODE" - exit $TEST_EXIT_CODE + exit "$TEST_EXIT_CODE" fi log "E2E tests completed successfully" From 8127bcb64e0c068ef907f2a6b130ea86918ad1b9 Mon Sep 17 00:00:00 2001 From: Rich Scott Date: Tue, 2 Dec 2025 12:01:51 -0700 Subject: [PATCH 14/23] Check for busybox image for init container Use better pattern checks for verifying if Armada master is localhost; remove quote wrapper around tls_args, so Maven doesn't error on a "" target. Signed-off-by: Rich Scott --- scripts/dev-e2e.sh | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/scripts/dev-e2e.sh b/scripts/dev-e2e.sh index 07756c24..092ec794 100755 --- a/scripts/dev-e2e.sh +++ b/scripts/dev-e2e.sh @@ -127,6 +127,12 @@ init-cluster() { exit 1 fi + if ! (echo "$INIT_CONTAINER_IMAGE" | grep -Eq '^[[:alnum:]_]+:[[:alnum:]_]+$'); then + err "INIT_CONTAINER_IMAGE is not defined. 
Please set it in $scripts/config.sh, for example:" + err "INIT_CONTAINER_IMAGE=busybox:latest" + exit 1 + fi + if [ -z "$ARMADA_QUEUE" ]; then err "ARMADA_QUEUE is not defined. Please set it in $scripts/config.sh, for example:" err "ARMADA_QUEUE=spark-test" @@ -145,6 +151,17 @@ init-cluster() { exit 1 fi + echo "Checking if image $INIT_CONTAINER_IMAGE is available" + if ! docker image inspect "$INIT_CONTAINER_IMAGE" > /dev/null 2>&1; then + echo "Image $INIT_CONTAINER_IMAGE not found in local Docker instance; pulling it from Docker Hub." + if ! docker pull "$INIT_CONTAINER_IMAGE"; then + err "Could not pull $INIT_CONTAINER_IMAGE; please try running" + err " docker pull $INIT_CONTAINER_IMAGE" + err "then run this script again" + exit 1 + fi + fi + echo "Checking to see if Armada cluster is available ..." if ! "$scripts"/armadactl get queues > "$STATUSFILE" 2>&1 ; then @@ -165,16 +182,18 @@ init-cluster() { mkdir -p "$scripts/.tmp" - if [ "$ARMADA_MASTER" = "localhost" ] ; then - TMPDIR="$scripts/.tmp" "$AOHOME/bin/tooling/kind" load docker-image "$IMAGE_NAME" --name armada 2>&1 \ - | log_group "Loading Docker image $IMAGE_NAME into Armada cluster"; + if [[ "$ARMADA_MASTER" == *"//localhost"* ]] ; then + for IMG in "$IMAGE_NAME" "$INIT_CONTAINER_IMAGE"; do + TMPDIR="$scripts/.tmp" "$AOHOME/bin/tooling/kind" load docker-image "$IMG" --name armada 2>&1 \ + | log_group "Loading Docker image $IMG into Armada (Kind) cluster"; + done fi # configure the defaults for the e2e test cp "$scripts/../e2e/spark-defaults.conf" "$scripts/../conf/spark-defaults.conf" # If using a remote Armada server, assume it is already running and ready - if [ "$ARMADA_MASTER" = "localhost" ] ; then + if [[ "$ARMADA_MASTER" == *"//localhost"* ]] ; then log "Waiting 60 seconds for Armada to stabilize ..." 
sleep 60 fi @@ -207,7 +226,7 @@ run-test() { -Darmada.master="armada://$ARMADA_MASTER" \ -Darmada.lookout.url="$ARMADA_LOOKOUT_URL" \ -Darmadactl.path="$scripts/armadactl" \ - "${tls_args[@]:-}" 2>&1 | tee e2e-test.log + ${tls_args[@]:-} 2>&1 | tee e2e-test.log TEST_EXIT_CODE=${PIPESTATUS[0]} if [ "$TEST_EXIT_CODE" -ne 0 ]; then From 5d514664fd3c2e4269964dc9679326a73399e356 Mon Sep 17 00:00:00 2001 From: Rich Scott Date: Tue, 2 Dec 2025 12:12:52 -0700 Subject: [PATCH 15/23] Add init container image name. Signed-off-by: Rich Scott --- scripts/init.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/init.sh b/scripts/init.sh index 6244fb6a..6476353f 100644 --- a/scripts/init.sh +++ b/scripts/init.sh @@ -63,6 +63,7 @@ export ARMADA_QUEUE="${ARMADA_QUEUE:-test}" export ARMADA_AUTH_TOKEN=${ARMADA_AUTH_TOKEN:-} export SCALA_CLASS="${SCALA_CLASS:-org.apache.spark.examples.SparkPi}" export RUNNING_E2E_TESTS="${RUNNING_E2E_TESTS:-false}" +export INIT_CONTAINER_IMAGE="${INIT_CONTAINER_IMAGE:-busybox:latest}" if [ -n "${CLIENT_CERT_FILE:-}" ]; then export CLIENT_CERT_FILE="${CLIENT_CERT_FILE}" @@ -80,8 +81,6 @@ else INCLUDE_PYTHON=true fi - - # derive Scala and Spark versions from pom.xml, set via ./scripts/set-version.sh if [[ -z "${SCALA_VERSION:-}" ]]; then export SCALA_VERSION=$(cd "$scripts/.."; mvn help:evaluate -Dexpression=scala.version -q -DforceStdout) From 117255068806e9cd10f8609e6aba3fce3710ba2d Mon Sep 17 00:00:00 2001 From: Rich Scott Date: Tue, 2 Dec 2025 13:51:34 -0700 Subject: [PATCH 16/23] Add diagnostic search log for .spark-* directory Signed-off-by: Rich Scott --- scripts/dev-e2e.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/scripts/dev-e2e.sh b/scripts/dev-e2e.sh index 092ec794..6dd2ef41 100755 --- a/scripts/dev-e2e.sh +++ b/scripts/dev-e2e.sh @@ -200,6 +200,11 @@ init-cluster() { } run-test() { + if [[ ! -d ".spark-$SPARK_VERSION" ]]; then + echo "Checking out Spark sources for tag v$SPARK_VERSION." 
+ git clone https://github.com/apache/spark --branch v$SPARK_VERSION --depth 1 --no-tags ".spark-$SPARK_VERSION" + fi + echo "Running Scala E2E test suite..." # Add armadactl to PATH so the e2e framework can access it @@ -209,6 +214,11 @@ run-test() { # Change to armada-spark directory cd "$scripts/.." + echo '' + echo 'Looking for .spark* directory ----------------------------------------------' + ls .spark* + echo '-----------------------------------------------------------------------------' + tls_args=() test -n "${CLIENT_CERT_FILE:-}" && tls_args+=( -Dclient_cert_file="$CLIENT_CERT_FILE" ) test -n "${CLIENT_KEY_FILE:-}" && tls_args+=( -Dclient_key_file="$CLIENT_KEY_FILE" ) From a0c0d8af09440313ef4741818226d479c7e986cb Mon Sep 17 00:00:00 2001 From: Rich Scott Date: Tue, 2 Dec 2025 14:04:15 -0700 Subject: [PATCH 17/23] More GH Actions debugging Signed-off-by: Rich Scott --- scripts/dev-e2e.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/dev-e2e.sh b/scripts/dev-e2e.sh index 6dd2ef41..2fed3216 100755 --- a/scripts/dev-e2e.sh +++ b/scripts/dev-e2e.sh @@ -216,7 +216,8 @@ run-test() { echo '' echo 'Looking for .spark* directory ----------------------------------------------' - ls .spark* + pwd + ls -a echo '-----------------------------------------------------------------------------' tls_args=() From 233e413ca931aef0e51a3ce7961182387de31d09 Mon Sep 17 00:00:00 2001 From: Rich Scott Date: Tue, 2 Dec 2025 14:22:08 -0700 Subject: [PATCH 18/23] More diagnostics for finding spark source dir Signed-off-by: Rich Scott --- scripts/dev-e2e.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/scripts/dev-e2e.sh b/scripts/dev-e2e.sh index 2fed3216..b33db29f 100755 --- a/scripts/dev-e2e.sh +++ b/scripts/dev-e2e.sh @@ -200,12 +200,18 @@ init-cluster() { } run-test() { + echo "Running Scala E2E test suite..." + if [[ ! -d ".spark-$SPARK_VERSION" ]]; then echo "Checking out Spark sources for tag v$SPARK_VERSION." 
git clone https://github.com/apache/spark --branch v$SPARK_VERSION --depth 1 --no-tags ".spark-$SPARK_VERSION" fi - echo "Running Scala E2E test suite..." + echo '' + echo 'Looking for .spark* directory ----------------------------------------------' + pwd + ls -a + echo '-----------------------------------------------------------------------------' # Add armadactl to PATH so the e2e framework can access it PATH="$scripts:$AOHOME/bin/tooling/:$PATH" @@ -214,12 +220,6 @@ run-test() { # Change to armada-spark directory cd "$scripts/.." - echo '' - echo 'Looking for .spark* directory ----------------------------------------------' - pwd - ls -a - echo '-----------------------------------------------------------------------------' - tls_args=() test -n "${CLIENT_CERT_FILE:-}" && tls_args+=( -Dclient_cert_file="$CLIENT_CERT_FILE" ) test -n "${CLIENT_KEY_FILE:-}" && tls_args+=( -Dclient_key_file="$CLIENT_KEY_FILE" ) From 9e0b309632b3ad580dc040251512a346805607b6 Mon Sep 17 00:00:00 2001 From: Rich Scott Date: Tue, 2 Dec 2025 15:00:18 -0700 Subject: [PATCH 19/23] Clone the Spark repo to get bin/spark-class and jars. Signed-off-by: Rich Scott --- scripts/dev-e2e.sh | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/scripts/dev-e2e.sh b/scripts/dev-e2e.sh index b33db29f..555e25a7 100755 --- a/scripts/dev-e2e.sh +++ b/scripts/dev-e2e.sh @@ -205,14 +205,26 @@ run-test() { if [[ ! -d ".spark-$SPARK_VERSION" ]]; then echo "Checking out Spark sources for tag v$SPARK_VERSION." git clone https://github.com/apache/spark --branch v$SPARK_VERSION --depth 1 --no-tags ".spark-$SPARK_VERSION" + cd ".spark-$SPARK_VERSION" + # Spark 3.3.4 does not compile without this fix + if [[ "$SPARK_VERSION" == "3.3.4" ]]; then + sed -i -e "s%2.13.8%2.13.6%" pom.xml + # Fix deprecated openjdk base image - use eclipse-temurin:11-jammy instead. 
+ spark_dockerfile="resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile" + if [ -f "$spark_dockerfile" ]; then + sed -i -e 's|FROM openjdk:|FROM eclipse-temurin:|g' "$spark_dockerfile" + sed -i -E 's/^ARG java_image_tag=11-jre-slim$/ARG java_image_tag=11-jammy/' "$spark_dockerfile" + fi + fi + ./dev/change-scala-version.sh $SCALA_BIN_VERSION + # by packaging the assembly project specifically, jars of all depending Spark projects are fetch from Maven + # spark-examples jars are not released, so we need to build these from sources + ./build/mvn --batch-mode clean + ./build/mvn --batch-mode package -pl examples + ./build/mvn --batch-mode package -Pkubernetes -Pscala-$SCALA_BIN_VERSION -pl assembly + cd .. fi - echo '' - echo 'Looking for .spark* directory ----------------------------------------------' - pwd - ls -a - echo '-----------------------------------------------------------------------------' - # Add armadactl to PATH so the e2e framework can access it PATH="$scripts:$AOHOME/bin/tooling/:$PATH" export PATH From 6345155bce8bd8f918053238ac9a9d847118075c Mon Sep 17 00:00:00 2001 From: Rich Scott Date: Tue, 2 Dec 2025 15:39:10 -0700 Subject: [PATCH 20/23] Always rebuild the Spark jars before running E2E. Signed-off-by: Rich Scott --- scripts/dev-e2e.sh | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/scripts/dev-e2e.sh b/scripts/dev-e2e.sh index 555e25a7..36ed251c 100755 --- a/scripts/dev-e2e.sh +++ b/scripts/dev-e2e.sh @@ -205,25 +205,26 @@ run-test() { if [[ ! -d ".spark-$SPARK_VERSION" ]]; then echo "Checking out Spark sources for tag v$SPARK_VERSION." 
git clone https://github.com/apache/spark --branch v$SPARK_VERSION --depth 1 --no-tags ".spark-$SPARK_VERSION" - cd ".spark-$SPARK_VERSION" - # Spark 3.3.4 does not compile without this fix - if [[ "$SPARK_VERSION" == "3.3.4" ]]; then - sed -i -e "s%2.13.8%2.13.6%" pom.xml - # Fix deprecated openjdk base image - use eclipse-temurin:11-jammy instead. - spark_dockerfile="resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile" - if [ -f "$spark_dockerfile" ]; then - sed -i -e 's|FROM openjdk:|FROM eclipse-temurin:|g' "$spark_dockerfile" - sed -i -E 's/^ARG java_image_tag=11-jre-slim$/ARG java_image_tag=11-jammy/' "$spark_dockerfile" - fi + fi + + cd ".spark-$SPARK_VERSION" + # Spark 3.3.4 does not compile without this fix + if [[ "$SPARK_VERSION" == "3.3.4" ]]; then + sed -i -e "s%2.13.8%2.13.6%" pom.xml + # Fix deprecated openjdk base image - use eclipse-temurin:11-jammy instead. + spark_dockerfile="resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile" + if [ -f "$spark_dockerfile" ]; then + sed -i -e 's|FROM openjdk:|FROM eclipse-temurin:|g' "$spark_dockerfile" + sed -i -E 's/^ARG java_image_tag=11-jre-slim$/ARG java_image_tag=11-jammy/' "$spark_dockerfile" fi - ./dev/change-scala-version.sh $SCALA_BIN_VERSION - # by packaging the assembly project specifically, jars of all depending Spark projects are fetch from Maven - # spark-examples jars are not released, so we need to build these from sources - ./build/mvn --batch-mode clean - ./build/mvn --batch-mode package -pl examples - ./build/mvn --batch-mode package -Pkubernetes -Pscala-$SCALA_BIN_VERSION -pl assembly - cd .. 
fi + ./dev/change-scala-version.sh $SCALA_BIN_VERSION + # by packaging the assembly project specifically, jars of all depending Spark projects are fetch from Maven + # spark-examples jars are not released, so we need to build these from sources + ./build/mvn --batch-mode clean + ./build/mvn --batch-mode package -pl examples + ./build/mvn --batch-mode package -Pkubernetes -Pscala-$SCALA_BIN_VERSION -pl assembly + cd .. # Add armadactl to PATH so the e2e framework can access it PATH="$scripts:$AOHOME/bin/tooling/:$PATH" From 3e26371b6ddc6c5cc1c58e3620ae872002acdd1e Mon Sep 17 00:00:00 2001 From: Rich Scott Date: Wed, 11 Feb 2026 16:19:27 -0700 Subject: [PATCH 21/23] Fix Scala formatting violations. Signed-off-by: Rich Scott --- .../apache/spark/deploy/armada/e2e/ArmadaOperations.scala | 8 +++++--- .../apache/spark/deploy/armada/e2e/TestOrchestrator.scala | 6 +++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/test/scala/org/apache/spark/deploy/armada/e2e/ArmadaOperations.scala b/src/test/scala/org/apache/spark/deploy/armada/e2e/ArmadaOperations.scala index 8da8246a..6e85a4b7 100644 --- a/src/test/scala/org/apache/spark/deploy/armada/e2e/ArmadaOperations.scala +++ b/src/test/scala/org/apache/spark/deploy/armada/e2e/ArmadaOperations.scala @@ -216,13 +216,15 @@ class ArmadaClient(armadaUrl: String = "localhost:30002") { // armadactl command expects the server address to be of the form // : with no pseudo-protocol prefix // val input = "local://armada://localhost:30002" - val pattern = """.*armada://(.+)""".r + val pattern = """.*armada://(.+)""".r var armadactlUrl = "undefined-armadactl-url" armadaUrl match { case pattern(hostPort) => armadactlUrl = hostPort // e.g. 
"localhost:30002" - case _ => - throw new RuntimeException(s"could not extract valid armadactl URL from armada URL ${armadaUrl}") + case _ => + throw new RuntimeException( + s"could not extract valid armadactl URL from armada URL ${armadaUrl}" + ) } // var armadactlUrl = armadaUrl.replaceFirst("^armada://", "") diff --git a/src/test/scala/org/apache/spark/deploy/armada/e2e/TestOrchestrator.scala b/src/test/scala/org/apache/spark/deploy/armada/e2e/TestOrchestrator.scala index e5ecb32d..a0e8896b 100644 --- a/src/test/scala/org/apache/spark/deploy/armada/e2e/TestOrchestrator.scala +++ b/src/test/scala/org/apache/spark/deploy/armada/e2e/TestOrchestrator.scala @@ -349,7 +349,7 @@ class TestOrchestrator( def attemptSubmit(attempt: Int = 1): ProcessResult = { // In client mode, spark-submit runs until application completes, so use longer timeout val timeout = if (!modeHelper.isDriverInCluster) jobWatchTimeout else jobSubmitTimeout - val result = ProcessExecutor.executeWithResult(runTestCommand, timeout) + val result = ProcessExecutor.executeWithResult(runTestCommand, timeout) if (result.exitCode != 0) { val allOutput = result.stdout + "\n" + result.stderr @@ -530,8 +530,8 @@ class TestOrchestrator( modeHelper: DeploymentModeHelper ): Seq[String] = { val sparkRepoCopy = ".spark-3.5.5" - val deployMode = if (modeHelper.isDriverInCluster) "cluster" else "client" - val isClientMode = !modeHelper.isDriverInCluster + val deployMode = if (modeHelper.isDriverInCluster) "cluster" else "client" + val isClientMode = !modeHelper.isDriverInCluster val classPathEntries: Seq[String] = Seq( ".", From 2347697b3f25fd1dbeecb8f4d6c7735297c9055e Mon Sep 17 00:00:00 2001 From: Rich Scott Date: Fri, 20 Feb 2026 15:34:38 -0700 Subject: [PATCH 22/23] Remove more refs and logic for old `local://` prefix. 
Signed-off-by: Rich Scott --- README.md | 1 - .../notebooks/jupyter_armada_spark.ipynb | 4 +- .../spark/deploy/armada/K8sClient.scala | 2 +- .../deploy/armada/submit/ArmadaUtils.scala | 6 +-- .../deploy/armada/e2e/ArmadaOperations.scala | 1 - .../deploy/armada/e2e/TestOrchestrator.scala | 37 ++++++++++++------- .../submit/ArmadaClientApplicationSuite.scala | 1 - 7 files changed, 27 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 8ced2667..9b4495aa 100644 --- a/README.md +++ b/README.md @@ -74,7 +74,6 @@ export USE_KIND=true **Note:** For client mode, you need to set additional configuration: ```bash -export ARMADA_MASTER="local://armada://localhost:30002" # Add "local://" prefix export SPARK_DRIVER_HOST="172.18.0.1" # Required for client mode export SPARK_DRIVER_PORT="7078" # Required for client mode ``` diff --git a/example/jupyter/notebooks/jupyter_armada_spark.ipynb b/example/jupyter/notebooks/jupyter_armada_spark.ipynb index 0aa15007..4984bc25 100644 --- a/example/jupyter/notebooks/jupyter_armada_spark.ipynb +++ b/example/jupyter/notebooks/jupyter_armada_spark.ipynb @@ -73,7 +73,7 @@ "driver_host = os.environ.get('SPARK_DRIVER_HOST')\n", "driver_port = os.environ.get('SPARK_DRIVER_PORT', '7078')\n", "block_manager_port = os.environ.get('SPARK_BLOCK_MANAGER_PORT', '10061')\n", - "armada_master = os.environ.get('ARMADA_MASTER', 'local://armada://host.docker.internal:30002')\n", + "armada_master = os.environ.get('ARMADA_MASTER', 'armada://host.docker.internal:30002')\n", "armada_queue = os.environ.get('ARMADA_QUEUE', 'default')\n", "armada_namespace = os.environ.get('ARMADA_NAMESPACE', 'default')\n", "image_name = os.environ.get('IMAGE_NAME', 'spark:armada')\n", @@ -233,4 +233,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/src/main/scala/org/apache/spark/deploy/armada/K8sClient.scala b/src/main/scala/org/apache/spark/deploy/armada/K8sClient.scala index c10b45c8..c3d24d47 100644 --- 
a/src/main/scala/org/apache/spark/deploy/armada/K8sClient.scala +++ b/src/main/scala/org/apache/spark/deploy/armada/K8sClient.scala @@ -37,7 +37,7 @@ import scala.concurrent.{ExecutionContext, Future} /** Kubernetes client implementation using fabric8 Kubernetes client library. */ class K8sClient(props: Properties) { val armadaMaster: String = props.getProperty("armada.master") - val pattern = """local://armada://([^:]+):.*""".r + val pattern = """armada://([^:]+):.*""".r val k8sApiURL: String = pattern.replaceAllIn(armadaMaster, "https://$1:6443") println(s"-------- K8sClient(): armadaMaster = ${armadaMaster}") diff --git a/src/main/scala/org/apache/spark/deploy/armada/submit/ArmadaUtils.scala b/src/main/scala/org/apache/spark/deploy/armada/submit/ArmadaUtils.scala index 4f09d748..61f8f289 100644 --- a/src/main/scala/org/apache/spark/deploy/armada/submit/ArmadaUtils.scala +++ b/src/main/scala/org/apache/spark/deploy/armada/submit/ArmadaUtils.scala @@ -28,11 +28,7 @@ object ArmadaUtils { import ArmadaUtilsExceptions._ def parseMasterUrl(masterUrl: String): (String, Int) = { - val startString = if (masterUrl.startsWith("local")) { - "local://armada://" - } else { - "armada://" - } + val startString = "armada://" Some(masterUrl) .map(_.substring(startString.length).split(":").toSeq) .filter(_.length == 2) diff --git a/src/test/scala/org/apache/spark/deploy/armada/e2e/ArmadaOperations.scala b/src/test/scala/org/apache/spark/deploy/armada/e2e/ArmadaOperations.scala index 6e85a4b7..3e40b611 100644 --- a/src/test/scala/org/apache/spark/deploy/armada/e2e/ArmadaOperations.scala +++ b/src/test/scala/org/apache/spark/deploy/armada/e2e/ArmadaOperations.scala @@ -215,7 +215,6 @@ class ArmadaClient(armadaUrl: String = "localhost:30002") { } // armadactl command expects the server address to be of the form // : with no pseudo-protocol prefix - // val input = "local://armada://localhost:30002" val pattern = """.*armada://(.+)""".r var armadactlUrl = "undefined-armadactl-url" diff 
--git a/src/test/scala/org/apache/spark/deploy/armada/e2e/TestOrchestrator.scala b/src/test/scala/org/apache/spark/deploy/armada/e2e/TestOrchestrator.scala index a0e8896b..c0be71ef 100644 --- a/src/test/scala/org/apache/spark/deploy/armada/e2e/TestOrchestrator.scala +++ b/src/test/scala/org/apache/spark/deploy/armada/e2e/TestOrchestrator.scala @@ -73,6 +73,7 @@ class TestOrchestrator( armadaClient: ArmadaClient, k8sClient: K8sClient )(implicit ec: ExecutionContext) { + val sparkRepoCopy = ".spark-3.5.5" private val jobSubmitTimeout = JobSubmitTimeout private val jobWatchTimeout = JobWatchTimeout @@ -292,11 +293,22 @@ class TestOrchestrator( context: TestContext, modeHelper: DeploymentModeHelper ): Future[Unit] = Future { + val deployMode = if (modeHelper.isDriverInCluster) "cluster" else "client" + // Use spark-examples JAR with the correct path based on Scala binary version and Spark version // Following the same pattern as scripts/init.sh - val appResource = config.pythonScript.getOrElse( - s"local:///opt/spark/examples/jars/spark-examples_${config.scalaVersion}-${config.sparkVersion}.jar" - ) + val appResource = { + if (config.pythonScript.isDefined) { + config.pythonScript.get + } else if (deployMode == "cluster") { + // s"local:///opt/spark/examples/target/spark-examples_${config.scalaVersion}-${config.sparkVersion}.jar" + s"local:///opt/spark/examples/jars/spark-examples_${config.scalaVersion}-${config.sparkVersion}.jar" + } else { + // broken s"${sparkRepoCopy}/examples/target/spark-examples_${config.scalaVersion}-${config.sparkVersion}.jar" + s"${sparkRepoCopy}/examples/target/scala-${config.scalaVersion}/jars/spark-examples_${config.scalaVersion}-${config.sparkVersion}.jar" + // .spark-3.5.5/examples/target/scala-2.13/jars/spark-examples_2.13-3.5.5.jar + } + } val contextLabelString = context.labels.iterator.map { case (k, v) => s"$k=$v" }.mkString(",") val mergedLabels = config.sparkConfs @@ -304,8 +316,6 @@ class TestOrchestrator( .map(existing => 
s"$existing,$contextLabelString") .getOrElse(contextLabelString) - val deployMode = if (modeHelper.isDriverInCluster) "cluster" else "client" - val enhancedSparkConfs = config.sparkConfs ++ Map( "spark.armada.pod.labels" -> mergedLabels, "spark.armada.scheduling.namespace" -> context.namespace, @@ -529,21 +539,19 @@ class TestOrchestrator( pythonScript: Option[String], modeHelper: DeploymentModeHelper ): Seq[String] = { - val sparkRepoCopy = ".spark-3.5.5" - val deployMode = if (modeHelper.isDriverInCluster) "cluster" else "client" - val isClientMode = !modeHelper.isDriverInCluster + val deployMode = if (modeHelper.isDriverInCluster) "cluster" else "client" + val isClientMode = !modeHelper.isDriverInCluster - val classPathEntries: Seq[String] = Seq( + val driverClassPath = Seq( ".", - s"${sparkRepoCopy}/assembly/target/scala-2.13/jars/*", "./target/armada-cluster-manager_2.13-1.0.0-SNAPSHOT-all.jar" - ) + ).mkString(":") val baseCommand = Seq( s"${sparkRepoCopy}/bin/spark-class", - "-cp", - classPathEntries.mkString(":"), "org.apache.spark.deploy.SparkSubmit", + "--driver-class-path", + driverClassPath, "--master", masterUrl, "--deploy-mode", @@ -589,7 +597,8 @@ class TestOrchestrator( } else { // In cluster mode, driver runs in a pod, so use internal URL Map( - "spark.armada.internalUrl" -> "armada://armada-server.armada:50051" + // "spark.armada.internalUrl" -> "armada://armada-server.armada:50051" + "spark.armada.internalUrl" -> "armada://armada-server.armada:30002" ) } diff --git a/src/test/scala/org/apache/spark/deploy/armada/submit/ArmadaClientApplicationSuite.scala b/src/test/scala/org/apache/spark/deploy/armada/submit/ArmadaClientApplicationSuite.scala index 39d7ff72..938b8ae5 100644 --- a/src/test/scala/org/apache/spark/deploy/armada/submit/ArmadaClientApplicationSuite.scala +++ b/src/test/scala/org/apache/spark/deploy/armada/submit/ArmadaClientApplicationSuite.scala @@ -61,7 +61,6 @@ class ArmadaClientApplicationSuite extends AnyFunSuite with 
BeforeAndAfter with private val SPARK_DRIVER_URL = "SPARK_DRIVER_URL" // Constants for paths - private val PYTHON_EXAMPLE_PATH = "/opt/spark/examples/src/main/python/pi.py" private val clientArguments = ClientArguments( mainAppResource = JavaMainAppResource(Some("app.jar")), mainClass = "org.example.SparkApp", From 8f94b07265e2774b47bc4106d8df465d3625a364 Mon Sep 17 00:00:00 2001 From: Rich Scott Date: Fri, 20 Mar 2026 13:15:48 -0600 Subject: [PATCH 23/23] Return to running client code in Docker image. Also, derive K8S server address (especially the port, as `kind` dynamically selects a random port number to expose the api-server) from ~/.kube/config. Signed-off-by: Rich Scott --- .../spark/deploy/armada/K8sClient.scala | 18 ++++++- .../deploy/armada/e2e/TestOrchestrator.scala | 51 +++++++++++-------- 2 files changed, 47 insertions(+), 22 deletions(-) diff --git a/src/main/scala/org/apache/spark/deploy/armada/K8sClient.scala b/src/main/scala/org/apache/spark/deploy/armada/K8sClient.scala index 4ccc10c5..3cd739b8 100644 --- a/src/main/scala/org/apache/spark/deploy/armada/K8sClient.scala +++ b/src/main/scala/org/apache/spark/deploy/armada/K8sClient.scala @@ -28,6 +28,9 @@ import io.fabric8.kubernetes.client.{ import io.fabric8.kubernetes.api.model.{NamespaceBuilder, Pod} import io.fabric8.kubernetes.api.model.networking.v1.Ingress +import org.yaml.snakeyaml.Yaml + +import java.io.FileReader import java.util.concurrent.TimeoutException import java.util.Properties import scala.jdk.CollectionConverters._ @@ -38,7 +41,20 @@ import scala.concurrent.{ExecutionContext, Future} class K8sClient(props: Properties) { val armadaMaster: String = props.getProperty("armada.master") val pattern = """armada://([^:]+):.*""".r - val k8sApiURL: String = pattern.replaceAllIn(armadaMaster, "https://$1:6443") + + // If armadaMaster is local, derive k8sApiURL from ~/.kube/config, + // which `kind` will create/update. 
+ val yaml = new Yaml() + val home = System.getProperty("user.home") + val data = yaml.load[java.util.Map[String, Object]](new FileReader(s"$home/.kube/config")) + var k8sApiURL = "no-K8S-server-found" + + val clusters = + data.get("clusters").asInstanceOf[java.util.List[java.util.Map[String, Object]]].asScala + clusters.foreach { entry => + val cluster = entry.get("cluster").asInstanceOf[java.util.Map[String, Object]] + k8sApiURL = cluster.get("server").toString() + } println(s"-------- K8sClient(): armadaMaster = ${armadaMaster}") println(s"-------- K8sClient(): k8sApiURL= ${k8sApiURL}") diff --git a/src/test/scala/org/apache/spark/deploy/armada/e2e/TestOrchestrator.scala b/src/test/scala/org/apache/spark/deploy/armada/e2e/TestOrchestrator.scala index 61fa86c8..28584bb5 100644 --- a/src/test/scala/org/apache/spark/deploy/armada/e2e/TestOrchestrator.scala +++ b/src/test/scala/org/apache/spark/deploy/armada/e2e/TestOrchestrator.scala @@ -338,22 +338,12 @@ class TestOrchestrator( context: TestContext, modeHelper: DeploymentModeHelper ): Future[Unit] = Future { - val deployMode = if (modeHelper.isDriverInCluster) "cluster" else "client" - // Use spark-examples JAR with the correct path based on Scala binary version and Spark version // Following the same pattern as scripts/init.sh - val appResource = { - if (config.pythonScript.isDefined) { - config.pythonScript.get - } else if (deployMode == "cluster") { - // s"local:///opt/spark/examples/target/spark-examples_${config.scalaVersion}-${config.sparkVersion}.jar" - s"local:///opt/spark/examples/jars/spark-examples_${config.scalaVersion}-${config.sparkVersion}.jar" - } else { - // broken s"${sparkRepoCopy}/examples/target/spark-examples_${config.scalaVersion}-${config.sparkVersion}.jar" - s"${sparkRepoCopy}/examples/target/scala-${config.scalaVersion}/jars/spark-examples_${config.scalaVersion}-${config.sparkVersion}.jar" - // .spark-3.5.5/examples/target/scala-2.13/jars/spark-examples_2.13-3.5.5.jar - } - } + val 
appResource = config.pythonScript.getOrElse( + s"local:///opt/spark/examples/jars/spark-examples_${config.scalaVersion}-${config.sparkVersion}.jar" + ) + val volumeMounts = buildVolumeMounts() val contextLabelString = context.labels.iterator.map { case (k, v) => s"$k=$v" }.mkString(",") val mergedLabels = config.sparkConfs @@ -361,6 +351,8 @@ class TestOrchestrator( .map(existing => s"$existing,$contextLabelString") .getOrElse(contextLabelString) + val deployMode = if (modeHelper.isDriverInCluster) "cluster" else "client" + val enhancedSparkConfs = config.sparkConfs ++ Map( "spark.armada.pod.labels" -> mergedLabels, "spark.armada.scheduling.namespace" -> context.namespace, @@ -369,6 +361,7 @@ class TestOrchestrator( val runTestCommand = buildRunTestCommand( config.imageName, + volumeMounts, config.masterUrl, testName, queueName, @@ -381,7 +374,7 @@ class TestOrchestrator( config.appArgs ) - println(s"\n[SUBMIT] Submitting Spark job:") + println(s"\n[SUBMIT] Submitting Spark job via Docker:") println(s"[SUBMIT] Queue: $queueName") println(s"[SUBMIT] JobSetId: $jobSetId") println(s"[SUBMIT] Namespace: ${context.namespace}") @@ -600,6 +593,7 @@ class TestOrchestrator( private def buildRunTestCommand( imageName: String, + volumeMounts: Seq[String], masterUrl: String, testName: String, queueName: String, @@ -620,10 +614,15 @@ class TestOrchestrator( ).mkString(":") val baseCommand = Seq( - s"${sparkRepoCopy}/bin/spark-class", + "docker", + "run", + "--rm", + "--network", + "host" + ) ++ volumeMounts ++ Seq( + imageName, + "/opt/spark/bin/spark-class", "org.apache.spark.deploy.SparkSubmit", - "--driver-class-path", - driverClassPath, "--master", masterUrl, "--deploy-mode", @@ -652,7 +651,7 @@ class TestOrchestrator( "spark.armada.executor.request.cores" -> "100m", "spark.armada.executor.request.memory" -> "510Mi", "spark.local.dir" -> "/tmp", - "spark.home" -> sparkRepoCopy, + "spark.home" -> "/opt/spark", "spark.driver.extraJavaOptions" -> "-XX:-UseContainerSupport", 
"spark.executor.extraJavaOptions" -> "-XX:-UseContainerSupport" ) @@ -669,8 +668,7 @@ class TestOrchestrator( } else { // In cluster mode, driver runs in a pod, so use internal URL Map( - // "spark.armada.internalUrl" -> "armada://armada-server.armada:50051" - "spark.armada.internalUrl" -> "armada://armada-server.armada:30002" + "spark.armada.internalUrl" -> "armada://armada-server.armada:50051" ) } @@ -683,4 +681,15 @@ class TestOrchestrator( commandWithApp ++ confArgs ++ (appResource +: appArgs) } + + private def buildVolumeMounts(): Seq[String] = { + val userDir = System.getProperty("user.dir") + val e2eDir = new File(s"$userDir/src/test/resources/e2e") + + if (e2eDir.exists() && e2eDir.isDirectory) { + Seq("-v", s"${e2eDir.getAbsolutePath}:/opt/spark/work-dir/src/test/resources/e2e:ro") + } else { + Seq.empty + } + } }