diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index ef0fdc11..cb182424 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -13,16 +13,16 @@ jobs: fail-fast: false matrix: include: - - scala_version: "2.12.15" - spark_version: "3.3.4" - java_version: "11" + # - scala_version: "2.12.15" + # spark_version: "3.3.4" + # java_version: "11" - scala_version: "2.12.18" spark_version: "3.5.5" java_version: "17" - - scala_version: "2.13.8" - spark_version: "3.3.4" - java_version: "11" + # - scala_version: "2.13.8" + # spark_version: "3.3.4" + # java_version: "11" - scala_version: "2.13.8" spark_version: "3.5.5" java_version: "17" @@ -35,4 +35,4 @@ jobs: with: spark_version: ${{ matrix.spark_version }} scala_version: ${{ matrix.scala_version }} - java_version: ${{ matrix.java_version }} \ No newline at end of file + java_version: ${{ matrix.java_version }} diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index 4df404fc..81165cac 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -13,16 +13,16 @@ jobs: fail-fast: false matrix: include: - - scala_version: "2.12.15" - spark_version: "3.3.4" - java_version: "11" + # - scala_version: "2.12.15" + # spark_version: "3.3.4" + # java_version: "11" - scala_version: "2.12.18" spark_version: "3.5.5" java_version: "17" - - scala_version: "2.13.8" - spark_version: "3.3.4" - java_version: "11" + # - scala_version: "2.13.8" + # spark_version: "3.3.4" + # java_version: "11" - scala_version: "2.13.8" spark_version: "3.5.5" java_version: "17" diff --git a/README.md b/README.md index 8111cb63..42c645a7 100644 --- a/README.md +++ b/README.md @@ -79,6 +79,90 @@ We recommend [kind](https://kind.sigs.k8s.io/) for local testing. If you are usi kind load docker-image $IMAGE_NAME --name armada ``` +### Running a remote Armada server using Armada Operator +The default Armada Operator setup allows only localhost access. 
You can quickly set up a local Armada server +configured to allow external access from other hosts, useful for client development and testing. For this +configuration: + +- Copy the file `e2e/kind-config-external-access.yaml` in this repository to `hack/kind-config.yaml` +in your `armada-operator` repository. + +- Edit the newly-copied `hack/kind-config.yaml` as noted in the beginning comments of that file. + +- Run the armada-operator setup commands (usually `make kind-all`) to create and start your Armada instance. + +- Copy the `$HOME/.kube/config` and `$HOME/.armadactl.yaml` (that Armada Operator will generate) from the Armada +server host to your `$HOME` directory on the client (local) host. Then edit the local `.kube/config` and on +the line that has `server: https://0.0.0.0:6443`, change the `0.0.0.0` address to the IP address or hostname +of the remote Armada server system. + +- Generate a copy of the client TLS key, cert, and CA-cert files: (1) go into the `e2e` subdirectory, and +run `./extract-kind-cert.sh` - it will generate `client.crt`, `client.key`, and `ca.crt`, from the output +of `kubectl config view`. These files can be left in this directory. + +- Copy the `$HOME/.armadactl.yaml` from the Armada server host to your home directory on your client system. + +- You should then be able to run `kubectl get pods -A` and see a list of the running pods on the remote +Armada server, as well as running `armadactl get queues`. + +- Verify the functionality of your setup by editing `scripts/config.sh` and changing the following line: +``` +ARMADA_MASTER=armada://192.168.12.135:30002 +``` +to the IP address or hostname of your Armada server. You should not need to change the port number. 
+ +Also, set the location of the three TLS certificate files by adding/setting: +``` +CLIENT_CERT_FILE=e2e/client.crt +CLIENT_KEY_FILE=e2e/client.key +CLUSTER_CA_FILE=e2e/ca.crt +``` + +- You should be able to now verify the armada-spark configuration by running the E2E tests: +``` +$ ./scripts/dev-e2e.sh +``` +This will save its output to `e2e-test.log` for further debugging. + +--- + +## Development + +Before submitting a pull request, please ensure that your code adheres to the project's coding standards and passes all tests. + +### Testing + +To run the unit tests, use the following command: + +```bash +mvn test +``` + +To run the E2E tests, run Armada using the [Operator Quickstart](https://github.com/armadaproject/armada-operator?tab=readme-ov-file#quickstart) guide, then execute: + +```bash +scripts/test-e2e.sh +``` +### Linting + +To check the code for linting issues, use the following command: + +```bash +mvn spotless:check +``` + +To automatically apply linting fixes, use: + +```bash +mvn spotless:apply +``` + +### E2E + +Make sure that the [SparkPi](#sparkpi-example) job successfully runs on your Armada cluster before submitting a pull request. 
+ +--- + ## Running Example Workloads ### SparkPi diff --git a/e2e/extract-kind-cert.sh b/e2e/extract-kind-cert.sh new file mode 100755 index 00000000..39caef06 --- /dev/null +++ b/e2e/extract-kind-cert.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +CONTEXT="kind-armada" + +E2E_DIR=$(realpath "$0" | xargs dirname) + +cd "$E2E_DIR" || (echo "Error: could not cd to $E2E_DIR"; exit 1) + +# What These Files Are +# - client.crt: Your user (client) certificate +# - client.key: The private key associated with the certificate +# - ca.crt: The CA certificate used by the Kubernetes API server (for verifying client and server certs) + +# Extract the client certificate +kubectl config view --raw -o json | jq -r \ + ".users[] | select(.name == \"${CONTEXT}\") | .user.[\"client-certificate-data\"]" | base64 -d > client.crt + +# Extract the client key +kubectl config view --raw -o json | jq -r \ + ".users[] | select(.name == \"${CONTEXT}\") | .user.[\"client-key-data\"]" | base64 -d > client.key + +# Extract the cluster CA certificate +kubectl config view --raw -o json | jq -r \ + ".clusters[] | select(.name == \"${CONTEXT}\") | .cluster.[\"certificate-authority-data\"]" | base64 -d > ca.crt + diff --git a/e2e/kind-config-external-access.yaml b/e2e/kind-config-external-access.yaml new file mode 100644 index 00000000..131bc28e --- /dev/null +++ b/e2e/kind-config-external-access.yaml @@ -0,0 +1,52 @@ +# A kind configuration for running an Armada server that can be accessed +# outside the host system, for working/developing with remote clients, +# such as Armada-Spark clients. +# +# This configuration will allow you to run kubectl and armadactl +# against the Armada instance on this system. To use this: +# - Copy your $HOME/.kube/config on this system to the same directory +# on your remote client host, then modify that copied file so the +# IP address in there (0.0.0.0) is the address of the external interface +# mentioned below. 
+# - Copy your $HOME/.armadactl.yaml to your $HOME directory on the remote +# client host, in that copied file, change the value of the 'armadaUrl' +# field from 'localhost' to the hostname (or IP address) of this server, +# and below that line a new line (at same indent level), add the entry +# forceNoTls: true +# You should then be able to run `kubectl cluster-info` or +# `armadactl get queues` without errors on the remote client host. +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +nodes: +- role: control-plane + kubeadmConfigPatches: + - | + kind: ClusterConfiguration + apiServer: + certSANs: + - localhost + - 127.0.0.1 + # replace the following line with the IP address + # of the external interface on this system + - 192.168.12.135 + - 0.0.0.0 + extraPortMappings: + # Lookout UI + - containerPort: 30000 + hostPort: 30000 + protocol: TCP + # Armada Server REST API + - containerPort: 30001 + hostPort: 30001 + protocol: TCP + # Armada Server gRPC API + - containerPort: 30002 + hostPort: 30002 + protocol: TCP + # Kubernetes API + - containerPort: 6443 + hostPort: 6443 + protocol: TCP +- role: worker + labels: + armada-spark: true diff --git a/example/jupyter/notebooks/jupyter_armada_spark.ipynb b/example/jupyter/notebooks/jupyter_armada_spark.ipynb index 3db16d77..544ad7fe 100644 --- a/example/jupyter/notebooks/jupyter_armada_spark.ipynb +++ b/example/jupyter/notebooks/jupyter_armada_spark.ipynb @@ -82,7 +82,7 @@ "driver_host = os.environ.get('SPARK_DRIVER_HOST')\n", "driver_port = os.environ.get('SPARK_DRIVER_PORT', '7078')\n", "block_manager_port = os.environ.get('SPARK_BLOCK_MANAGER_PORT', '10061')\n", - "armada_master = os.environ.get('ARMADA_MASTER', 'local://armada://host.docker.internal:30002')\n", + "armada_master = os.environ.get('ARMADA_MASTER', 'armada://host.docker.internal:30002')\n", "armada_queue = os.environ.get('ARMADA_QUEUE', 'default')\n", "armada_namespace = os.environ.get('ARMADA_NAMESPACE', 'default')\n", "image_name = 
os.environ.get('IMAGE_NAME', 'spark:armada')\n", diff --git a/scripts/dev-e2e.sh b/scripts/dev-e2e.sh index 9bead731..99463161 100755 --- a/scripts/dev-e2e.sh +++ b/scripts/dev-e2e.sh @@ -7,7 +7,7 @@ source "$scripts"/init.sh STATUSFILE="$(mktemp)" AOREPO='https://github.com/armadaproject/armada-operator.git' -AOHOME="$scripts/../../armada-operator" +AOHOME=$(realpath "$scripts/../../armada-operator") ARMADACTL_VERSION='0.20.23' GREEN='\033[0;32m' @@ -87,12 +87,37 @@ start-armada() { fi fi - echo "Running 'make kind-all' to install and start Armada; this may take up to 6 minutes" + kind_extern_cfg='e2e/kind-config-external-access.yaml' + if ! cp "$kind_extern_cfg" "$AOHOME/hack/kind-config.yaml"; then + err "There was an error copying $kind_extern_cfg to $AOHOME/hack/kind-config.yaml" + exit 1 + fi + + # Get IP address of first network interface that is not loopback or a K8S internal network interface + external_ip=$(ifconfig -a| grep -w 'inet' | grep -v 'inet 127\.0\.0' | grep -v 'inet 172\.' | awk '{print $2}' | sed -ne '1p') + if [ "$(uname -s)" = 'Darwin' ]; then + sed_opt='-I .bak' + else + sed_opt='-i.bak' + fi + + if ! sed "$sed_opt" -e "s/192.168.12.135/$external_ip/" "$AOHOME/hack/kind-config.yaml"; then + err "There was an error modifying $AOHOME/hack/kind-config.yaml" + exit 1 + fi + + echo "Running 'make kind-all' to install and start Armada; this may take up to 6 minutes" if ! (cd "$AOHOME"; make kind-all 2>&1) | tee armada-start.txt; then echo "" err "There was a problem starting Armada; exiting now" exit 1 fi + + echo "Extracting TLS client certificate files from Kind cluster" + if ! e2e/extract-kind-cert.sh; then + err "There was a problem extracting the certificates" + exit 1 + fi } init-cluster() { @@ -102,6 +127,12 @@ init-cluster() { exit 1 fi + if ! (echo "$INIT_CONTAINER_IMAGE" | grep -Eq '^[[:alnum:]_]+:[[:alnum:]_]+$'); then + err "INIT_CONTAINER_IMAGE is not defined. 
Please set it in $scripts/config.sh, for example:" + err "INIT_CONTAINER_IMAGE=busybox:latest" + exit 1 + fi + if [ -z "$ARMADA_QUEUE" ]; then err "ARMADA_QUEUE is not defined. Please set it in $scripts/config.sh, for example:" err "ARMADA_QUEUE=spark-test" @@ -120,6 +151,17 @@ init-cluster() { exit 1 fi + echo "Checking if image $INIT_CONTAINER_IMAGE is available" + if ! docker image inspect "$INIT_CONTAINER_IMAGE" > /dev/null 2>&1; then + echo "Image $INIT_CONTAINER_IMAGE not found in local Docker instance; pulling it from Docker Hub." + if ! docker pull "$INIT_CONTAINER_IMAGE"; then + err "Could not pull $INIT_CONTAINER_IMAGE; please try running" + err " docker pull $INIT_CONTAINER_IMAGE" + err "then run this script again" + exit 1 + fi + fi + echo "Checking to see if Armada cluster is available ..." if ! "$scripts"/armadactl get queues > "$STATUSFILE" 2>&1 ; then @@ -140,19 +182,50 @@ init-cluster() { mkdir -p "$scripts/.tmp" - TMPDIR="$scripts/.tmp" "$AOHOME/bin/tooling/kind" load docker-image "$IMAGE_NAME" --name armada 2>&1 \ - | log_group "Loading Docker image $IMAGE_NAME into Armada cluster"; + if [[ "$ARMADA_MASTER" == *"//localhost"* ]] ; then + for IMG in "$IMAGE_NAME" "$INIT_CONTAINER_IMAGE"; do + TMPDIR="$scripts/.tmp" "$AOHOME/bin/tooling/kind" load docker-image "$IMG" --name armada 2>&1 \ + | log_group "Loading Docker image $IMG into Armada (Kind) cluster"; + done + fi # configure the defaults for the e2e test - cp $scripts/../e2e/spark-defaults.conf $scripts/../conf/spark-defaults.conf + cp "$scripts/../e2e/spark-defaults.conf" "$scripts/../conf/spark-defaults.conf" - log "Waiting 60 seconds for Armada to stabilize ..." - sleep 60 + # If using a remote Armada server, assume it is already running and ready + if [[ "$ARMADA_MASTER" == *"//localhost"* ]] ; then + log "Waiting 60 seconds for Armada to stabilize ..." + sleep 60 + fi } run-test() { echo "Running Scala E2E test suite..." + if [[ ! 
-d ".spark-$SPARK_VERSION" ]]; then + echo "Checking out Spark sources for tag v$SPARK_VERSION." + git clone https://github.com/apache/spark --branch v$SPARK_VERSION --depth 1 --no-tags ".spark-$SPARK_VERSION" + fi + + cd ".spark-$SPARK_VERSION" + # Spark 3.3.4 does not compile without this fix + if [[ "$SPARK_VERSION" == "3.3.4" ]]; then + sed -i -e "s%2.13.8%2.13.6%" pom.xml + # Fix deprecated openjdk base image - use eclipse-temurin:11-jammy instead. + spark_dockerfile="resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile" + if [ -f "$spark_dockerfile" ]; then + sed -i -e 's|FROM openjdk:|FROM eclipse-temurin:|g' "$spark_dockerfile" + sed -i -E 's/^ARG java_image_tag=11-jre-slim$/ARG java_image_tag=11-jammy/' "$spark_dockerfile" + fi + fi + ./dev/change-scala-version.sh $SCALA_BIN_VERSION + # by packaging the assembly project specifically, jars of all depending Spark projects are fetch from Maven + # spark-examples jars are not released, so we need to build these from sources + ./build/mvn --batch-mode clean + ./build/mvn --batch-mode package -pl examples + ./build/mvn --batch-mode package -Pkubernetes -Pscala-$SCALA_BIN_VERSION -pl assembly + cd .. + # Add armadactl to PATH so the e2e framework can access it PATH="$scripts:$AOHOME/bin/tooling/:$PATH" export PATH @@ -160,23 +233,29 @@ run-test() { # Change to armada-spark directory cd "$scripts/.." 
+ tls_args=() + test -n "${CLIENT_CERT_FILE:-}" && tls_args+=( -Dclient_cert_file="$CLIENT_CERT_FILE" ) + test -n "${CLIENT_KEY_FILE:-}" && tls_args+=( -Dclient_key_file="$CLIENT_KEY_FILE" ) + test -n "${CLUSTER_CA_FILE:-}" && tls_args+=( -Dcluster_ca_file="$CLUSTER_CA_FILE" ) + # Run the Scala E2E test suite - mvn scalatest:test -Dsuites="org.apache.spark.deploy.armada.e2e.ArmadaSparkE2E" \ + # env MAVEN_OPTS='-Dcom.sun.net.ssl.checkRevocation=false' + env KUBERNETES_TRUST_CERTIFICATES=true \ + mvn -e scalatest:test -Dsuites="org.apache.spark.deploy.armada.e2e.ArmadaSparkE2E" \ -Dcontainer.image="$IMAGE_NAME" \ -Dscala.version="$SCALA_VERSION" \ -Dscala.binary.version="$SCALA_BIN_VERSION" \ -Dspark.version="$SPARK_VERSION" \ -Darmada.queue="$ARMADA_QUEUE" \ - -Darmada.master="armada://localhost:30002" \ - -Darmada.lookout.url="http://localhost:30000" \ - -Darmadactl.path="$scripts/armadactl" 2>&1 | \ - tee e2e-test.log - + -Darmada.master="armada://$ARMADA_MASTER" \ + -Darmada.lookout.url="$ARMADA_LOOKOUT_URL" \ + -Darmadactl.path="$scripts/armadactl" \ + ${tls_args[@]:-} 2>&1 | tee e2e-test.log TEST_EXIT_CODE=${PIPESTATUS[0]} if [ "$TEST_EXIT_CODE" -ne 0 ]; then err "E2E tests failed with exit code $TEST_EXIT_CODE" - exit $TEST_EXIT_CODE + exit "$TEST_EXIT_CODE" fi log "E2E tests completed successfully" @@ -187,4 +266,4 @@ main() { run-test } -main \ No newline at end of file +main diff --git a/scripts/init.sh b/scripts/init.sh index 7c021799..a5e22c81 100644 --- a/scripts/init.sh +++ b/scripts/init.sh @@ -104,6 +104,7 @@ export INCLUDE_PYTHON="${INCLUDE_PYTHON:-false}" export USE_KIND="${USE_KIND:-false}" export IMAGE_NAME="${IMAGE_NAME:-spark:armada}" export ARMADA_MASTER="${ARMADA_MASTER:-armada://localhost:30002}" +export ARMADA_LOOKOUT_URL="${ARMADA_LOOKOUT_URL:-https://localhost:30000}" export ARMADA_INTERNAL_URL="${ARMADA_INTERNAL_URL:-armada://armada-server.armada:50051}" export ARMADA_QUEUE="${ARMADA_QUEUE:-test}" export 
ARMADA_AUTH_TOKEN=${ARMADA_AUTH_TOKEN:-} @@ -112,9 +113,20 @@ export ARMADA_EVENT_WATCHER_USE_TLS=${ARMADA_EVENT_WATCHER_USE_TLS:-false} export SPARK_BLOCK_MANAGER_PORT=${SPARK_BLOCK_MANAGER_PORT:-} export SCALA_CLASS="${SCALA_CLASS:-org.apache.spark.examples.SparkPi}" export RUNNING_E2E_TESTS="${RUNNING_E2E_TESTS:-false}" +export INIT_CONTAINER_IMAGE="${INIT_CONTAINER_IMAGE:-busybox:latest}" export USE_FALLBACK_STORAGE="${USE_FALLBACK_STORAGE:-false}" export SPARK_SECRET_KEY="${SPARK_SECRET_KEY:-armada-secret}" +if [ -n "${CLIENT_CERT_FILE:-}" ]; then + export CLIENT_CERT_FILE="${CLIENT_CERT_FILE}" +fi +if [ -n "${CLIENT_CERT_KEY:-}" ]; then + export CLIENT_CERT_KEY="${CLIENT_CERT_KEY}" +fi +if [ -n "${CLUSTER_CA_FILE:-}" ]; then + export CLUSTER_CA_FILE="${CLUSTER_CA_FILE}" +fi + ARMADA_AUTH_ARGS=() # Add auth script path if configured if [ "$ARMADA_AUTH_SCRIPT_PATH" != "" ]; then diff --git a/scripts/set-version.sh b/scripts/set-version.sh index 283ea44a..916fa608 100755 --- a/scripts/set-version.sh +++ b/scripts/set-version.sh @@ -1,6 +1,24 @@ #!/bin/bash -root="$(cd "$(dirname "$0")/.."; pwd)" +root="$(cd "$(dirname "$0")/.." || exit; pwd)" +SED="sed" +OS=$(uname -s) + +# The sed that macOS ships does not understand all the regex patterns (which +# we use) that GNU sed does, so look for 'gsed' and use that, if available. +if [ "$OS" = 'Darwin' ]; then + sed_location=$(type -p $SED) + if [ "$sed_location" = '/usr/bin/sed' ]; then + type -p gsed > /dev/null + if [ $? -eq 0 ]; then + SED=gsed + else + echo "$0: the version of sed on this system ($sed_location) does not handle" > /dev/stderr + echo "all the GNU sed extensions needed. 
Please install 'gsed' and re-run this script" > /dev/stderr + exit 1 + fi + fi +fi if [ $# -eq 2 ] then @@ -33,7 +51,7 @@ then fi echo "setting spark=$spark and scala=$scala" - sed -i -E \ + $SED -i -E \ -e "s%^( )([^_]+)[_0-9.]+()$%\1\2_${scala_compat}\3%" \ -e "s%^( ).+()$%\1${scala_major}\2%" \ -e "s%^( ).+()$%\1${scala_minor}\2%" \ @@ -45,7 +63,8 @@ then -e "s%^( ).+()$%\1${jackson_version}\2%" \ "$root/pom.xml" else - echo "Provide the Spark and Scala version to set" + echo "Provide the Spark and Scala version to set; for example:" + echo " $0 3.5.5 2.13.5" exit 1 fi diff --git a/src/test/scala/org/apache/spark/deploy/armada/e2e/K8sClient.scala b/src/main/scala/org/apache/spark/deploy/armada/K8sClient.scala similarity index 71% rename from src/test/scala/org/apache/spark/deploy/armada/e2e/K8sClient.scala rename to src/main/scala/org/apache/spark/deploy/armada/K8sClient.scala index d8c92e75..3cd739b8 100644 --- a/src/test/scala/org/apache/spark/deploy/armada/e2e/K8sClient.scala +++ b/src/main/scala/org/apache/spark/deploy/armada/K8sClient.scala @@ -15,21 +15,78 @@ * limitations under the License. */ -package org.apache.spark.deploy.armada.e2e +package org.apache.spark.deploy.armada import org.apache.spark.deploy.armada.Config -import io.fabric8.kubernetes.client.{DefaultKubernetesClient, KubernetesClient} +import org.apache.spark.deploy.armada.submit +import io.fabric8.kubernetes.client.{ + ConfigBuilder, + DefaultKubernetesClient, + KubernetesClient, + KubernetesClientBuilder +} import io.fabric8.kubernetes.api.model.{NamespaceBuilder, Pod} import io.fabric8.kubernetes.api.model.networking.v1.Ingress +import org.yaml.snakeyaml.Yaml + +import java.io.FileReader import java.util.concurrent.TimeoutException +import java.util.Properties import scala.jdk.CollectionConverters._ import scala.concurrent.duration._ import scala.concurrent.{ExecutionContext, Future} /** Kubernetes client implementation using fabric8 Kubernetes client library. 
*/ -class K8sClient { - private val client: KubernetesClient = new DefaultKubernetesClient() +class K8sClient(props: Properties) { + val armadaMaster: String = props.getProperty("armada.master") + val pattern = """armada://([^:]+):.*""".r + + // If armadaMaster is local, derive k8sApiURL from ~/.kube/config, + // which `kind` will create/update. + val yaml = new Yaml() + val home = System.getProperty("user.home") + val data = yaml.load[java.util.Map[String, Object]](new FileReader(s"$home/.kube/config")) + var k8sApiURL = "no-K8S-server-found" + + val clusters = + data.get("clusters").asInstanceOf[java.util.List[java.util.Map[String, Object]]].asScala + clusters.foreach { entry => + val cluster = entry.get("cluster").asInstanceOf[java.util.Map[String, Object]] + k8sApiURL = cluster.get("server").toString() + } + + println(s"-------- K8sClient(): armadaMaster = ${armadaMaster}") + println(s"-------- K8sClient(): k8sApiURL= ${k8sApiURL}") + + val clientCertFile: String = props.getProperty("client_cert_file", "") + val clientKeyFile: String = props.getProperty("client_key_file", "") + val clusterCaFile: String = props.getProperty("cluster_ca_file", "") + + var cb: ConfigBuilder = new ConfigBuilder() + .withMasterUrl(k8sApiURL) + // .withOauthToken("sha256~secret") + .withNamespace("default") + + if (clusterCaFile.nonEmpty) { + cb = cb.withCaCertFile(clusterCaFile) + } + + if (clientCertFile.nonEmpty) { + cb = cb.withClientCertFile(clientCertFile) + } + + if (clientKeyFile.nonEmpty) { + cb = cb.withClientKeyFile(clientKeyFile) + } + + val cfg = cb + .withClientKeyAlgo("RSA") + .build() + + private val client: KubernetesClient = new KubernetesClientBuilder() + .withConfig(cfg) + .build() def createNamespace(name: String)(implicit ec: ExecutionContext): Future[Unit] = Future { val namespace = new NamespaceBuilder() diff --git a/src/main/scala/org/apache/spark/deploy/armada/submit/ArmadaUtils.scala b/src/main/scala/org/apache/spark/deploy/armada/submit/ArmadaUtils.scala 
index 4f09d748..61f8f289 100644 --- a/src/main/scala/org/apache/spark/deploy/armada/submit/ArmadaUtils.scala +++ b/src/main/scala/org/apache/spark/deploy/armada/submit/ArmadaUtils.scala @@ -28,11 +28,7 @@ object ArmadaUtils { import ArmadaUtilsExceptions._ def parseMasterUrl(masterUrl: String): (String, Int) = { - val startString = if (masterUrl.startsWith("local")) { - "local://armada://" - } else { - "armada://" - } + val startString = "armada://" Some(masterUrl) .map(_.substring(startString.length).split(":").toSeq) .filter(_.length == 2) diff --git a/src/test/scala/org/apache/spark/deploy/armada/e2e/ArmadaOperations.scala b/src/test/scala/org/apache/spark/deploy/armada/e2e/ArmadaOperations.scala index 5d0c1e0e..3e40b611 100644 --- a/src/test/scala/org/apache/spark/deploy/armada/e2e/ArmadaOperations.scala +++ b/src/test/scala/org/apache/spark/deploy/armada/e2e/ArmadaOperations.scala @@ -213,7 +213,23 @@ class ArmadaClient(armadaUrl: String = "localhost:30002") { val armadactlCmd = resolveArmadactlPath.getOrElse { throw new RuntimeException("armadactl not found in system properties or PATH") } - Seq(armadactlCmd) ++ subCommand.split(" ") ++ Seq("--armadaUrl", armadaUrl) + // armadactl command expects the server address to be of the form + // : with no pseudo-protocol prefix + val pattern = """.*armada://(.+)""".r + var armadactlUrl = "undefined-armadactl-url" + + armadaUrl match { + case pattern(hostPort) => armadactlUrl = hostPort // e.g. 
"localhost:30002" + case _ => + throw new RuntimeException( + s"could not extract valid armadactl URL from armada URL ${armadaUrl}" + ) + } + + // var armadactlUrl = armadaUrl.replaceFirst("^armada://", "") + println(s"-------- buildCommand(): armadaUrl = ${armadaUrl}") + println(s"-------- buildCommand(): armadactlUrl = ${armadactlUrl}") + Seq(armadactlCmd) ++ subCommand.split(" ") ++ Seq("--armadaUrl", armadactlUrl) } /** Resolves the path to `armadactl`: diff --git a/src/test/scala/org/apache/spark/deploy/armada/e2e/ArmadaSparkE2E.scala b/src/test/scala/org/apache/spark/deploy/armada/e2e/ArmadaSparkE2E.scala index db092bd9..f7b6b592 100644 --- a/src/test/scala/org/apache/spark/deploy/armada/e2e/ArmadaSparkE2E.scala +++ b/src/test/scala/org/apache/spark/deploy/armada/e2e/ArmadaSparkE2E.scala @@ -17,6 +17,8 @@ package org.apache.spark.deploy.armada.e2e +import org.apache.spark.deploy.armada.K8sClient + import org.scalatest.BeforeAndAfterAll import org.scalatest.funsuite.AnyFunSuite import org.scalatest.matchers.should.Matchers @@ -42,9 +44,9 @@ class ArmadaSparkE2E private val baseQueueName = "e2e-template" - private lazy val armadaClient = new ArmadaClient() - private lazy val k8sClient = new K8sClient() - private lazy val orchestrator = new TestOrchestrator(armadaClient, k8sClient) + private var armadaClient: ArmadaClient = _ + private var k8sClient: K8sClient = _ + implicit private var orch: TestOrchestrator = _ private var baseConfig: TestConfig = _ @@ -59,6 +61,11 @@ class ArmadaSparkE2E val props = loadProperties() + val armadaApiUrl = props.getProperty("armada.master", "localhost:30002") + armadaClient = new ArmadaClient(armadaApiUrl) + k8sClient = new K8sClient(props) + orch = new TestOrchestrator(armadaClient, k8sClient) + // Get Scala binary version - either from system property or derive from full version // This should be "2.12" or "2.13", not the full version like "2.12.15" val scalaBinaryVersion = props.getProperty("scala.binary.version") match { @@ 
-148,8 +155,6 @@ class ArmadaSparkE2E super.afterAll() } - implicit val orch: TestOrchestrator = orchestrator - private def loadProperties(): Properties = { val props = new Properties() diff --git a/src/test/scala/org/apache/spark/deploy/armada/e2e/TestAssertions.scala b/src/test/scala/org/apache/spark/deploy/armada/e2e/TestAssertions.scala index f4a5cea0..9b5e9c39 100644 --- a/src/test/scala/org/apache/spark/deploy/armada/e2e/TestAssertions.scala +++ b/src/test/scala/org/apache/spark/deploy/armada/e2e/TestAssertions.scala @@ -20,6 +20,7 @@ package org.apache.spark.deploy.armada.e2e import java.util.concurrent.atomic.AtomicInteger import io.fabric8.kubernetes.api.model.Pod +import org.apache.spark.deploy.armada.K8sClient import org.apache.spark.deploy.armada.Config import scala.jdk.CollectionConverters._ import scala.concurrent.{ExecutionContext, Future} diff --git a/src/test/scala/org/apache/spark/deploy/armada/e2e/TestOrchestrator.scala b/src/test/scala/org/apache/spark/deploy/armada/e2e/TestOrchestrator.scala index ca74df48..28584bb5 100644 --- a/src/test/scala/org/apache/spark/deploy/armada/e2e/TestOrchestrator.scala +++ b/src/test/scala/org/apache/spark/deploy/armada/e2e/TestOrchestrator.scala @@ -20,6 +20,7 @@ package org.apache.spark.deploy.armada.e2e import java.io.File import java.util.UUID import java.util.concurrent.TimeoutException +import org.apache.spark.deploy.armada.K8sClient import scala.concurrent.{ExecutionContext, Future} import scala.concurrent.duration._ import TestConstants._ @@ -74,6 +75,7 @@ class TestOrchestrator( armadaClient: ArmadaClient, k8sClient: K8sClient )(implicit ec: ExecutionContext) { + val sparkRepoCopy = ".spark-3.5.5" private val jobSubmitTimeout = JobSubmitTimeout private val jobWatchTimeout = JobWatchTimeout @@ -253,6 +255,7 @@ class TestOrchestrator( println(s"Test ID: ${context.testId}") println(s"Namespace: ${context.namespace}") println(s"Queue: $queueName") + println(s"MasterURL: ${config.masterUrl}") 
println(s"Deployment Mode: $modeType") val resultFuture = for { @@ -356,7 +359,7 @@ class TestOrchestrator( "spark.submit.deployMode" -> deployMode ) - val dockerCommand = buildDockerCommand( + val runTestCommand = buildRunTestCommand( config.imageName, volumeMounts, config.masterUrl, @@ -384,7 +387,7 @@ class TestOrchestrator( println(s"[SUBMIT] $key = $displayValue") } // Properly escape command for shell reproduction - val escapedCommand = dockerCommand.map { arg => + val escapedCommand = runTestCommand.map { arg => if (arg.contains(" ") || arg.contains("'") || arg.contains("\"")) { "'" + arg.replace("'", "'\\''") + "'" } else arg @@ -395,7 +398,7 @@ class TestOrchestrator( def attemptSubmit(attempt: Int = 1): ProcessResult = { // In client mode, spark-submit runs until application completes, so use longer timeout val timeout = if (!modeHelper.isDriverInCluster) jobWatchTimeout else jobSubmitTimeout - val result = ProcessExecutor.executeWithResult(dockerCommand, timeout) + val result = ProcessExecutor.executeWithResult(runTestCommand, timeout) if (result.exitCode != 0) { val allOutput = result.stdout + "\n" + result.stderr @@ -588,7 +591,7 @@ class TestOrchestrator( TestResult(jobSetId, queueName, finalStatus, assertionResults) } - private def buildDockerCommand( + private def buildRunTestCommand( imageName: String, volumeMounts: Seq[String], masterUrl: String, @@ -605,6 +608,11 @@ class TestOrchestrator( val deployMode = if (modeHelper.isDriverInCluster) "cluster" else "client" val isClientMode = !modeHelper.isDriverInCluster + val driverClassPath = Seq( + ".", + "./target/armada-cluster-manager_2.13-1.0.0-SNAPSHOT-all.jar" + ).mkString(":") + val baseCommand = Seq( "docker", "run", diff --git a/src/test/scala/org/apache/spark/deploy/armada/submit/ArmadaClientApplicationSuite.scala b/src/test/scala/org/apache/spark/deploy/armada/submit/ArmadaClientApplicationSuite.scala index 6142a7e0..e8c7ec61 100644 --- 
a/src/test/scala/org/apache/spark/deploy/armada/submit/ArmadaClientApplicationSuite.scala +++ b/src/test/scala/org/apache/spark/deploy/armada/submit/ArmadaClientApplicationSuite.scala @@ -61,7 +61,6 @@ class ArmadaClientApplicationSuite extends AnyFunSuite with BeforeAndAfter with private val SPARK_DRIVER_URL = "SPARK_DRIVER_URL" // Constants for paths - private val PYTHON_EXAMPLE_PATH = "/opt/spark/examples/src/main/python/pi.py" private val clientArguments = ClientArguments( mainAppResource = JavaMainAppResource(Some("app.jar")), mainClass = "org.example.SparkApp",