diff --git a/Dockerfile b/Dockerfile index 44f75b6..0763f55 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,21 +1,7 @@ # select operating system -FROM ubuntu:16.04 +FROM alpine -# install operating system packages -RUN apt-get update -y && apt-get install git curl gettext unzip wget software-properties-common python python-software-properties python-pip python3-pip dnsutils make -y - -## add more packages, if necessary -# install Java8 -RUN add-apt-repository ppa:webupd8team/java -y && apt-get update && apt-get -y install openjdk-8-jdk-headless - -# install boto3 library for PySpark applications to connect to S3 -RUN pip install boto3==1.9 - - -# use bpkg to handle complex bash entrypoints -RUN curl -Lo- "https://raw.githubusercontent.com/bpkg/bpkg/master/setup.sh" | bash -RUN bpkg install cha87de/bashutil -g -## add more bash dependencies, if necessary +# install operating system packages # add config, init and source files # entrypoint @@ -23,26 +9,23 @@ ADD init /opt/docker-init ADD conf /opt/docker-conf # folders -RUN mkdir /opt/apache-livy -RUN mkdir /var/apache-spark-binaries/ - -# binaries -# apache livy -RUN wget http://mirror.23media.de/apache/incubator/livy/0.5.0-incubating/livy-0.5.0-incubating-bin.zip -O /tmp/livy.zip -RUN unzip /tmp/livy.zip -d /opt/ -# Logging dir -RUN mkdir /opt/livy-0.5.0-incubating-bin/logs - -# apache spark -RUN wget https://archive.apache.org/dist/spark/spark-2.3.1/spark-2.3.1-bin-hadoop2.7.tgz -O /tmp/spark-2.3.1-bin-hadoop2.7.tgz -RUN tar -xvzf /tmp/spark-2.3.1-bin-hadoop2.7.tgz -C /opt/ - -# set Python3 as default -RUN rm /usr/bin/python -RUN ln -s /usr/bin/python3 /usr/bin/python - +RUN apk add unzip wget curl git bash openjdk8 gettext make coreutils procps \ + && apk update \ + && wget https://www-eu.apache.org/dist/incubator/livy/0.7.0-incubating/apache-livy-0.7.0-incubating-bin.zip -O /tmp/livy.zip \ + && apk add --no-cache libc6-compat \ + && ln -s /lib/libc.musl-x86_64.so.1 /lib/ld-linux-x86-64.so.2 \ + && wget 
https://archive.apache.org/dist/spark/spark-2.4.3/spark-2.4.3-bin-hadoop2.7.tgz -O /tmp/spark.tgz \ + && unzip /tmp/livy.zip -d /opt/ \ + && tar -xvzf /tmp/spark.tgz -C /opt/ \ + # postgres jar + && wget https://jdbc.postgresql.org/download/postgresql-42.2.10.jar -P /opt/spark-2.4.3-bin-hadoop2.7/jars \ + # spark-excel support + && wget https://oss.sonatype.org/content/repositories/public/com/crealytics/spark-excel_2.11/0.13.1/spark-excel_2.11-0.13.1.jar -P /opt/spark-2.4.3-bin-hadoop2.7/jars \ + && wget https://repo1.maven.org/maven2/org/apache/commons/commons-collections4/4.1/commons-collections4-4.1.jar -P /opt/spark-2.4.3-bin-hadoop2.7/jars \ + && wget https://repo1.maven.org/maven2/org/apache/xmlbeans/xmlbeans/3.1.0/xmlbeans-3.1.0.jar -P /opt/spark-2.4.3-bin-hadoop2.7/jars \ + && wget https://repo1.maven.org/maven2/org/apache/poi/poi-ooxml-schemas/4.1.2/poi-ooxml-schemas-4.1.2.jar -P /opt/spark-2.4.3-bin-hadoop2.7/jars \ + && git clone https://github.com/cha87de/bashutil.git - # expose ports EXPOSE 8998 diff --git a/README.md b/README.md index 2409f44..15e6619 100644 --- a/README.md +++ b/README.md @@ -10,9 +10,9 @@ Based on the latest release of the [Apache Livy project](https://livy.incubator. ### Supported Versions: -Livy-Server: 0.5.0 +Livy-Server: 0.7.0 -Apache Spark: 2.3.1 +Apache Spark: 2.4.3 Python: Python3 (including boto3 1.9) @@ -37,6 +37,6 @@ Livy Server start on default port 8998 ### Usage: ```bash -docker run -d -p 8998:8998 -e SPARK_MASTER_ENDPOINT=1.2.3.4 -e SPARK_MASTER_PORT=7077 -v /tmp:/tmp cloudiator/livy-server-docker:latest +docker run -d -p 8998:8998 -e SPARK_MASTER_ENDPOINT=1.2.3.4 -e SPARK_MASTER_PORT=7077 -v /tmp:/tmp cloudiator/livy-server:latest ``` diff --git a/examples/pi.py b/examples/pi.py deleted file mode 100644 index 5839cc2..0000000 --- a/examples/pi.py +++ /dev/null @@ -1,47 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. 
See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from __future__ import print_function - -import sys -from random import random -from operator import add - -from pyspark.sql import SparkSession - - -if __name__ == "__main__": - """ - Usage: pi [partitions] - """ - spark = SparkSession\ - .builder\ - .appName("PythonPi")\ - .getOrCreate() - - partitions = int(sys.argv[1]) if len(sys.argv) > 1 else 2 - n = 100000 * partitions - - def f(_): - x = random() * 2 - 1 - y = random() * 2 - 1 - return 1 if x ** 2 + y ** 2 <= 1 else 0 - - count = spark.sparkContext.parallelize(range(1, n + 1), partitions).map(f).reduce(add) - print("Pi is roughly %f" % (4.0 * count / n)) - - spark.stop() diff --git a/init/entrypoint b/init/entrypoint index 53252eb..41447ca 100755 --- a/init/entrypoint +++ b/init/entrypoint @@ -1,18 +1,19 @@ #!/bin/bash # include bpkg dependencies -source /usr/local/bin/retry -source /usr/local/bin/bgo -source /usr/local/bin/bgowait - -# global variables -GLOBAL_VAR="xyz" +source /bashutil/retry +source /bashutil/bgo +source /bashutil/bgowait +[[ -z "${LIVY_EXECUTION_MODE}" ]] && mode='local' || mode="${LIVY_EXECUTION_MODE}" ############################################################################## # validate if all container variables are set ############################################################################## 
function validate(){ + if [[ $mode == "local" ]]; then + return 0 + elif [[ $mode == "cluster" ]]; then vars="SPARK_MASTER_ENDPOINT SPARK_MASTER_PORT DEPLOY_MODE" for var in $vars; do if [[ $(env | awk -F "=" '{print $1}' | grep "^$var$") != "$var" ]]; then @@ -20,22 +21,24 @@ function validate(){ return 1 fi done - if [[ -z ${GLOBAL_VAR+x} ]]; then - echo "GLOBAL_VAR variable cannot be looked up." - return 1 - fi + else + echo -e "Unsupported value $LIVY_EXECUTION_MODE" + return 1 + fi } ############################################################################## # write config vars with configfile template ############################################################################## function writeConfigOptions(){ + if [ $mode == "cluster" ]; then echo "write config options" export SPARK_MASTER_ENDPOINT=$SPARK_MASTER_ENDPOINT export SPARK_MASTER_PORT=$SPARK_MASTER_PORT export DEPLOY_MODE=$DEPLOY_MODE - - cat /opt/docker-conf/livy.conf | envsubst > /opt/livy-0.5.0-incubating-bin/conf/livy.conf + + cat /opt/docker-conf/livy.conf | envsubst > /opt/apache-livy-0.7.0-incubating-bin/conf/livy.conf + fi } function init(){ @@ -58,9 +61,9 @@ function init(){ function livy_server_service(){ - export SPARK_HOME=/opt/spark-2.3.1-bin-hadoop2.7/ + export SPARK_HOME=/opt/spark-2.4.3-bin-hadoop2.7/ echo "starting Livy Server!" - /opt/livy-0.5.0-incubating-bin/bin/livy-server start + /opt/apache-livy-0.7.0-incubating-bin/bin/livy-server start # whatever blocking call tail -f /dev/null