From 8ac1de00d9dcde51d290775defe1411725bdb6c4 Mon Sep 17 00:00:00 2001 From: anaderi Date: Mon, 21 Mar 2016 19:48:07 -0400 Subject: [PATCH 1/7] cluster handling initial commit --- cluster/.gitignore | 2 + cluster/Dockerfile | 6 + cluster/Makefile | 199 +++++++++++++++++++++++++++++++ cluster/cluster.txt.orig | 9 ++ cluster/start_jupyter.sh | 81 +++++++++++++ cluster/tests/test.py | 1 + cluster/tests/test_ipp_full.py | 45 +++++++ cluster/tests/test_ipp_simple.py | 5 + 8 files changed, 348 insertions(+) create mode 100644 cluster/.gitignore create mode 100644 cluster/Dockerfile create mode 100644 cluster/Makefile create mode 100644 cluster/cluster.txt.orig create mode 100755 cluster/start_jupyter.sh create mode 100644 cluster/tests/test.py create mode 100644 cluster/tests/test_ipp_full.py create mode 100644 cluster/tests/test_ipp_simple.py diff --git a/cluster/.gitignore b/cluster/.gitignore new file mode 100644 index 00000000..b41a23b8 --- /dev/null +++ b/cluster/.gitignore @@ -0,0 +1,2 @@ +cluster.txt +_* diff --git a/cluster/Dockerfile b/cluster/Dockerfile new file mode 100644 index 00000000..b5fc9f1e --- /dev/null +++ b/cluster/Dockerfile @@ -0,0 +1,6 @@ +FROM yandex/rep:0.6.5 +ENV HOME /root +RUN bash --login -c "pip install ipyparallel==4.1.0" +COPY start_jupyter.sh /root/start_jupyter.sh + +CMD ["/bin/bash", "--login", "-c", "$HOME/start_jupyter.sh"] diff --git a/cluster/Makefile b/cluster/Makefile new file mode 100644 index 00000000..19a9f62d --- /dev/null +++ b/cluster/Makefile @@ -0,0 +1,199 @@ +# Makefile for building & starting rep-containers +# arguments can be supplied by -e definitions: +# +# IMAGE -- name of image to use +# N -- number of slave nodes +# HEADIP -- IP of head of the cluster +# + +N ?= 1 ## number of nodes in +HEADIP ?= 10.16.23.10 +DOCKER_NODE=docker -H tcp://0.0.0.0:2375 +# NODE_OPTS=-H tcp://0.0.0.0:2375 +PSSH=parallel-ssh +CLUSTER=cluster.txt +CLUSTER_SWARM=_cluster_swarm.txt +SWARM_OPT=-H tcp://localhost:31338 
+DOCKER_PACKAGE=docker-engine=1.10.3-0~wheezy +SWARM_IMAGE=swarm:1.1.3 +IMAGE ?= yandex/rep-ipp:0.6.5 +IMAGE_BUILD=yandex/rep-ipp:0.6.5 +HERE:=$(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) +SHELL=/bin/bash +MASTER=ipp_master +LOCAL_IMAGE_BUILD=${HEADIP}:5000/${IMAGE_BUILD} + +help: + @echo Usage: make [-e VARIABLE=VALUE] targets + @echo "variables:" + @grep -h "#\s\+\w\+ -- " $(MAKEFILE_LIST) |sed "s/#\s//" + @echo + @echo targets and corresponding dependencies: + @fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' -e 's/^/ /' | sed -e 's/##//' + +${CLUSTER_SWARM}: ${CLUSTER} + for i in `cat cluster.txt`; do \ + grep $$i /etc/hosts| awk '{print $$1":2375"}' ; done > ${CLUSTER_SWARM} + # cat ${CLUSTER} | sed 's/$$/:2375/' > ${CLUSTER_SWARM} + +lupdate-debian7-kernel: ## update kernel http://unix.stackexchange.com/questions/115898/how-do-i-upgrade-the-debian-wheezy-kernel-offline + echo "deb http://ftp.pl.debian.org/debian wheezy-backports main" | tee -a /etc/apt/sources.list + apt-get update + aptitude -t wheezy-backports install linux-image-amd64 + reboot + +pupdate-debian7-kernel: ${CLUSTER} + ${PSSH} -h ${CLUSTER} --timeout=0 -i " \ + echo 'deb http://ftp.pl.debian.org/debian wheezy-backports main' | tee -a /etc/apt/sources.list ; \ + apt-get update ; \ + aptitude -y -q -t wheezy-backports install linux-image-amd64" + +puname: ${CLUSTER} + ${PSSH} -h ${CLUSTER} --timeout=0 -i "uname -a" + +preboot: ${CLUSTER} + ${PSSH} -h ${CLUSTER} --timeout=0 -i "reboot" + +pdocker-repo-debian7: ${CLUSTER} ## install docker Debian7 (https://docs.docker.com/engine/installation/linux/debian/#debian-wheezy-stable-7-x-64-bit) + ${PSSH} -h ${CLUSTER} --timeout=0 " \ + apt-get purge docker.io ; \ + apt-get -y -q install apt-transport-https ca-certificates ; \ + apt-key adv --keyserver hkp://p80.pool.sks-keyservers.net:80 --recv-keys 58118E89F3A912897C070ADBF76221572C52609D ; \ + echo 'deb https://apt.dockerproject.org/repo debian-wheezy main' | sudo tee 
/etc/apt/sources.list.d/docker.list ; \ + apt-get update ; \ + apt-cache policy docker-engine && \ + apt-get install -y ${DOCKER_PACKAGE} " + +pdocker-options: ${CLUSTER} + ${PSSH} -h ${CLUSTER} --timeout=0 " \ + echo 'DOCKER_OPTS=\"-H tcp://0.0.0.0:2375 -H unix:///var/run/docker.sock --insecure-registry ${HEADIP}:5000\"' | tee -a /etc/default/docker ; \ + service docker restart " + + +pdocker-repo-ubuntu: + ${PSSH} -h ${CLUSTER} -i 'echo "deb https://apt.dockerproject.org/repo ubuntu-trusty main" | sudo tee /etc/apt/sources.list.d/docker.list; cat /etc/apt/sources.list.d/docker.list' + ${PSSH} -h ${CLUSTER} 'sudo apt-get update' + +pdocker-install: ${CLUSTER} + ${PSSH} -h ${CLUSTER} -i 'sudo apt-get install -y --force-yes docker-engine=1.9.1-0~trusty' + +docker-versions: ${CLUSTER} + ${PSSH} -h ${CLUSTER} -i "${DOCKER_NODE} version" + +pping: ## ping cluster + ${PSSH} -h ${CLUSTER} -i pwd + +pclean-id: ${CLUSTER} + ${PSSH} -h ${CLUSTER} "rm /etc/docker/key.json && service docker restart" + +puptime: ## uptime cluster + ${PSSH} -h ${CLUSTER} -i uptime + +_check-swarm-stopped: + if [[ `docker ps | grep swarm` != "" ]] ; then echo "swarm master is already running" ; exit 1; fi + +start-swarm: _check-swarm-stopped ${CLUSTER_SWARM} ## start swarm master + docker run -v ${HERE}:/cfg -d -p 0.0.0.0:31338:2375 --restart=always --name=swarm_master ${SWARM_IMAGE} manage -H 0.0.0.0:2375 --strategy spread file:///cfg/${CLUSTER_SWARM} + +stop-swarm: ## stop swarm master + docker rm -f swarm_master + +restart-swarm: stop-swarm start-swarm ## restart swarm master + +swarm-info: ## check swarm + docker ${SWARM_OPT} info + +start-master-interactive: + docker run -p 32000-32100:32000-32100 --name ${MASTER} -ti --rm -v /home/ipp/security:/root/.ipython/profile_default/security ${LOCAL_IMAGE_BUILD} bash + +start-master: + docker run -p 32000-32100:32000-32100 -p 8888:8888 -e IPPHUB_IP=${HEADIP} --name ${MASTER} -d -v /home/ipp/security:/root/.ipython/profile_default/security 
${LOCAL_IMAGE_BUILD} + echo "test" | nc localhost 8888 + +start-slaves: + for i in `seq $N` ; do \ + echo "start slave $$i" ; \ + docker ${SWARM_OPT} run -d -v /home/ipp/security:/root/.ipython/profile_default/security ${LOCAL_IMAGE_BUILD} bash --login -c ipengine || exit 1 ; \ + done + echo "Running `docker ${SWARM_OPT} ps -q |wc -l` clusters" + +is-master-running: + docker ps -a | grep "${MASTER}$$" > /dev/null + +test-master: is-master-running ## test parallel script + docker cp test ${MASTER}:/root + docker exec ${MASTER} bash --login -c 'for p in tests/* ; do echo $$p ; python $$p || exit 1 ; done' + +exec-master: is-master-running + docker exec -ti ${MASTER} bash + +start-cluster: start-master start-slaves test-master + +list-slaves: + docker ${SWARM_OPT} ps --no-trunc + +stop-slaves: + docker ${SWARM_OPT} ps -q | xargs --no-run-if-empty docker ${SWARM_OPT} rm -f + +stop-master: is-master-running + docker stop ${MASTER} + docker rm ${MASTER} + +stop-cluster: stop-slaves stop-master + echo OK + +#### REGISTRY + +start-registry: + mkdir -p /home/ipp/registry + docker run -d -p 5000:5000 --restart=always --name registry -v /home/ipp/registry:/var/lib/registry registry:2 + echo http://${HEADIP}:5000 + +stop-registry: + docker stop registry + docker rm -f registry + +list-registry: + # https://docs.docker.com/registry/spec/api/ + curl http://10.16.23.10:5000/v2/_catalog + +build-image: + #docker build -t ${IMAGE_BUILD} ipp-image + # docker ${SWARM_OPT} build -t ${IMAGE_BUILD} ipp-image + docker tag ${IMAGE_BUILD} ${LOCAL_IMAGE_BUILD} + +push-image: + docker push ${LOCAL_IMAGE_BUILD} + +pull-image-swarm: + docker ${SWARM_OPT} pull ${LOCAL_IMAGE_BUILD} + + +##### Obsolete? 
+ +ps-user-containers: ${CLUSTER} ## list container running on the cluster + ${PSSH} -h ${CLUSTER} -i '${DOCKER_NODE} ps -a' + +images: ${CLUSTER} ## list images created at clusters + ${PSSH} -h ${CLUSTER} -i '${DOCKER_NODE} images' + +rm-images: ${CLUSTER} ## remove all images + ${PSSH} -h ${CLUSTER} -i '${DOCKER_NODE} images -q | xargs ${DOCKER_NODE} rmi' + +rm-user-containers: ${CLUSTER} ## stop & remove user containers + ${PSSH} -h ${CLUSTER} --timeout=0 -i '${DOCKER_NODE} ps -q|xargs ${DOCKER_NODE} -f rm' + +swarm-ps: ## list containers running in swarm + docker ${SWARM_OPT} ps + +swarm-psa: ## list all containers in swarm + docker ${SWARM_OPT} ps -a + +start-monitor: ## start monitoring (bosun) + docker -p 0.0.0.0:4242:4242 -p 0.0.0.0:8070:8070 stackexchange/bosun + +pdf: ${CLUSTER} ## check disk free space on cluster nodes + ${PSSH} -h ${CLUSTER} -i df -h / + +mdu: ## mfs du + du -m --max-depth 1 /opt/mfs diff --git a/cluster/cluster.txt.orig b/cluster/cluster.txt.orig new file mode 100644 index 00000000..02e161d4 --- /dev/null +++ b/cluster/cluster.txt.orig @@ -0,0 +1,9 @@ +farm-wn1 +farm-wn2 +farm-wn3 +farm-wn4 +farm-wn5 +farm-wn6 +farm-wn7 +farm-wn8 +farm-wn9 \ No newline at end of file diff --git a/cluster/start_jupyter.sh b/cluster/start_jupyter.sh new file mode 100755 index 00000000..c8ee70eb --- /dev/null +++ b/cluster/start_jupyter.sh @@ -0,0 +1,81 @@ +#!/bin/bash + +# script to start jupyter inside docker container +# fine-tuned by set of environment variables (see the code), +# e.g. 
runs under jupyterhub environment in case JPY_API_TOKEN is set + +set +xv + +[ -z "$ENV_BIN_DIR" ] && source /etc/profile.d/rep_profile.sh + +if [ "$INSTALL_PIP_MODULES" != "" ] ; then + pip install $INSTALL_PIP_MODULES +fi + +if [ "$JPY_API_TOKEN" != "" ] ; then + echo "Starting under Jupyterhub" + jupyter kernelspec install-self + source activate jupyterhub_py3 + jupyter kernelspec install-self + source activate rep_py2 # default env + + NOTEBOOK_DIR=/notebooks + git clone $JPY_GITHUBURL $NOTEBOOK_DIR + $HOME/miniconda/envs/jupyterhub_py3/bin/jupyterhub-singleuser \ + --port=8888 \ + --ip=0.0.0.0 \ + --user=$JPY_USER \ + --cookie-name=$JPY_COOKIE_NAME \ + --base-url=$JPY_BASE_URL \ + --hub-prefix=$JPY_HUB_PREFIX \ + --hub-api-url=$JPY_HUB_API_URL \ + --notebook-dir=$NOTEBOOK_DIR + exit $? +fi + +if [ "$GENERATE_SSL_HOSTNAME" != "" ] ; then + echo "Setting up SSL support for the Jupyter profile" + SSL_CERTFILE="/root/mycert.pem" + SSL_KEYFILE="" + echo -e "\n\n\n\n${GENERATE_SSL_HOSTNAME}\n\n" | + openssl req -x509 -nodes -days 365 -newkey rsa:1024 -keyout $SSL_CERTFILE -out $SSL_CERTFILE +fi + +if [ "$SSL_CERTFILE" != "" ] ; then + JUPYTER_OPTIONS+=" --certfile=$SSL_CERTFILE" +fi + +if [ "$SSL_KEYFILE" != "" ] ; then + JUPYTER_OPTIONS+=" --keyfile=$SSL_KEYFILE" +fi + +JUPYTER_CONFIG=$HOME/.jupyter/jupyter_notebook_config.py + +if [ "$PASSWORD" != "" ] ; then + sha=`python -c "from notebook.auth import passwd; print passwd('$PASSWORD')"` + echo "c.NotebookApp.password = u'$sha'" >> $JUPYTER_CONFIG +fi + +if [ "$SECRET" != "" ] ; then + echo "c.NotebookNotary.secret = b'$SECRET'" >> $JUPYTER_CONFIG +fi + +if [ "$SECRET_FILE" != "" ] ; then + echo "c.NotebookNotary.secret_file = '$SECRET_FILE'" >> $JUPYTER_CONFIG +fi + +if [ "$JUPYTER_PORT" != "" ] ; then + JUPYTER_OPTIONS+=" --port $JUPYTER_PORT" +fi + +if [ "$IPPHUB_IP" != "" ] ; then + ipython profile create --parallel + ipcontroller --ip=* --port 32000 --location=$IPPHUB_IP --HubFactory.iopub=32001,32002 
--HubFactory.hb=32003,32004 --HubFactory.mux=32005,32006 --HubFactory.notifier_port=32007 --HubFactory.task=32008,32009 --HubFactory.control=32010,32011 & +fi + +[[ -d /REP_howto && ! -L /notebooks/rep_howto ]] && ln -s /REP_howto /notebooks/rep_howto + +cat .rep_version +source .rep_version +echo "Starting Jupyter" +jupyter notebook $JUPYTER_OPTIONS /notebooks 2>&1 | tee -a /notebooks/jupyter.log diff --git a/cluster/tests/test.py b/cluster/tests/test.py new file mode 100644 index 00000000..524a48dd --- /dev/null +++ b/cluster/tests/test.py @@ -0,0 +1 @@ +print "Hello python" diff --git a/cluster/tests/test_ipp_full.py b/cluster/tests/test_ipp_full.py new file mode 100644 index 00000000..ee55b8be --- /dev/null +++ b/cluster/tests/test_ipp_full.py @@ -0,0 +1,45 @@ +# import ipyparallel as ipp + +# c = ipp.Client(profile="default") + +import numpy, pandas +import os +from rep.utils import train_test_split +from sklearn.metrics import roc_auc_score +import subprocess +columns = ['fLength', 'fWidth', 'fSize', 'fConc', 'fConc1', 'fAsym', 'fM3Long', 'fM3Trans', 'fAlpha', 'fDist', 'g'] +if not os.path.exists("toy_datasets/magic04.data"): + os.makedirs("toy_datasets") + p = subprocess.Popen("wget -O toy_datasets/magic04.data -nc --no-check-certificate https://archive.ics.uci.edu/ml/machine-learning-databases/magic/magic04.data", shell=True) + p.wait() + +print "Downloaded magic04.data" +data = pandas.read_csv('toy_datasets/magic04.data', names=columns) +labels = numpy.array(data['g'] == 'g', dtype=int) +data = data.drop('g', axis=1) +import numpy +import numexpr +import pandas +from rep import utils +from sklearn.ensemble import GradientBoostingClassifier +from rep.report.metrics import RocAuc +from rep.metaml import GridOptimalSearchCV, FoldingScorer, RandomParameterOptimizer +from rep.estimators import SklearnClassifier, TMVAClassifier, XGBoostRegressor +# define grid parameters +grid_param = {} +grid_param['learning_rate'] = [0.2, 0.1, 0.05, 0.02, 0.01] 
+grid_param['max_depth'] = [2, 3, 4, 5] +# use random hyperparameter optimization algorithm +generator = RandomParameterOptimizer(grid_param) +# define folding scorer +scorer = FoldingScorer(RocAuc(), folds=3, fold_checks=3) +estimator = SklearnClassifier(GradientBoostingClassifier(n_estimators=30)) +#grid_finder = GridOptimalSearchCV(estimator, generator, scorer) +#% time grid_finder.fit(data, labels) +grid_finder = GridOptimalSearchCV(estimator, generator, scorer, parallel_profile="default") +print "start grid search" +grid_finder.fit(data, labels) + +grid_finder.params_generator.print_results() + +assert 10 == grid_finder.params_generator.n_evaluations, "oops" diff --git a/cluster/tests/test_ipp_simple.py b/cluster/tests/test_ipp_simple.py new file mode 100644 index 00000000..f64e8d17 --- /dev/null +++ b/cluster/tests/test_ipp_simple.py @@ -0,0 +1,5 @@ +import ipyparallel + +print "Running ", __file__ +c = ipyparallel.Client(profile="default") +print c.ids From dc0d5773ac7089f93a148aaa058422d379b9777b Mon Sep 17 00:00:00 2001 From: anaderi Date: Mon, 21 Mar 2016 20:17:17 -0400 Subject: [PATCH 2/7] README.md added --- cluster/README.md | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 cluster/README.md diff --git a/cluster/README.md b/cluster/README.md new file mode 100644 index 00000000..e06bbceb --- /dev/null +++ b/cluster/README.md @@ -0,0 +1,32 @@ + +# Running ipyparallel cluster with REP & docker swarm + + +## Setup + +prerequisites: +- Kernel 3.10+ +- docker 1.8+ on all machines +- list all your nodes in `cluster.txt` (just one per line, like `cluster.txt.orig`) + +## Start + +`make start-master` +`make start-slaves` will start slaves +to test +`make test-cluster` + +jupyter with REP will be acccessible by port 8888 of node hosting master instance. 
+ + +## Stop + + make stop-cluster + +## Maintenance + +to add more slaves: + +```make -e N=5 start-slaves``` -- start 5 more slaves + +## Troubleshooting From d7f4b8edc636cb5208710703451e66f3d1e6afff Mon Sep 17 00:00:00 2001 From: anaderi Date: Tue, 22 Mar 2016 20:09:07 -0400 Subject: [PATCH 3/7] some cleanup --- cluster/Dockerfile | 4 ++-- cluster/Makefile | 32 ++++++++++++++++++-------------- cluster/README.md | 7 +++++-- 3 files changed, 25 insertions(+), 18 deletions(-) diff --git a/cluster/Dockerfile b/cluster/Dockerfile index b5fc9f1e..cae2cc09 100644 --- a/cluster/Dockerfile +++ b/cluster/Dockerfile @@ -1,6 +1,6 @@ FROM yandex/rep:0.6.5 -ENV HOME /root RUN bash --login -c "pip install ipyparallel==4.1.0" -COPY start_jupyter.sh /root/start_jupyter.sh +ENV HOME /root +COPY start_jupyter.sh $HOME/start_jupyter.sh CMD ["/bin/bash", "--login", "-c", "$HOME/start_jupyter.sh"] diff --git a/cluster/Makefile b/cluster/Makefile index 19a9f62d..14a32914 100644 --- a/cluster/Makefile +++ b/cluster/Makefile @@ -89,6 +89,8 @@ pclean-id: ${CLUSTER} puptime: ## uptime cluster ${PSSH} -h ${CLUSTER} -i uptime +### SWARM + _check-swarm-stopped: if [[ `docker ps | grep swarm` != "" ]] ; then echo "swarm master is already running" ; exit 1; fi @@ -103,31 +105,34 @@ restart-swarm: stop-swarm start-swarm ## restart swarm master swarm-info: ## check swarm docker ${SWARM_OPT} info +#### CLUSTER + start-master-interactive: - docker run -p 32000-32100:32000-32100 --name ${MASTER} -ti --rm -v /home/ipp/security:/root/.ipython/profile_default/security ${LOCAL_IMAGE_BUILD} bash + docker run -p 32000-32100:32000-32100 --name ${MASTER} -ti --rm -v ${HERE}/security:/root/.ipython/profile_default/security ${LOCAL_IMAGE_BUILD} bash start-master: - docker run -p 32000-32100:32000-32100 -p 8888:8888 -e IPPHUB_IP=${HEADIP} --name ${MASTER} -d -v /home/ipp/security:/root/.ipython/profile_default/security ${LOCAL_IMAGE_BUILD} - echo "test" | nc localhost 8888 + docker run -p 
32000-32100:32000-32100 -p 8888:8888 -e IPPHUB_IP=${HEADIP} --name ${MASTER} -d -v ${HERE}/security:/root/.ipython/profile_default/security ${LOCAL_IMAGE_BUILD} + ( echo -e "GET /\n\n" | nc localhost 8888 ) || exit 1 start-slaves: for i in `seq $N` ; do \ echo "start slave $$i" ; \ - docker ${SWARM_OPT} run -d -v /home/ipp/security:/root/.ipython/profile_default/security ${LOCAL_IMAGE_BUILD} bash --login -c ipengine || exit 1 ; \ + docker ${SWARM_OPT} run -d -v ${HERE}/security:/root/.ipython/profile_default/security ${LOCAL_IMAGE_BUILD} bash --login -c ipengine || exit 1 ; \ done - echo "Running `docker ${SWARM_OPT} ps -q |wc -l` clusters" + echo "Running `docker ${SWARM_OPT} ps -q |wc -l` slave(s)" is-master-running: docker ps -a | grep "${MASTER}$$" > /dev/null + ( echo -e "GET /\n\n" | nc localhost 8888 ) || exit 1 -test-master: is-master-running ## test parallel script - docker cp test ${MASTER}:/root +test-cluster: is-master-running list-slaves ## test parallel script + docker cp tests ${MASTER}:/root docker exec ${MASTER} bash --login -c 'for p in tests/* ; do echo $$p ; python $$p || exit 1 ; done' exec-master: is-master-running docker exec -ti ${MASTER} bash -start-cluster: start-master start-slaves test-master +start-cluster: start-master start-slaves list-slaves test-cluster list-slaves: docker ${SWARM_OPT} ps --no-trunc @@ -140,13 +145,13 @@ stop-master: is-master-running docker rm ${MASTER} stop-cluster: stop-slaves stop-master - echo OK + echo Cluster stopped #### REGISTRY start-registry: - mkdir -p /home/ipp/registry - docker run -d -p 5000:5000 --restart=always --name registry -v /home/ipp/registry:/var/lib/registry registry:2 + test -d ${HERE}/registry || mkdir -p ${HERE}/registry + docker run -d -p 5000:5000 --restart=always --name registry -v ${HERE}/registry:/var/lib/registry registry:2 echo http://${HEADIP}:5000 stop-registry: @@ -158,8 +163,7 @@ list-registry: curl http://10.16.23.10:5000/v2/_catalog build-image: - #docker build -t 
${IMAGE_BUILD} ipp-image - # docker ${SWARM_OPT} build -t ${IMAGE_BUILD} ipp-image + docker build -t ${IMAGE_BUILD} . docker tag ${IMAGE_BUILD} ${LOCAL_IMAGE_BUILD} push-image: @@ -181,7 +185,7 @@ rm-images: ${CLUSTER} ## remove all images ${PSSH} -h ${CLUSTER} -i '${DOCKER_NODE} images -q | xargs ${DOCKER_NODE} rmi' rm-user-containers: ${CLUSTER} ## stop & remove user containers - ${PSSH} -h ${CLUSTER} --timeout=0 -i '${DOCKER_NODE} ps -q|xargs ${DOCKER_NODE} -f rm' + ${PSSH} -h ${CLUSTER} --timeout=0 -i '${DOCKER_NODE} ps -a -q|xargs --no-run-if-empty ${DOCKER_NODE} rm' swarm-ps: ## list containers running in swarm docker ${SWARM_OPT} ps diff --git a/cluster/README.md b/cluster/README.md index e06bbceb..f5f65bcb 100644 --- a/cluster/README.md +++ b/cluster/README.md @@ -25,8 +25,11 @@ jupyter with REP will be acccessible by port 8888 of node hosting master instanc ## Maintenance -to add more slaves: -```make -e N=5 start-slaves``` -- start 5 more slaves +to check cluster status `make test-cluster` + +to add 5 more slaves: ```make -e N=5 start-slaves``` + + ## Troubleshooting From b18816df7d8528f65b7f58096788fb62fd72919c Mon Sep 17 00:00:00 2001 From: anaderi Date: Tue, 22 Mar 2016 20:43:19 -0400 Subject: [PATCH 4/7] fix cluster start --- cluster/Makefile | 10 ++++++---- cluster/tests/test_ipp_full.py | 5 ++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/cluster/Makefile b/cluster/Makefile index 14a32914..2d65c36b 100644 --- a/cluster/Makefile +++ b/cluster/Makefile @@ -6,13 +6,13 @@ # HEADIP -- IP of head of the cluster # -N ?= 1 ## number of nodes in HEADIP ?= 10.16.23.10 DOCKER_NODE=docker -H tcp://0.0.0.0:2375 # NODE_OPTS=-H tcp://0.0.0.0:2375 PSSH=parallel-ssh CLUSTER=cluster.txt CLUSTER_SWARM=_cluster_swarm.txt +N ?= $(shell cat ${CLUSTER} | wc -l) ## number of nodes in the cluster SWARM_OPT=-H tcp://localhost:31338 DOCKER_PACKAGE=docker-engine=1.10.3-0~wheezy SWARM_IMAGE=swarm:1.1.3 @@ -21,6 +21,7 @@ IMAGE_BUILD=yandex/rep-ipp:0.6.5 
HERE:=$(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) SHELL=/bin/bash MASTER=ipp_master +SHARED_HOME=/home/ipp LOCAL_IMAGE_BUILD=${HEADIP}:5000/${IMAGE_BUILD} help: @@ -108,16 +109,17 @@ swarm-info: ## check swarm #### CLUSTER start-master-interactive: - docker run -p 32000-32100:32000-32100 --name ${MASTER} -ti --rm -v ${HERE}/security:/root/.ipython/profile_default/security ${LOCAL_IMAGE_BUILD} bash + docker run -p 32000-32100:32000-32100 --name ${MASTER} -ti --rm -v ${SHARED_HOME}/security:/root/.ipython/profile_default/security ${LOCAL_IMAGE_BUILD} bash start-master: - docker run -p 32000-32100:32000-32100 -p 8888:8888 -e IPPHUB_IP=${HEADIP} --name ${MASTER} -d -v ${HERE}/security:/root/.ipython/profile_default/security ${LOCAL_IMAGE_BUILD} + docker run -p 32000-32100:32000-32100 -p 8888:8888 -e IPPHUB_IP=${HEADIP} --name ${MASTER} -d -v ${SHARED_HOME}/security:/root/.ipython/profile_default/security ${LOCAL_IMAGE_BUILD} ( echo -e "GET /\n\n" | nc localhost 8888 ) || exit 1 start-slaves: + sleep 2 for i in `seq $N` ; do \ echo "start slave $$i" ; \ - docker ${SWARM_OPT} run -d -v ${HERE}/security:/root/.ipython/profile_default/security ${LOCAL_IMAGE_BUILD} bash --login -c ipengine || exit 1 ; \ + docker ${SWARM_OPT} run -d -v ${SHARED_HOME}/security:/root/.ipython/profile_default/security ${LOCAL_IMAGE_BUILD} bash --login -c ipengine || exit 1 ; \ done echo "Running `docker ${SWARM_OPT} ps -q |wc -l` slave(s)" diff --git a/cluster/tests/test_ipp_full.py b/cluster/tests/test_ipp_full.py index ee55b8be..4be9349a 100644 --- a/cluster/tests/test_ipp_full.py +++ b/cluster/tests/test_ipp_full.py @@ -1,6 +1,5 @@ -# import ipyparallel as ipp - -# c = ipp.Client(profile="default") +import ipyparallel as ipp +c = ipp.Client(profile="default") import numpy, pandas import os From c5b1f50cee19ae82e19aedcc415a829e7559dd44 Mon Sep 17 00:00:00 2001 From: anaderi Date: Tue, 22 Mar 2016 20:54:37 -0400 Subject: [PATCH 5/7] fix linebr cluster README --- cluster/README.md 
| 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/cluster/README.md b/cluster/README.md index f5f65bcb..57b1c535 100644 --- a/cluster/README.md +++ b/cluster/README.md @@ -11,12 +11,11 @@ prerequisites: ## Start -`make start-master` -`make start-slaves` will start slaves -to test -`make test-cluster` +`make start-master` -- start master -jupyter with REP will be acccessible by port 8888 of node hosting master instance. +`make start-slaves` -- start slaves (by number of lines in `cluster.txt`) + +jupyter with REP will be accessible by port 8888 of master instance. ## Stop @@ -28,7 +27,7 @@ jupyter with REP will be acccessible by port 8888 of node hosting master instanc ## Maintenance to check cluster status `make test-cluster` -to add 5 more slaves: ```make -e N=5 start-slaves``` +to add, say, 5 more slaves: ```make -e N=5 start-slaves``` From f4875c817e8d1775578ba5f1cfb1f76288104d69 Mon Sep 17 00:00:00 2001 From: anaderi Date: Tue, 12 Apr 2016 04:27:56 -0400 Subject: [PATCH 6/7] remove extra # --- cluster/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cluster/Makefile b/cluster/Makefile index 2d65c36b..ba07f2f0 100644 --- a/cluster/Makefile +++ b/cluster/Makefile @@ -12,7 +12,7 @@ DOCKER_NODE=docker -H tcp://0.0.0.0:2375 PSSH=parallel-ssh CLUSTER=cluster.txt CLUSTER_SWARM=_cluster_swarm.txt -N ?= $(shell cat ${CLUSTER} | wc -l) ## number of nodes in the cluster +N ?= $(shell cat ${CLUSTER} | wc -l) # number of nodes in the cluster SWARM_OPT=-H tcp://localhost:31338 DOCKER_PACKAGE=docker-engine=1.10.3-0~wheezy SWARM_IMAGE=swarm:1.1.3 From 54f2244ea70f74d88af126ae1a4f8ab9194fe6b4 Mon Sep 17 00:00:00 2001 From: anaderi Date: Mon, 2 May 2016 17:22:05 -0400 Subject: [PATCH 7/7] describe key targets, number kernels per node --- cluster/Makefile | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/cluster/Makefile b/cluster/Makefile index ba07f2f0..17b52b99 100644 --- 
a/cluster/Makefile +++ b/cluster/Makefile @@ -12,7 +12,9 @@ DOCKER_NODE=docker -H tcp://0.0.0.0:2375 PSSH=parallel-ssh CLUSTER=cluster.txt CLUSTER_SWARM=_cluster_swarm.txt -N ?= $(shell cat ${CLUSTER} | wc -l) # number of nodes in the cluster +KERNELS_PER_NODE = 2 +N_NODES ?= $(shell cat ${CLUSTER} | wc -l) # number of nodes in the cluster +N_KERNELS = $(shell echo $$(( ${N_NODES} * ${KERNELS_PER_NODE} ))) SWARM_OPT=-H tcp://localhost:31338 DOCKER_PACKAGE=docker-engine=1.10.3-0~wheezy SWARM_IMAGE=swarm:1.1.3 @@ -23,6 +25,11 @@ SHELL=/bin/bash MASTER=ipp_master SHARED_HOME=/home/ipp LOCAL_IMAGE_BUILD=${HEADIP}:5000/${IMAGE_BUILD} +SHARED_VOLUME=/soft +PASSWORD=kraCluster +HOSTNAME=kra.com +RUN_OPTIONS=-v ${SHARED_VOLUME}:/notebooks/${SHARED_VOLUME} -v ${SHARED_HOME}/security:/root/.ipython/profile_default/security -e PASSWORD=${PASSWORD} -e GENERATE_SSL_HOSTNAME=${HOSTNAME} + help: @echo Usage: make [-e VARIABLE=VALUE] targets @@ -109,17 +116,17 @@ swarm-info: ## check swarm #### CLUSTER start-master-interactive: - docker run -p 32000-32100:32000-32100 --name ${MASTER} -ti --rm -v ${SHARED_HOME}/security:/root/.ipython/profile_default/security ${LOCAL_IMAGE_BUILD} bash + docker run -p 32000-32100:32000-32100 --name ${MASTER} -ti --rm ${RUN_OPTIONS} ${LOCAL_IMAGE_BUILD} bash -start-master: - docker run -p 32000-32100:32000-32100 -p 8888:8888 -e IPPHUB_IP=${HEADIP} --name ${MASTER} -d -v ${SHARED_HOME}/security:/root/.ipython/profile_default/security ${LOCAL_IMAGE_BUILD} +start-master: ## start master + docker run -p 32000-32100:32000-32100 -p 8888:8888 -e IPPHUB_IP=${HEADIP} --name ${MASTER} -d ${RUN_OPTIONS} ${LOCAL_IMAGE_BUILD} ( echo -e "GET /\n\n" | nc localhost 8888 ) || exit 1 -start-slaves: +start-slaves: ## start slaves sleep 2 - for i in `seq $N` ; do \ + for i in `seq ${N_KERNELS}` ; do \ echo "start slave $$i" ; \ - docker ${SWARM_OPT} run -d -v ${SHARED_HOME}/security:/root/.ipython/profile_default/security ${LOCAL_IMAGE_BUILD} bash --login -c ipengine || 
exit 1 ; \ + docker ${SWARM_OPT} run -d ${RUN_OPTIONS} ${LOCAL_IMAGE_BUILD} bash --login -c ipengine || exit 1 ; \ done echo "Running `docker ${SWARM_OPT} ps -q |wc -l` slave(s)" @@ -134,9 +141,11 @@ test-cluster: is-master-running list-slaves ## test parallel script exec-master: is-master-running docker exec -ti ${MASTER} bash -start-cluster: start-master start-slaves list-slaves test-cluster +start-cluster: start-master start-slaves list-slaves test-cluster ## start cluster + +restart-cluster: stop-cluster start-cluster ## restart cluster -list-slaves: +list-slaves: ## list slaves docker ${SWARM_OPT} ps --no-trunc stop-slaves: @@ -146,6 +155,9 @@ stop-master: is-master-running docker stop ${MASTER} docker rm ${MASTER} +restart-slaves: stop-slaves start-slaves ## restart slaves + echo Slaves restarted + stop-cluster: stop-slaves stop-master echo Cluster stopped @@ -178,7 +190,7 @@ pull-image-swarm: ##### Obsolete? ps-user-containers: ${CLUSTER} ## list container running on the cluster - ${PSSH} -h ${CLUSTER} -i '${DOCKER_NODE} ps -a' + ${PSSH} -h ${CLUSTER} -i '${DOCKER_NODE} ps ' images: ${CLUSTER} ## list images created at clusters ${PSSH} -h ${CLUSTER} -i '${DOCKER_NODE} images'