diff --git a/cluster/.gitignore b/cluster/.gitignore
new file mode 100644
index 00000000..b41a23b8
--- /dev/null
+++ b/cluster/.gitignore
@@ -0,0 +1,2 @@
+cluster.txt
+_*
diff --git a/cluster/Dockerfile b/cluster/Dockerfile
new file mode 100644
index 00000000..cae2cc09
--- /dev/null
+++ b/cluster/Dockerfile
@@ -0,0 +1,6 @@
+FROM yandex/rep:0.6.5
+RUN bash --login -c "pip install ipyparallel==4.1.0"
+ENV HOME /root
+COPY start_jupyter.sh $HOME/start_jupyter.sh
+
+CMD ["/bin/bash", "--login", "-c", "$HOME/start_jupyter.sh"]
diff --git a/cluster/Makefile b/cluster/Makefile
new file mode 100644
index 00000000..17b52b99
--- /dev/null
+++ b/cluster/Makefile
@@ -0,0 +1,217 @@
+# Makefile for building & starting REP containers
+# variables can be overridden with -e definitions:
+#
+# IMAGE -- name of image to use
+# N_KERNELS -- number of slave kernels (ipengine containers) to start
+# HEADIP -- IP of the cluster head node
+#
+
+HEADIP ?= 10.16.23.10
+DOCKER_NODE=docker -H tcp://0.0.0.0:2375
+# NODE_OPTS=-H tcp://0.0.0.0:2375
+PSSH=parallel-ssh
+CLUSTER=cluster.txt
+CLUSTER_SWARM=_cluster_swarm.txt
+KERNELS_PER_NODE = 2
+N_NODES ?= $(shell cat ${CLUSTER} | wc -l)  # number of nodes in the cluster
+N_KERNELS = $(shell echo $$(( ${N_NODES} * ${KERNELS_PER_NODE} )))
+SWARM_OPT=-H tcp://localhost:31338
+DOCKER_PACKAGE=docker-engine=1.10.3-0~wheezy
+SWARM_IMAGE=swarm:1.1.3
+IMAGE ?= yandex/rep-ipp:0.6.5
+IMAGE_BUILD=yandex/rep-ipp:0.6.5
+HERE:=$(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
+SHELL=/bin/bash
+MASTER=ipp_master
+SHARED_HOME=/home/ipp
+LOCAL_IMAGE_BUILD=${HEADIP}:5000/${IMAGE_BUILD}
+SHARED_VOLUME=/soft
+PASSWORD=kraCluster
+HOSTNAME=kra.com
+RUN_OPTIONS=-v ${SHARED_VOLUME}:/notebooks/${SHARED_VOLUME} -v ${SHARED_HOME}/security:/root/.ipython/profile_default/security -e PASSWORD=${PASSWORD} -e GENERATE_SSL_HOSTNAME=${HOSTNAME}
+
+
+help:
+	@echo Usage: make [-e VARIABLE=VALUE] targets
+	@echo "variables:"
+	@grep -h "#\s\+\w\+ -- " $(MAKEFILE_LIST) |sed "s/#\s//"
+	@echo
+	@echo targets and corresponding dependencies:
+	@fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' -e 's/^/ /' | sed -e 's/##//'
+
+${CLUSTER_SWARM}: ${CLUSTER}
+	for i in `cat ${CLUSTER}`; do \
+	grep $$i /etc/hosts| awk '{print $$1":2375"}' ; done > ${CLUSTER_SWARM}
+	# cat ${CLUSTER} | sed 's/$$/:2375/' > ${CLUSTER_SWARM}
+
+lupdate-debian7-kernel: ## update the kernel locally, see http://unix.stackexchange.com/questions/115898/how-do-i-upgrade-the-debian-wheezy-kernel-offline
+	echo "deb http://ftp.pl.debian.org/debian wheezy-backports main" | tee -a /etc/apt/sources.list
+	apt-get update
+	aptitude -t wheezy-backports install linux-image-amd64
+	reboot
+
+pupdate-debian7-kernel: ${CLUSTER}
+	${PSSH} -h ${CLUSTER} --timeout=0 -i " \
+	echo 'deb http://ftp.pl.debian.org/debian wheezy-backports main' | tee -a /etc/apt/sources.list ; \
+	apt-get update ; \
+	aptitude -y -q -t wheezy-backports install linux-image-amd64"
+
+puname: ${CLUSTER}
+	${PSSH} -h ${CLUSTER} --timeout=0 -i "uname -a"
+
+preboot: ${CLUSTER}
+	${PSSH} -h ${CLUSTER} --timeout=0 -i "reboot"
+
+pdocker-repo-debian7: ${CLUSTER} ## install docker on Debian 7 (https://docs.docker.com/engine/installation/linux/debian/#debian-wheezy-stable-7-x-64-bit)
+	${PSSH} -h ${CLUSTER} --timeout=0 " \
+	apt-get purge docker.io ; \
+	apt-get -y -q install apt-transport-https ca-certificates ; \
+	apt-key adv --keyserver hkp://p80.pool.sks-keyservers.net:80 --recv-keys 58118E89F3A912897C070ADBF76221572C52609D ; \
+	echo 'deb https://apt.dockerproject.org/repo debian-wheezy main' | sudo tee /etc/apt/sources.list.d/docker.list ; \
+	apt-get update ; \
+	apt-cache policy docker-engine && \
+	apt-get install -y ${DOCKER_PACKAGE} "
+
+pdocker-options: ${CLUSTER}
+	${PSSH} -h ${CLUSTER} --timeout=0 " \
+	echo 'DOCKER_OPTS=\"-H tcp://0.0.0.0:2375 -H unix:///var/run/docker.sock --insecure-registry ${HEADIP}:5000\"' | tee -a /etc/default/docker ; \
+	service docker restart "
+
+
+pdocker-repo-ubuntu:
+	${PSSH} -h ${CLUSTER} -i 'echo "deb https://apt.dockerproject.org/repo ubuntu-trusty main" | sudo tee /etc/apt/sources.list.d/docker.list; cat /etc/apt/sources.list.d/docker.list'
+	${PSSH} -h ${CLUSTER} 'sudo apt-get update'
+
+pdocker-install: ${CLUSTER}
+	${PSSH} -h ${CLUSTER} -i 'sudo apt-get install -y --force-yes docker-engine=1.9.1-0~trusty'
+
+docker-versions: ${CLUSTER}
+	${PSSH} -h ${CLUSTER} -i "${DOCKER_NODE} version"
+
+pping: ## ping cluster
+	${PSSH} -h ${CLUSTER} -i pwd
+
+pclean-id: ${CLUSTER}
+	${PSSH} -h ${CLUSTER} "rm /etc/docker/key.json && service docker restart"
+
+puptime: ## uptime cluster
+	${PSSH} -h ${CLUSTER} -i uptime
+
+### SWARM
+
+_check-swarm-stopped:
+	if [[ `docker ps | grep swarm` != "" ]] ; then echo "swarm master is already running" ; exit 1; fi
+
+start-swarm: _check-swarm-stopped ${CLUSTER_SWARM} ## start swarm master
+	docker run -v ${HERE}:/cfg -d -p 0.0.0.0:31338:2375 --restart=always --name=swarm_master ${SWARM_IMAGE} manage -H 0.0.0.0:2375 --strategy spread file:///cfg/${CLUSTER_SWARM}
+
+stop-swarm: ## stop swarm master
+	docker rm -f swarm_master
+
+restart-swarm: stop-swarm start-swarm ## restart swarm master
+
+swarm-info: ## check swarm
+	docker ${SWARM_OPT} info
+
+#### CLUSTER
+
+start-master-interactive:
+	docker run -p 32000-32100:32000-32100 --name ${MASTER} -ti --rm ${RUN_OPTIONS} ${LOCAL_IMAGE_BUILD} bash
+
+start-master: ## start master
+	docker run -p 32000-32100:32000-32100 -p 8888:8888 -e IPPHUB_IP=${HEADIP} --name ${MASTER} -d ${RUN_OPTIONS} ${LOCAL_IMAGE_BUILD}
+	( echo -e "GET /\n\n" | nc localhost 8888 ) || exit 1
+
+start-slaves: ## start slaves
+	sleep 2
+	for i in `seq ${N_KERNELS}` ; do \
+	echo "start slave $$i" ; \
+	docker ${SWARM_OPT} run -d ${RUN_OPTIONS} ${LOCAL_IMAGE_BUILD} bash --login -c ipengine || exit 1 ; \
+	done
+	echo "Running `docker ${SWARM_OPT} ps -q |wc -l` slave(s)"
+
+is-master-running:
+	docker ps -a | grep "${MASTER}$$" > /dev/null
+	( echo -e "GET /\n\n" | nc localhost 8888 ) || exit 1
+
+test-cluster: is-master-running list-slaves ## test parallel script
+	docker cp tests ${MASTER}:/root
+	docker exec ${MASTER} bash --login -c 'for p in tests/* ; do echo $$p ; python $$p || exit 1 ; done'
+
+exec-master: is-master-running
+	docker exec -ti ${MASTER} bash
+
+start-cluster: start-master start-slaves list-slaves test-cluster ## start cluster
+
+restart-cluster: stop-cluster start-cluster ## restart cluster
+
+list-slaves: ## list slaves
+	docker ${SWARM_OPT} ps --no-trunc
+
+stop-slaves:
+	docker ${SWARM_OPT} ps -q | xargs --no-run-if-empty docker ${SWARM_OPT} rm -f
+
+stop-master: is-master-running
+	docker stop ${MASTER}
+	docker rm ${MASTER}
+
+restart-slaves: stop-slaves start-slaves ## restart slaves
+	echo Slaves restarted
+
+stop-cluster: stop-slaves stop-master
+	echo Cluster stopped
+
+#### REGISTRY
+
+start-registry:
+	test -d ${HERE}/registry || mkdir -p ${HERE}/registry
+	docker run -d -p 5000:5000 --restart=always --name registry -v ${HERE}/registry:/var/lib/registry registry:2
+	echo http://${HEADIP}:5000
+
+stop-registry:
+	docker stop registry
+	docker rm -f registry
+
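+# Image distribution (descriptive note, not a new target): build-image builds
+# the REP image and tags it for the local registry, push-image uploads it to
+# ${HEADIP}:5000, and pull-image-swarm fetches it on all swarm nodes, e.g.:
+#   make build-image push-image pull-image-swarm
+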
+list-registry:
+	# https://docs.docker.com/registry/spec/api/
+	curl http://${HEADIP}:5000/v2/_catalog
+
+build-image:
+	docker build -t ${IMAGE_BUILD} .
+	docker tag ${IMAGE_BUILD} ${LOCAL_IMAGE_BUILD}
+
+push-image:
+	docker push ${LOCAL_IMAGE_BUILD}
+
+pull-image-swarm:
+	docker ${SWARM_OPT} pull ${LOCAL_IMAGE_BUILD}
+
+
+##### Obsolete?
+
+ps-user-containers: ${CLUSTER} ## list containers running on the cluster
+	${PSSH} -h ${CLUSTER} -i '${DOCKER_NODE} ps '
+
+images: ${CLUSTER} ## list images created on cluster nodes
+	${PSSH} -h ${CLUSTER} -i '${DOCKER_NODE} images'
+
+rm-images: ${CLUSTER} ## remove all images
+	${PSSH} -h ${CLUSTER} -i '${DOCKER_NODE} images -q | xargs ${DOCKER_NODE} rmi'
+
+rm-user-containers: ${CLUSTER} ## stop & remove user containers
+	${PSSH} -h ${CLUSTER} --timeout=0 -i '${DOCKER_NODE} ps -a -q|xargs --no-run-if-empty ${DOCKER_NODE} rm'
+
+swarm-ps: ## list containers running in swarm
+	docker ${SWARM_OPT} ps
+
+swarm-psa: ## list all containers in swarm
+	docker ${SWARM_OPT} ps -a
+
+start-monitor: ## start monitoring (bosun)
+	docker run -d -p 0.0.0.0:4242:4242 -p 0.0.0.0:8070:8070 stackexchange/bosun
+
+pdf: ${CLUSTER} ## check free disk space on cluster nodes
+	${PSSH} -h ${CLUSTER} -i df -h /
+
+mdu: ## disk usage of the mfs mount
+	du -m --max-depth 1 /opt/mfs
diff --git a/cluster/README.md b/cluster/README.md
new file mode 100644
index 00000000..57b1c535
--- /dev/null
+++ b/cluster/README.md
@@ -0,0 +1,34 @@
+
+# Running an ipyparallel cluster with REP & Docker Swarm
+
+
+## Setup
+
+Prerequisites:
+- Linux kernel 3.10+
+- Docker 1.8+ on all machines
+- list all your nodes in `cluster.txt` (one per line, as in `cluster.txt.orig`)
+
+## Start
+
+`make start-master` -- start the master
+
+`make start-slaves` -- start the slaves (two kernels per node listed in `cluster.txt`)
+
+Jupyter with REP will be accessible on port 8888 of the master instance.
+
+
+## Stop
+
+    make stop-cluster
+
+## Maintenance
+
+
+To check the cluster status: `make test-cluster`
+
+To add, say, 5 more slaves: ```make -e N_KERNELS=5 start-slaves```
+
+
+
+## Troubleshooting
+
+- `make swarm-info` -- check that the swarm master sees all the nodes
+- `make list-slaves` -- list the running ipengine containers
+- the Jupyter log is appended to `/notebooks/jupyter.log` inside the master container
diff --git a/cluster/cluster.txt.orig b/cluster/cluster.txt.orig
new file mode 100644
index 00000000..02e161d4
--- /dev/null
+++ b/cluster/cluster.txt.orig
@@ -0,0 +1,9 @@
+farm-wn1
+farm-wn2
+farm-wn3
+farm-wn4
+farm-wn5
+farm-wn6
+farm-wn7
+farm-wn8
+farm-wn9
\ No newline at end of file
diff --git a/cluster/start_jupyter.sh b/cluster/start_jupyter.sh
new file mode 100755
index 00000000..c8ee70eb
--- /dev/null
+++ b/cluster/start_jupyter.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+
+# script to start jupyter inside a docker container;
+# fine-tuned by a set of environment variables (see the code),
+# e.g. it runs under a jupyterhub environment when JPY_API_TOKEN is set
+
+set +xv
+
+[ -z "$ENV_BIN_DIR" ] && source /etc/profile.d/rep_profile.sh
+
+if [ "$INSTALL_PIP_MODULES" != "" ] ; then
+    pip install $INSTALL_PIP_MODULES
+fi
+
+if [ "$JPY_API_TOKEN" != "" ] ; then
+    echo "Starting under Jupyterhub"
+    # register the kernels of both conda environments
+    jupyter kernelspec install-self
+    source activate jupyterhub_py3
+    jupyter kernelspec install-self
+    source activate rep_py2  # default env
+
+    NOTEBOOK_DIR=/notebooks
+    git clone $JPY_GITHUBURL $NOTEBOOK_DIR
+    $HOME/miniconda/envs/jupyterhub_py3/bin/jupyterhub-singleuser \
+        --port=8888 \
+        --ip=0.0.0.0 \
+        --user=$JPY_USER \
+        --cookie-name=$JPY_COOKIE_NAME \
+        --base-url=$JPY_BASE_URL \
+        --hub-prefix=$JPY_HUB_PREFIX \
+        --hub-api-url=$JPY_HUB_API_URL \
+        --notebook-dir=$NOTEBOOK_DIR
+    exit $?
+fi
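+
+# Self-signed SSL (descriptive note): the echo below answers openssl's
+# certificate-subject prompts non-interactively, supplying
+# GENERATE_SSL_HOSTNAME as the host field; -keyout and -out point at the
+# same file, so the private key and the certificate land in a single PEM,
+# which is why SSL_KEYFILE stays empty and only --certfile is passed on.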
+
+if [ "$GENERATE_SSL_HOSTNAME" != "" ] ; then
+    echo "Setting up SSL support for the Jupyter profile"
+    SSL_CERTFILE="/root/mycert.pem"
+    SSL_KEYFILE=""
+    echo -e "\n\n\n\n${GENERATE_SSL_HOSTNAME}\n\n" |
+    openssl req -x509 -nodes -days 365 -newkey rsa:1024 -keyout $SSL_CERTFILE -out $SSL_CERTFILE
+fi
+
+if [ "$SSL_CERTFILE" != "" ] ; then
+    JUPYTER_OPTIONS+=" --certfile=$SSL_CERTFILE"
+fi
+
+if [ "$SSL_KEYFILE" != "" ] ; then
+    JUPYTER_OPTIONS+=" --keyfile=$SSL_KEYFILE"
+fi
+
+JUPYTER_CONFIG=$HOME/.jupyter/jupyter_notebook_config.py
+
+if [ "$PASSWORD" != "" ] ; then
+    sha=`python -c "from notebook.auth import passwd; print passwd('$PASSWORD')"`
+    echo "c.NotebookApp.password = u'$sha'" >> $JUPYTER_CONFIG
+fi
+
+if [ "$SECRET" != "" ] ; then
+    echo "c.NotebookNotary.secret = b'$SECRET'" >> $JUPYTER_CONFIG
+fi
+
+if [ "$SECRET_FILE" != "" ] ; then
+    echo "c.NotebookNotary.secret_file = '$SECRET_FILE'" >> $JUPYTER_CONFIG
+fi
+
+if [ "$JUPYTER_PORT" != "" ] ; then
+    JUPYTER_OPTIONS+=" --port $JUPYTER_PORT"
+fi
+
+if [ "$IPPHUB_IP" != "" ] ; then
+    # fixed controller ports so they can be published by docker (-p 32000-32100)
+    ipython profile create --parallel
+    ipcontroller --ip='*' --port 32000 --location=$IPPHUB_IP --HubFactory.iopub=32001,32002 --HubFactory.hb=32003,32004 --HubFactory.mux=32005,32006 --HubFactory.notifier_port=32007 --HubFactory.task=32008,32009 --HubFactory.control=32010,32011 &
+fi
+
+[[ -d /REP_howto && ! -L /notebooks/rep_howto ]] && ln -s /REP_howto /notebooks/rep_howto
+
+cat .rep_version
+source .rep_version
+echo "Starting Jupyter"
+jupyter notebook $JUPYTER_OPTIONS /notebooks 2>&1 | tee -a /notebooks/jupyter.log
diff --git a/cluster/tests/test.py b/cluster/tests/test.py
new file mode 100644
index 00000000..524a48dd
--- /dev/null
+++ b/cluster/tests/test.py
@@ -0,0 +1 @@
+print "Hello python"
diff --git a/cluster/tests/test_ipp_full.py b/cluster/tests/test_ipp_full.py
new file mode 100644
index 00000000..4be9349a
--- /dev/null
+++ b/cluster/tests/test_ipp_full.py
@@ -0,0 +1,44 @@
+import os
+import subprocess
+
+import numpy
+import pandas
+import ipyparallel as ipp
+
+from sklearn.ensemble import GradientBoostingClassifier
+from rep.report.metrics import RocAuc
+from rep.metaml import GridOptimalSearchCV, FoldingScorer, RandomParameterOptimizer
+from rep.estimators import SklearnClassifier
+
+# make sure the engines are reachable before running the search
+c = ipp.Client(profile="default")
+
+columns = ['fLength', 'fWidth', 'fSize', 'fConc', 'fConc1', 'fAsym', 'fM3Long', 'fM3Trans', 'fAlpha', 'fDist', 'g']
+if not os.path.exists("toy_datasets/magic04.data"):
+    os.makedirs("toy_datasets")
+    p = subprocess.Popen("wget -O toy_datasets/magic04.data -nc --no-check-certificate https://archive.ics.uci.edu/ml/machine-learning-databases/magic/magic04.data", shell=True)
+    p.wait()
+
+print "Downloaded magic04.data"
+data = pandas.read_csv('toy_datasets/magic04.data', names=columns)
+labels = numpy.array(data['g'] == 'g', dtype=int)
+data = data.drop('g', axis=1)
+
+# define grid parameters
+grid_param = {}
+grid_param['learning_rate'] = [0.2, 0.1, 0.05, 0.02, 0.01]
+grid_param['max_depth'] = [2, 3, 4, 5]
+# use random hyperparameter optimization algorithm
+generator = RandomParameterOptimizer(grid_param)
+# define folding scorer
+scorer = FoldingScorer(RocAuc(), folds=3, fold_checks=3)
+estimator = SklearnClassifier(GradientBoostingClassifier(n_estimators=30))
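+
+# parallel_profile="default" tells GridOptimalSearchCV to dispatch the
+# individual fits to the engines of the ipyparallel "default" profile
+# instead of evaluating the candidates locally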
+grid_finder = GridOptimalSearchCV(estimator, generator, scorer, parallel_profile="default")
+print "start grid search"
+grid_finder.fit(data, labels)
+
+grid_finder.params_generator.print_results()
+
+assert 10 == grid_finder.params_generator.n_evaluations, "expected the default 10 evaluations"
diff --git a/cluster/tests/test_ipp_simple.py b/cluster/tests/test_ipp_simple.py
new file mode 100644
index 00000000..f64e8d17
--- /dev/null
+++ b/cluster/tests/test_ipp_simple.py
@@ -0,0 +1,5 @@
+import ipyparallel
+
+print "Running ", __file__
+c = ipyparallel.Client(profile="default")
+print c.ids
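
For a quick interactive check beyond `make test-cluster`, a session along the lines of the tests above can be run from the notebook on the master — a sketch, assuming the image's "default" ipyparallel profile and at least one registered engine:

```python
import ipyparallel as ipp

c = ipp.Client(profile="default")
print c.ids  # one id per running ipengine container

# push a trivial computation through a load-balanced view
view = c.load_balanced_view()
print view.map_sync(lambda x: x ** 2, range(10))
```

If `c.ids` comes back empty, the engines have not registered with the controller; check that the 32000-32011 controller ports published by `start-master` are reachable from the nodes.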