diff --git a/Dockerfile b/Dockerfile index 024de9f2f..559ae1367 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,6 +4,7 @@ WORKDIR /app COPY . /app RUN apk add --no-cache --virtual build-dependencies python make git g++ && \ + apk add --no-cache curl jq && \ npm install && \ chown -R node:node . && \ npm cache clean --force && \ diff --git a/documentation/DataLoader.md b/documentation/DataLoader.md new file mode 100644 index 000000000..acee04e7a --- /dev/null +++ b/documentation/DataLoader.md @@ -0,0 +1,66 @@ +# CouchDB Data Loader + +(`scripts/deleteAndLoadSnapsets.sh`) + +This script is used to set up the CouchDB database and is executed as a Kubernetes batch Job every time a new version of the +universal image is deployed to the cluster (also when the cluster is initially created). + +It does the following: + +- Converts the preferences in universal into `snapset` Prefs Safes and GPII Keys, +- Optionally deletes the existing database, +- Creates a CouchDB database if none exists, +- Updates the database with respect to its `design/views` document, as required, +- Loads the latest snapsets created into the database. + +## Environment Variables + +- `COUCHDB_URL`: URL of the CouchDB database. (required) +- `CLEAR_INDEX`: If set to `true`, the database at $COUCHDB_URL will be deleted and recreated. (optional) +- `STATIC_DATA_DIR`: The directory where the static data to be loaded into CouchDB resides. (optional) +- `BUILD_DATA_DIR`: The directory where the data built from the conversion step resides. (optional) +The use of environment variables for data directories is useful if you want to mount the database data using a Docker +volume and point the data loader at it. + +Note that since [Docker doesn't support the environment variable type of +array](https://github.com/moby/moby/issues/20169), two separate environment variables are used for inputting data +directories instead of one array that holds these directories. 
+ +## Running + +Example using containers: + +```bash +$ docker run -d -p 5984:5984 --name couchdb couchdb +$ docker run --rm --link couchdb -e COUCHDB_URL=http://couchdb:5984/gpii \ + -e CLEAR_INDEX=true vagrant-universal scripts/deleteAndLoadSnapsets.sh +$ docker run -d -p 8081:8081 --name preferences --link couchdb \ + -e NODE_ENV=gpii.config.preferencesServer.standalone.production \ + -e PREFERENCESSERVER_LISTEN_PORT=8081 -e DATASOURCE_HOSTNAME=http://couchdb \ + -e DATASOURCE_PORT=5984 vagrant-universal +``` + +Below are two versions of loading couchdb data from a different location (e.g. +/home/vagrant/sync/universal/testData/dbData for static data directory and /home/vagrant/sync/universal/build/dbData for +build data directory). The first version has the optional `CLEAR_INDEX` set to true to erase and reset the database +prior to other database changes: + +```bash +$ docker run --name dataloader --link couchdb \ + -v /home/vagrant/sync/universal/testData/dbData:/static_data -e STATIC_DATA_DIR=/static_data \ + -v /home/vagrant/sync/universal/build/dbData:/build_data -e BUILD_DATA_DIR=/build_data \ + -e COUCHDB_URL=http://couchdb:5984/gpii \ + -e CLEAR_INDEX=true vagrant-universal scripts/deleteAndLoadSnapsets.sh +``` + +The second version does not set `CLEAR_INDEX` such that any existing database is left intact prior to subsequent changes +to it (e.g., deleting the snapsets): + +```bash +$ docker run --name dataloader --link couchdb \ + -v /home/vagrant/sync/universal/testData/dbData:/static_data -e STATIC_DATA_DIR=/static_data \ + -v /home/vagrant/sync/universal/build/dbData:/build_data -e BUILD_DATA_DIR=/build_data \ + -e COUCHDB_URL=http://couchdb:5984/gpii \ + vagrant-universal scripts/deleteAndLoadSnapsets.sh +``` diff --git a/documentation/README.md b/documentation/README.md index 2de699630..c3952effe 100644 --- a/documentation/README.md +++ b/documentation/README.md @@ -9,6 +9,7 @@ * [Preferences Server](PreferencesServer.md) * [Data Model for 
Preferences and OAuth Data](DataModel.md) * [Pouch Manager](PouchManager.md) + * [Data Loader](DataLoader.md) * [MatchMakerFramework](MatchMakerFramework.md) * [Flat Match Maker](FlatMatchMaker.md) * [Apptology](Apptology.md) diff --git a/scripts/convertPrefs.js b/scripts/convertPrefs.js index d3429cabc..3c6a44075 100644 --- a/scripts/convertPrefs.js +++ b/scripts/convertPrefs.js @@ -45,7 +45,7 @@ rimraf(targetDir, function () { filenames.forEach(function (filename) { if (filename.endsWith(".json5")) { var gpiiKey = filename.substr(0, filename.length - 6); - var preferences = fs.readFileSync(inputDir + filename, "utf-8"); + var preferences = fs.readFileSync(inputDir + "/" + filename, "utf-8"); var currentTime = new Date().toISOString(); var prefsSafeId = "prefsSafe-" + gpiiKey; @@ -80,11 +80,11 @@ rimraf(targetDir, function () { }); // Write the target files - var prefsSafesFile = targetDir + "prefsSafes.json"; + var prefsSafesFile = targetDir + "/prefsSafes.json"; console.log("prefsSafesFile: " + prefsSafesFile); fs.writeFileSync(prefsSafesFile, JSON.stringify(prefsSafes, null, 4)); - var gpiiKeysFile = targetDir + "gpiiKeys.json"; + var gpiiKeysFile = targetDir + "/gpiiKeys.json"; fs.writeFileSync(gpiiKeysFile, JSON.stringify(gpiiKeys, null, 4)); console.log("Finished converting preferences data in the source directory " + inputDir + " to the target directory " + targetDir); diff --git a/scripts/deleteAndLoadSnapsets.sh b/scripts/deleteAndLoadSnapsets.sh index e35da74de..f27730d07 100755 --- a/scripts/deleteAndLoadSnapsets.sh +++ b/scripts/deleteAndLoadSnapsets.sh @@ -1,8 +1,12 @@ #!/bin/sh +APP_DIR=${APP_DIR:-"/app"} -UNIVERSAL_DIR=${UNIVERSAL_DIR:-/home/node/universal} -STATIC_DATA_DIR=${STATIC_DATA_DIR:-/home/node/universal/testData/dbData} -BUILD_DATA_DIR=${BUILD_DATA_DIR:-/home/node/universal/build/dbData/snapset} +STATIC_DATA_DIR=${STATIC_DATA_DIR:-"${APP_DIR}/testData/dbData"} +PREFERENCES_DATA_DIR=${PREFERENCES_DATA_DIR:-"${APP_DIR}/testData/preferences"} 
+BUILD_DATA_DIR=${BUILD_DATA_DIR:-'/tmp/build/dbData'} + +DATALOADER_JS="${APP_DIR}/scripts/deleteAndLoadSnapsets.js" +CONVERT_JS="${APP_DIR}/scripts/convertPrefs.js" log() { echo "$(date +'%Y-%m-%d %H:%M:%S') - $1" @@ -11,60 +15,71 @@ log() { warm_indices(){ log "Warming indices..." - for view in $(curl -s $COUCHDB_URL/_design/views/ | jq -r '.views | keys[]'); do - curl -fsS $COUCHDB_URL/_design/views/_view/$view >/dev/null + for view in $(curl -s "${COUCHDB_URL}/_design/views/" | jq -r '.views | keys[]'); do + curl -fsS "${COUCHDB_URL}/_design/views/_view/${view}" >/dev/null done log "Finished warming indices..." } # Verify variables -if [ -z "$COUCHDB_URL" ]; then +if [ -z "${COUCHDB_URL}" ]; then echo "COUCHDB_URL environment variable must be defined" exit 1 fi -if [ ! -d "$STATIC_DATA_DIR" -o ! "$(ls -A $STATIC_DATA_DIR/*.json)" ]; then - echo "STATIC_DATA_DIR ($STATIC_DATA_DIR) does not exist or does not contain data, using universal's 'testData/dbData' as the default" - STATIC_DATA_DIR=./testData/dbData -fi +COUCHDB_URL_SANITIZED=$(echo "${COUCHDB_URL}" | sed -e 's,\(://\)[^/]*\(@\),\1\2,g') -if [ ! -d "$BUILD_DATA_DIR" -o ! 
"$(ls -A $BUILD_DATA_DIR/*.json)" ]; then - echo "BUILD_DATA_DIR ($BUILD_DATA_DIR) does not exist or does not contain data, using universal's 'build/dbData/snapset' as the default" - BUILD_DATA_DIR=./build/dbData/snapset -fi +log 'Starting' +log "CouchDB: ${COUCHDB_URL_SANITIZED}" +log "Clear index: ${CLEAR_INDEX}" +log "Static: ${STATIC_DATA_DIR}" +log "Build: ${BUILD_DATA_DIR}" +log "Working directory: $(pwd)" -COUCHDB_URL_SANITIZED=`echo "$COUCHDB_URL" | sed -e 's,\(://\)[^/]*\(@\),\1\2,g'` +# Check we can connect to CouchDB +COUCHDB_URL_ROOT=$(echo "${COUCHDB_URL}" | sed 's/[^\/]*$//g') +RET_CODE=$(curl --write-out '%{http_code}' --silent --output /dev/null "${COUCHDB_URL_ROOT}/_up") +if [ "$RET_CODE" != '200' ]; then + log "[ERROR] Failed to connect to CouchDB: ${COUCHDB_URL_SANITIZED}" + exit 1 +fi -cd "$UNIVERSAL_DIR" -log "Starting" -log "CouchDB: $COUCHDB_URL_SANITIZED" -log "Clear index: $CLEAR_INDEX" -log "Static: $STATIC_DATA_DIR" -log "Build: $BUILD_DATA_DIR" -log "Working directory: `pwd`" +# Create build dir if it does not exist +if [ ! -d "${BUILD_DATA_DIR}" ]; then + mkdir -p "${BUILD_DATA_DIR}" +fi -node scripts/convertPrefs.js testData/preferences/ build/dbData/snapset/ snapset +# Convert preferences json5 to GPII keys and preferences safes +if [ -d "${PREFERENCES_DATA_DIR}" ]; then + node "${CONVERT_JS}" "${PREFERENCES_DATA_DIR}" "${BUILD_DATA_DIR}" snapset; rc=$? + if [ "${rc}" != '0' ]; then + log "[ERROR] ${CONVERT_JS} failed (exit code: ${rc})" + exit 1 + fi +else + log "PREFERENCES_DATA_DIR ($PREFERENCES_DATA_DIR) does not exist, nothing to convert" +fi # Initialize (possibly clear) data base -if [ ! -z "$CLEAR_INDEX" ]; then - log "Deleting database at $COUCHDB_URL_SANITIZED" - if ! curl -fsS -X DELETE "$COUCHDB_URL"; then +if [ "${CLEAR_INDEX}" = 'true' ]; then + log "Deleting database at ${COUCHDB_URL_SANITIZED}" + if ! 
curl -fsS -X DELETE "${COUCHDB_URL}"; then log "Error deleting database" fi fi -log "Creating database at $COUCHDB_URL_SANITIZED" -if ! curl -fsS -X PUT "$COUCHDB_URL"; then - log "Database already exists at $COUCHDB_URL_SANITIZED" +log "Creating database at ${COUCHDB_URL_SANITIZED}" +if ! curl -fsS -X PUT "${COUCHDB_URL}"; then + log "Database already exists at ${COUCHDB_URL_SANITIZED}" fi # Submit data -node scripts/deleteAndLoadSnapsets.js $COUCHDB_URL $STATIC_DATA_DIR $BUILD_DATA_DIR +node "${DATALOADER_JS}" "${COUCHDB_URL}" "${STATIC_DATA_DIR}" "${BUILD_DATA_DIR}" err=$? -if [ $err != 0 ]; then - log "deleteAndLoadSnapsets.js failed with $err, exiting" - exit $err +if [ "${err}" != '0' ]; then + log "${DATALOADER_JS} failed with ${err}, exiting" + exit "${err}" fi # Warm Data diff --git a/scripts/vagrantCloudBasedContainers.sh b/scripts/vagrantCloudBasedContainers.sh index 8c2dc75b3..85c0f2b52 100755 --- a/scripts/vagrantCloudBasedContainers.sh +++ b/scripts/vagrantCloudBasedContainers.sh @@ -35,7 +35,7 @@ COUCHDB_HEALTHCHECK_TIMEOUT=30 if [ "$NO_REBUILD" == "true" ] ; then CLEAR_INDEX= else - CLEAR_INDEX=1 + CLEAR_INDEX='true' fi UNIVERSAL_DIR="/home/vagrant/sync/universal" @@ -45,6 +45,7 @@ BUILD_DATA_DIR="$UNIVERSAL_DIR/build/dbData/snapset" COUCHDB_URL="http://localhost:${COUCHDB_PORT}/gpii" DATASOURCE_HOSTNAME="http://couchdb" +DATALOADER_CMD='/app/scripts/deleteAndLoadSnapsets.sh' GPII_PREFERENCES_CONFIG="gpii.config.preferencesServer.standalone.production" GPII_PREFERENCES_PORT=9081 @@ -81,9 +82,7 @@ docker run -d -p $COUCHDB_PORT:$COUCHDB_PORT --name couchdb $COUCHDB_IMAGE # Wait for CouchDB wget -O /dev/null --retry-connrefused --waitretry=$COUCHDB_HEALTHCHECK_DELAY --read-timeout=20 --timeout=1 --tries=$COUCHDB_HEALTHCHECK_TIMEOUT http://localhost:$COUCHDB_PORT -# Load the CouchDB data -export UNIVERSAL_DIR COUCHDB_URL STATIC_DATA_DIR BUILD_DATA_DIR CLEAR_INDEX -$SCRIPT_DIR/deleteAndLoadSnapsets.sh +docker run --rm --link couchdb -v 
$STATIC_DATA_DIR:/static_data -e STATIC_DATA_DIR=/static_data -v $BUILD_DATA_DIR:/build_data -e BUILD_DATA_DIR=/build_data -e COUCHDB_URL=http://couchdb:${COUCHDB_PORT}/gpii -e CLEAR_INDEX=$CLEAR_INDEX $UNIVERSAL_IMAGE $DATALOADER_CMD # Wait for the CouchDB views become accessible. Accessing the view URL forced the view index to build which take time. # The URL returns 500 when the index is not ready, so use "--retry-on-http-error" option to continue retries at 500 response code.