diff --git a/.env b/.env new file mode 100644 index 00000000..2b42e8d7 --- /dev/null +++ b/.env @@ -0,0 +1,23 @@ +AIRFLOW_UID=502 +AIRFLOW_GID=0 + +API_WORKERS=4 +API_PORT=5551 +API_TIMEOUT=10 + +DATA_DIR=./local_storage + +DUG_LOG_LEVEL=INFO + +ELASTICSEARCH_PASSWORD=12345 +ELASTICSEARCH_HOST=elasticsearch +ELASTICSEARCH_USERNAME=elastic + +NBOOST_API_HOST=nboost + +REDIS_PASSWORD=weak +REDIS_HOST=merge-redis-master +REDIS_PORT=6379 +TRANQL_ACCESS_LOG=access.log +TRANQL_ERROR_LOG=error.log +ROGER_DUG__INPUTS_DATA__SETS=topmed:v1.0 \ No newline at end of file diff --git a/.github/workflows/build-push-dev-image.yml b/.github/workflows/build-push-dev-image.yml new file mode 100644 index 00000000..13f8cfb7 --- /dev/null +++ b/.github/workflows/build-push-dev-image.yml @@ -0,0 +1,86 @@ +# Workflow responsible for the +# development release processes. +# +name: Build-Push-Dev-Image +on: + push: + branches: + - develop + paths-ignore: + - README.md + - .old_cicd/* + - .github/* + - .github/workflows/* + - LICENSE + - .gitignore + - .dockerignore + - .githooks + # Do not build another image on a pull request. + # Any push to develop will trigger a new build however. + pull_request: + branches-ignore: + - '*' + +jobs: + build-push-dev-image: + runs-on: ubuntu-latest + steps: + + - name: Checkout Code + uses: actions/checkout@v3 + with: + ref: ${{ github.head_ref }} + # fetch-depth: 0 means, get all branches and commits + fetch-depth: 0 + + - name: Set short git commit SHA + id: vars + run: | + echo "short_sha=$(git rev-parse --short ${{ github.sha }})" >> $GITHUB_OUTPUT + # https://github.blog/changelog/2022-10-11-github-actions-deprecating-save-state-and-set-output-commands/ + + - name: Confirm git commit SHA output + run: echo ${{ steps.vars.outputs.short_sha }} + + # Docker Buildx is important to caching in the Build And Push Container + # step + # https://github.com/marketplace/actions/build-and-push-docker-images + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + driver-opts: | + network=host + + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + logout: true + + - name: Login to Container Registry + uses: docker/login-action@v3 + with: + registry: containers.renci.org + username: ${{ secrets.CONTAINERHUB_USERNAME }} + password: ${{ secrets.CONTAINERHUB_TOKEN }} + logout: true + + + # Notes on Cache: + # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache + - name: Build Push Container + uses: docker/build-push-action@v5 + with: + context: . + push: true + # Push to renci-registry and dockerhub here. + # cache comes from dockerhub. + tags: | + ${{ github.repository }}:develop + ${{ github.repository }}:${{ steps.vars.outputs.short_sha }} + containers.renci.org/${{ github.repository }}:develop + containers.renci.org/${{ github.repository }}:${{ steps.vars.outputs.short_sha }} + cache-from: type=registry,ref=${{ github.repository }}:buildcache-dev + cache-to: type=registry,ref=${{ github.repository }}:buildcache-dev,mode=max \ No newline at end of file diff --git a/.github/workflows/build-push-release.yml b/.github/workflows/build-push-release.yml new file mode 100644 index 00000000..07b22d21 --- /dev/null +++ b/.github/workflows/build-push-release.yml @@ -0,0 +1,131 @@ +# Workflow responsible for the +# major release processes. 
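+#
+# On pushes to master/main this builds the image, pushes v<semver>, latest and
+# short-SHA tags to DockerHub and containers.renci.org, then tags the commit
+# and creates a GitHub release with generated notes.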
+# + +name: Build-Push-Release +on: + push: + branches: + - master + - main + paths-ignore: + - README.md + - .old_cicd/* + - .github/* + - .github/workflows/* + - LICENSE + - .gitignore + - .dockerignore + - .githooks + tags-ignore: + - '*' +jobs: + build-push-release: + runs-on: ubuntu-latest + steps: + - name: Checkout Code + uses: actions/checkout@v3 + with: + ref: ${{ github.head_ref }} + fetch-depth: 0 + + - name: Set short git commit SHA + id: vars + run: | + echo "short_sha=$(git rev-parse --short ${{ github.sha }})" >> $GITHUB_OUTPUT + # https://github.blog/changelog/2022-10-11-github-actions-deprecating-save-state-and-set-output-commands/ + + - name: Confirm git commit SHA output + run: echo ${{ steps.vars.outputs.short_sha }} + + # https://github.com/marketplace/actions/git-semantic-version + - name: Semver Check + uses: paulhatch/semantic-version@v5.0.3 + id: version + with: + # The prefix to use to identify tags + tag_prefix: "v" + # A string which, if present in a git commit, indicates that a change represents a + # major (breaking) change, supports regular expressions wrapped with '/' + major_pattern: "/breaking:|major:/" + # A string which indicates the flags used by the `major_pattern` regular expression. Supported flags: idgs + major_regexp_flags: "ig" + # Same as above except indicating a minor change, supports regular expressions wrapped with '/' + minor_pattern: "/feat:|feature:|minor:/" + # A string which indicates the flags used by the `minor_pattern` regular expression. Supported flags: idgs + minor_regexp_flags: "ig" + # A string to determine the format of the version output + # version_format: "${major}.${minor}.${patch}-prerelease${increment}" + version_format: "${major}.${minor}.${patch}" + search_commit_body: false + + # Docker Buildx is important to caching in the Build And Push Container + # step + # https://github.com/marketplace/actions/build-and-push-docker-images + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + driver-opts: | + network=host + + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + logout: true + + - name: Login to Container Registry + uses: docker/login-action@v3 + with: + registry: containers.renci.org + username: ${{ secrets.CONTAINERHUB_USERNAME }} + password: ${{ secrets.CONTAINERHUB_TOKEN }} + logout: true + + # Notes on Cache: + # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache + - name: Build Push Container + uses: docker/build-push-action@v5 + with: + push: true + # Push to renci-registry and dockerhub here. + # cache comes from dockerhub. + tags: | + containers.renci.org/${{ github.repository }}:v${{ steps.version.outputs.version }} + containers.renci.org/${{ github.repository }}:latest + containers.renci.org/${{ github.repository }}:${{ steps.vars.outputs.short_sha }} + ${{ github.repository }}:v${{ steps.version.outputs.version }} + ${{ github.repository }}:latest + ${{ github.repository }}:${{ steps.vars.outputs.short_sha }} + cache-from: type=registry,ref=${{ github.repository }}:buildcache-release + cache-to: type=registry,ref=${{ github.repository }}:buildcache-release,mode=max + +#==========================TAG & RELEASE W/ NOTES ========================= + + # Note: GITHUB_TOKEN is autogenerated feature of github app + # which is auto-enabled when using github actions. 
+ # https://docs.github.com/en/actions/security-guides/automatic-token-authentication + # https://docs.github.com/en/rest/git/tags?apiVersion=2022-11-28#create-a-tag-object + # https://docs.github.com/en/rest/git/refs?apiVersion=2022-11-28#create-a-reference + # This creates a "lightweight" ref tag. + - name: Create Tag for Release + run: | + curl \ + -s --fail -X POST \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/${{ github.repository }}/git/refs \ + -d '{"ref":"refs/tags/v${{ steps.version.outputs.version }}","sha":"${{ github.sha }}"}' + +# https://cli.github.com/manual/gh_release_create + - name: Create Release + env: + RELEASE_VERSION: ${{ steps.version.outputs.version }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh release create ${{ env.RELEASE_VERSION }} \ + -t "${{ env.RELEASE_VERSION }}" \ + --generate-notes \ + --latest \ No newline at end of file diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml new file mode 100644 index 00000000..b7f3e6a5 --- /dev/null +++ b/.github/workflows/code-checks.yml @@ -0,0 +1,129 @@ +# Workflow responsible for core acceptance testing. +# Tests Currently Run: +# - flake8-linter +# - PYTest +# - Bandit +# For PR Vulnerability Scanning a separate workflow will run. +# The build-push-dev-image and build-push-release workflows +# handle the develop and release image storage respectively. +# +# + +name: Code-Checks +on: + push: + branches-ignore: + - master + - main + - develop + pull_request: + branches: + - develop + - master + - main + types: [opened, synchronize] + paths-ignore: + - README.md + - .old_cicd/* + - .github/* + - .github/workflows/* + - LICENSE + - .gitignore + - .dockerignore + - .githooks + +jobs: + ############################## flake8-linter ############################## + flake8-linter: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.12" + + # Currently actions/setup-python supports caching + # but the cache is not as robust as cache action. + # Here we cache the entire python env which speeds subsequent builds up alot. (alot being scientific term) + # Ref: https://blog.allenai.org/python-caching-in-github-actions-e9452698e98d + - uses: actions/cache@v3 + name: Cache Python + with: + path: ${{ env.pythonLocation }} + key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('pyproject.toml') }} + + - name: Install Requirements + run: | + pip install -r requirements.txt + + - name: Lint with flake8 + run: | + pip install flake8 + flake8 --ignore=E,W dags + # We continue on error here until the code is clean + # flake8 --ignore=E,W --exit-zero . 
+ continue-on-error: true + + ################################### PYTEST ################################### + # pytest: + # runs-on: ubuntu-latest + # steps: + # - uses: actions/checkout@v3 + # - name: Set up Python + # uses: actions/setup-python@v4 + # with: + # python-version: '3.12' + + # - name: Install Requirements + # run: | + # pip install -r requirements.txt + # pip install coverage + # pip install ./tests + + # - name: Test with pytest + # run: | + # make test + ############################## test-image-build ############################## + test-image-build: + runs-on: ubuntu-latest + # if: ${{ github.actor == 'dependabot[bot]' }} + steps: + - uses: actions/checkout@v3 + + - name: Set short git commit SHA + id: vars + run: | + echo "short_sha=$(git rev-parse --short ${{ github.sha }})" >> $GITHUB_OUTPUT + # https://github.blog/changelog/2022-10-11-github-actions-deprecating-save-state-and-set-output-commands/ + - name: Confirm git commit SHA output + run: echo ${{ steps.vars.outputs.short_sha }} + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + logout: true + + - name: Parse Github Reference Name + id: branch + run: | + REF=${{ github.ref_name }} + echo "GHR=${REF%/*}" >> $GITHUB_OUTPUT + + # Notes on Cache: + # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache + - name: Build Container + uses: docker/build-push-action@v5 + with: + context: . + push: true + tags: | + ${{ github.repository }}:test_${{ steps.branch.outputs.GHR }} + cache-from: type=registry,ref=${{ github.repository }}:buildcache + cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max diff --git a/.github/workflows/trivy-pr-scan.yml b/.github/workflows/trivy-pr-scan.yml new file mode 100644 index 00000000..1e7bc060 --- /dev/null +++ b/.github/workflows/trivy-pr-scan.yml @@ -0,0 +1,67 @@ +name: trivy-pr-scan +on: + pull_request: + branches: + - develop + - master + - main + types: [ opened, synchronize ] + paths-ignore: + - README.md + - .old_cicd/* + - .github/* + - .github/workflows/* + - LICENSE + - .gitignore + - .dockerignore + - .githooks + +jobs: + trivy-pr-scan: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + driver-opts: | + network=host + + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + logout: true + + # Notes on Cache: + # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache + - name: Build Container + uses: docker/build-push-action@v5 + with: + context: . + push: false + load: true + tags: ${{ github.repository }}:vuln-test + cache-from: type=registry,ref=${{ github.repository }}:buildcache + cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max + + # We will not be concerned with Medium and Low vulnerabilities + - name: Run Trivy vulnerability scanner + uses: aquasecurity/trivy-action@master + with: + image-ref: '${{ github.repository }}:vuln-test' + format: 'sarif' + severity: 'CRITICAL,HIGH' + ignore-unfixed: true + output: 'trivy-results.sarif' + exit-code: '1' + # Scan results should be viewable in GitHub Security Dashboard + # We still fail the job if results are found, so below will always run + # unless manually canceled. 
+ - name: Upload Trivy scan results to GitHub Security tab + uses: github/codeql-action/upload-sarif@v2 + if: '!cancelled()' + with: + sarif_file: 'trivy-results.sarif' \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..6c46fe7f --- /dev/null +++ b/.gitignore @@ -0,0 +1,154 @@ +# Git ignore bioler plate from https://github.com/github/gitignore/blob/master/Python.gitignore +.secret-env +.vscode/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.secrets-env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# PyCharm +.idea + +# Rope project settings +.ropeproject + +# Mac +.DS_Store + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# Local output directories +dags/roger/data +local_storage +logs +tests/integration/data/bulk/ diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 00000000..e4e7438b --- /dev/null +++ b/.pylintrc @@ -0,0 +1,4 @@ +[MAIN] +disable=invalid-name, + no-member, + no-value-for-parameter diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..47d2c13f --- /dev/null +++ b/Dockerfile @@ -0,0 +1,33 @@ +FROM bitnami/airflow:2.10.5-debian-12-r7 + +USER root +RUN apt-get update && apt-get install -y git nano vim gcc rustc cargo +#RUN useradd -u 1001 -ms /bin/bash airflow && chown -R airflow /home/airflow +COPY requirements.txt requirements.txt +RUN source /opt/bitnami/airflow/venv/bin/activate && CARGO_HOME=/tmp/.cargo && \ + pip install setuptools wheel && \ + pip install -r requirements.txt + +RUN rm -f requirements.txt + +## Vul patches +## Python lib patches on airflow python env +RUN source /opt/bitnami/airflow/venv/bin/activate pip install --upgrade \ + flask-appbuilder==4.5.3 \ + cryptography==44.0.1 \ + werkzeug==3.0.6 \ + urllib3==2.2.2 +RUN source /opt/bitnami/airflow/venv/bin/activate pip uninstall -y \ + apache-airflow-providers-mysql==6.2.0 + +# Uninstall these from non airflow python env +RUN pip install --upgrade \ + flask-appbuilder==4.5.3 \ + cryptography==44.0.1 \ + werkzeug==3.0.6 \ + urllib3==2.2.2 +RUN apt-get autoremove -y vim +RUN apt-get autoremove -y binutils +RUN apt-get autoremove -y linux-libc-dev + +USER airflow diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..ef227aa4 --- /dev/null +++ b/Makefile @@ -0,0 +1,76 @@ +PYTHON = $(shell which python3) +PYTHONPATH = dags +VERSION_FILE = ./dags/_version.py +VERSION = $(shell cut -d " " -f 3 ${VERSION_FILE}) +DOCKER_REPO = docker.io +DOCKER_OWNER = helxplatform +DOCKER_APP = roger +DOCKER_TAG = ${VERSION} +DOCKER_IMAGE = ${DOCKER_OWNER}/${DOCKER_APP}:$(DOCKER_TAG) + +.DEFAULT_GOAL = help + +.PHONY: help clean install test build image publish + +help: + @grep -E '^#[a-zA-Z\.\-]+:.*$$' $(MAKEFILE_LIST) | tr -d '#' | awk 'BEGIN {FS = ": "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}' + +mk_dirs: + mkdir -p {logs,plugins} + mkdir -p local_storage/elastic + mkdir -p local_storage/redis + +rm_dirs: + rm -rf logs/* + rm -rf local_storage/elastic/* + rm -rf local_storage/redis/* + rm -rf ./dags/roger/data/* + +#install: Install application along with required packages to local environment +install: + ${PYTHON} -m pip install --upgrade pip + ${PYTHON} -m pip install -r requirements.txt + +#test.lint: Run flake8 on the source code +test.lint: + ${PYTHON} -m flake8 dags + +#test.doc: Run doctests in the source code +test.doc: + echo "Running doc tests..." + ${PYTHON} -m pytest --doctest-modules dags/roger + +#test.unit: Run unit tests +test.unit: + ${PYTHON} --version + ${PYTHON} -m pytest tests/unit + +#test.integration: Run unit tests +test.integration: + echo "Running integration tests..." 
+ ${PYTHON} -m pytest tests/integration + +#test: Run all tests +test: test.unit test.integration + +#build: Build the Docker image +build: + echo "Building docker image: ${DOCKER_IMAGE}" + docker build --no-cache -t ${DOCKER_IMAGE} -f Dockerfile . + echo "Successfully built: ${DOCKER_IMAGE}" + +#publish: Push the Docker image +publish: + docker tag ${DOCKER_IMAGE} ${DOCKER_REPO}/${DOCKER_IMAGE} + docker push ${DOCKER_REPO}/${DOCKER_IMAGE} + +#clean: Remove old data +clean: rm_dirs mk_dirs + +#stack.init: Initialize the airflow DB +stack.init: mk_dirs + docker-compose up airflow-init + +#stack: Bring up Airflow and all backend services +stack: stack.init + docker-compose up diff --git a/README.md b/README.md index 0e7038aa..92ed1652 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,24 @@ cd bin make clean install validate ``` +## Quickstart + +You can quickly set up the required dependencies and spin up all the necessary services with: + +```shell +make install +make stack +``` + +Without using make, you can run the necessary commands directly on the shell: + +```shell +mkdir -p {logs,plugins} +mkdir -p local_storage/elastic +docker-compose up airflow-init +docker-compose up +``` + ## Design Roger's is designed to transform data through well defined and transparent phases. @@ -59,8 +77,31 @@ Fetches KGX files according to a data version selecting the set of files to use. Merges nodes duplicated across files aggregating properties from all nodes ### Schema Identify and record the schema (properties) of every edge and node type. +Schema records the type resolved for each property of a node/edge. The **Schema** step generates category +schema file for node schema and predicate schema for edges. In these files properties are collected and +scoped based on type of the edges and nodes found. For instances where properties do not have consistent data +type across a given scope, the following rule is used to resolve to final data type: + +* If the property has fluctuating type among a boolean, a float or an Integer in the same scope, +it's final data type would be a string. +* If conflicting property is ever a string but never a list in the scope, it's final data type will be string. +* If conflicting property is ever a list , it's final data type will be a list. + +Using this approach attributes will be casted based on the resolution set here when loading to the graph database +in subsequent steps. ### Bulk Create Create bulk load CSV files conforming to the Redisgraph Bulk Loader's requirements. +**Bulk create** uses the Schema generated in **Schema** step to generate csv headers +([redis csv headers](https://github.com/RedisGraph/redisgraph-bulk-loader#input-schemas)) with +the assumed types . Currently redis bulk loader requires every column to have a value. +To address this issue, this step groups the entities being processed (edges/nodes) +based on attributes that have values. Then these groups are written into separate csv files. Nodes +are written as csv(s) under `/bulk/nodes` and edges under `/bulk/edges`. +Each csv with these folders has the following naming convention +`.csv--`. +When populating the CSV with values, the appropriate casting is done on the properties to normalize +them to the data types defined in the **Schema** step. + ### Bulk Load Use the bulk loader to load Redisgraph logging statistics on each type of loaded object. ### Validate @@ -492,14 +533,122 @@ Open localhost:8080 in a browser. 
Then run: ``` -python tranql_translator.py +python tranql_translate.py ``` The Airflow interface shows the workflow: ![image](https://user-images.githubusercontent.com/306971/97787955-b968f680-1b8b-11eb-86cc-4d93842eafd3.png) -Use the Trigger icon to run the workflow immediatley. +Use the Trigger icon to run the workflow immediately. + + +### Running Roger in Kubernetes + +Roger supports installing on kubernetes via [Helm](helm.sh). + +### Prerequisites + +#### 1. Setup persistence volume + + Create a pvc(roger-data-pvc) for storing roger Data with the following definition. + +```yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: roger-data-pvc +spec: + storageClassName: + accessModes: + - ReadWriteMany + resources: + requests: + storage: +``` + +Then run : + +```shell script +kubectl -n create -f pvc.yaml +``` + +#### 2. Create git ssh secrets: + +There are two secrets for airflow required for Git syncronization. + +This is used by `airflow.airflow.config.AIRFLOW__KUBERNETES__GIT_SSH_KEY_SECRET_NAME` + ```yaml + kind: Secret + apiVersion: v1 + metadata: + name: airflow-secrets + data: + gitSshKey: >- + + type: Opaque + ``` + +This used by `airflow.dags.git.secret` + +```yaml +kind: Secret +apiVersion: v1 +metadata: + name: airflow-git-keys +data: + id_rsa: + id_rsa.pub: + known_hosts: +type: Opaque +``` + +### Installing + +#### 1. Init helm dependencies + +Navigate to `roger/bin` dir, and run `roger init`. This will initialize helm dependencies for [airflow helm repo](https://airflow-helm.github.io/charts)) +and [redis helm repo](https://github.com/bitnami/charts/tree/master/bitnami/redis#redis). +```shell script +cd bin/ +export NAMESPACE= +export RELEASE_NAME= +export CLUSTER_DOMAIN=cluster.local +./roger init +``` + + +#### 2. Installing + +Run and flow the notes to access the servers. +```shell script +./roger start +``` + +#### 3. Run Roger workflow +In the Notes a port forward command should be printed. Use that to +access airflow UI and run the following steps to run Roger workflow. +The Airflow interface shows the workflow: +![image](https://user-images.githubusercontent.com/45075777/104513185-403f4400-55bd-11eb-9142-cbfd7879504b.png) +Press Trigger to get to the following page: +![image](https://user-images.githubusercontent.com/45075777/104513451-b04dca00-55bd-11eb-837c-65d20d697fff.png) + +Enter the configuration parameters to get to Redis cluster installed in step 2: +```json +{"redisgraph": {"host": "", "port": 6379 , "graph" : "graph-name" }} +``` +And run work flow. +#### 4. Other Commands: + +To shutdown and remove the setup from k8s: +```shell script +./roger stop +``` + +To restart the setup: +```shell script +./roger restart +``` diff --git a/bin/Makefile b/bin/Makefile index 21ba47bb..a9833163 100644 --- a/bin/Makefile +++ b/bin/Makefile @@ -1,61 +1,24 @@ -########################################################## -## -## -## Make the Roger database in phases. -## -## Opertions -## -## get: Fetch versioned knowledge graph exchange -## (KGX) formatted data files. -## -## merge: Merge nodes, consolidating duplicates -## and preserving fields. -## -## schema: Identify the all properties in each -## predicate and node type. -## -## tables: Write tabular formatted data for all -## edges and nodes. -## -## install: Bulk load a Redisgraph instance. -## -## validate: Validate database contents. -## -## clean: Delete all data artifacts. -## -## -########################################################## - -# Root of Roger -ROGER_HOME=$(PWD)/.. 
- -# Path to Roger executable -ROGER=${ROGER_HOME}/bin/roger - -# Location of data -DATA_ROOT=${ROGER_HOME}/roger/data +ROGER_MAKE_DIR=./roger_graph_build +ANNOTATE_MAKE_DIR=./dug_annotate +INDEXING_MAKE_DIR=./dug_indexing + RM=/bin/rm -TIME=/usr/bin/time -clean: - $(RM) -rf $(DATA_ROOT) +DATA_ROOT=${ROGERENV_DATA__ROOT} -get: - $(TIME) $(ROGER) kgx get --data-root $(DATA_ROOT) -merge: get - $(TIME) $(ROGER) kgx merge --data-root $(DATA_ROOT) +clean: + $(RM) -rf $(DATA_ROOT) -schema: merge - $(TIME) $(ROGER) kgx schema --data-root $(DATA_ROOT) -tables: schema - $(TIME) $(ROGER) bulk create --data-root $(DATA_ROOT) +annotate: + make -C ${ANNOTATE_MAKE_DIR} all -install: tables - $(TIME) $(ROGER) bulk load --data-root $(DATA_ROOT) +graph: + make -C ${ROGER_MAKE_DIR} all -validate: - $(TIME) $(ROGER) bulk validate --data-root $(DATA_ROOT) +index: + make -C ${INDEXING_MAKE_DIR} all +all: annotate graph index \ No newline at end of file diff --git a/bin/Readme.md b/bin/Readme.md new file mode 100644 index 00000000..bcade36a --- /dev/null +++ b/bin/Readme.md @@ -0,0 +1,136 @@ +### Running Roger + +This document outlines some of the ways that Roger can be run. + +### Roger Configuration + +Configuration is mainly managed through `roger/roger/config.yaml`. +Each values in this config file can be overridden by shell environment +variables. For instance to override the following : + +``` + kgx: + biolink_model_version: 1.5.0 + dataset_version: v1.0 +``` + +Overridding variables can be exported as: + +```shell script +export ROGERENV_KGX_BIOLINK__MODEL__VERSION=1.6 +export ROGERENV_KGX_DATASET__VERSION=v1.1 +``` +Some things to note are: +* Environment variables should be prefixed by `ROGERENV_` +* Single Underscore `_` character denotes sub-key in the yaml +* Double Underscores `__` are treated as regular underscore +* Keys in yaml are in lower and environment variables that override them should be in upper case. + +### Deploy Script + +`roger/bin/deploy` script can be used to deploy Roger's dependencies in either docker or kubernetes. +For full capabilities use: +```shell script +cd roger/bin +./deploy help +``` + +##### Docker + +For local development we can use docker containers to run backend services that roger depends on. +These are Redis store, Elastic search and Tranql web service. + +Eg: +```shell script +cd roger/bin +./deploy docker config # to display the configuration (port address and passwords) +./deploy docker start # to start +./deploy help # for help on commands +``` + +##### Kubernetes + +For running on k8s we can configure git branch and docker images by exporting: +```shell script +export NAMESPACE=your-namespace +export RELEASE=roger +export CLUSTER_DOMAIN=cluster.local +export WORKING_GIT_BRANCH=develop +``` +deploy using : + +```shell script +cd roger/bin +./deploy k8s config # to display the configuration +./deploy k8s start # to start +./deploy k8s help # for help on commands +``` + +### Local Development + +##### Setup python virtual env + +```shell script +cd roger +python -m venv venv +source venv/bin/activate +pip install -r requirements.txt +``` + +##### Configuration + +Refer to configuration section to override server names and passwords to +passwords etc.. to the backend servers. + +For development there is a dev.env file in `roger/bin/` directory with some start +up variables. Modify as needed. The following command can be used to export them into +shell. 
+```shell script +export $(grep -v'^#' bin/dev.env | xargs 0) +``` + + +##### Run a task + +To run a single task : + +```shell script +python cli.py -l # runs annotatation task +python cli.py -h # see the full list of available arguments. +``` + +##### Using the Makefiles + +Another way to run roger is as a pipeline, where each task is +In `roger/roger/bin/` there is a root make file and in the `roger/roger/bin/dug_annotate`, +`roger/roger/bin/dug_indexing` and `roger/roger/bin/roger_graph_build`. + +Running all pipelines end to end: + +```shell script +cd roger/roger/bin/ +make all +``` + +Running annotation pipeline: + +```shell script +cd roger/roger/bin/ +make annotate +``` + +Running graph pipeline: + +```shell script +cd roger/roger/bin/ +make graph +``` + +Running index pipeline: + +```shell script +cd roger/roger/bin/ +make index +``` + + diff --git a/bin/airk8s b/bin/airk8s deleted file mode 100755 index c6b32aae..00000000 --- a/bin/airk8s +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash - -set -x -set -e - -namespace=${NAMESPACE:-scox} -version=v7.11.0 - -init () { - helm repo add stable https://kubernetes-charts.storage.googleapis.com - helm repo update -} -start () { - helm install "airflow" stable/airflow \ - --version "$version" \ - --namespace "$namespace" \ - --values ./custom-values.yaml -} -status () { - helm status "airflow" --namespace $namespace - echo Scheduler: - kubectl -n $namespace logs $(kubectl get pods | grep airflow-scheduler | awk '{ print $1 }') -c git-sync - echo Worker: - kubectl -n $namespace logs $(kubectl get pods | grep airflow-worker | awk '{ print $1 }') -c git-sync -} -stop () { - helm delete "airflow" --namespace $namespace -} -connect () { - kubectl exec -it \ - --namespace $namespace \ - --container airflow-web \ - Deployment/airflow-web \ - /bin/bash -} -web () { - export NODE_PORT=$(kubectl get --namespace $namespace -o jsonpath="{.spec.ports[0].nodePort}" services airflow-web) - export NODE_IP=$(kubectl get nodes --namespace $namespace -o jsonpath="{.items[0].status.addresses[0].address}") - echo http://$NODE_IP:$NODE_PORT/ - export AIRFLOW_UI=http://$NODE_IP:$NODE_PORT/ -} -gitsecret () { - kubectl create secret generic \ - airflow-git-keys \ - --from-file=id_rsa=$HOME/.ssh/id_rsa \ - --from-file=id_rsa.pub=$HOME/.ssh/id_rsa.pub \ - --from-file=known_hosts=$HOME/.ssh/known_hosts \ - --namespace $namespace -} - -$* - -exit 0 diff --git a/bin/custom-values.yaml b/bin/custom-values.yaml deleted file mode 100644 index e60f3297..00000000 --- a/bin/custom-values.yaml +++ /dev/null @@ -1,157 +0,0 @@ -# -# NOTE: -# - This is intended to be a `custom-values.yaml` starting point for non-production deployment (like minikube) - -# External Dependencies: -# - A PUBLIC git repo for DAGs: ssh://git@repo.example.com:my-airflow-dags.git -# - -################################### -# Airflow - Common Configs -################################### -airflow: - ## the airflow executor type to use - ## - executor: CeleryExecutor -# executor: KubernetesExecutor - - ## the fernet key used to encrypt the connections in the database - ## - fernetKey: "7T512UXSSmBOkpWimFHIVb8jK6lfmSAvx4mO6Arehnc=" - - ## environment variables for the web/scheduler/worker Pods (for airflow configs) - ## - config: - # Security - AIRFLOW__CORE__SECURE_MODE: "True" - AIRFLOW__API__AUTH_BACKEND: "airflow.api.auth.backend.deny_all" - AIRFLOW__WEBSERVER__EXPOSE_CONFIG: "False" - AIRFLOW__WEBSERVER__RBAC: "False" - - # DAGS - AIRFLOW__CORE__LOAD_EXAMPLES: "False" - - ## Disable noisy "Handling signal: 
ttou" Gunicorn log messages - GUNICORN_CMD_ARGS: "--log-level WARNING" - -################################### -# Airflow - Scheduler Configs -################################### -scheduler: - - ## custom airflow connections for the airflow scheduler - ## -# connections: -# - id: my_aws -# type: aws -# extra: | -# { -# "aws_access_key_id": "XXXXXXXXXXXXXXXXXXX", -# "aws_secret_access_key": "XXXXXXXXXXXXXXX", -# "region_name":"eu-central-1" -# } - - ## custom airflow variables for the airflow scheduler - ## - variables: | - { "environment": "dev" } - - ## custom airflow pools for the airflow scheduler - ## - pools: | - { - "example": { - "description": "This is an example pool with 2 slots.", - "slots": 2 - } - } - -################################### -# Airflow - WebUI Configs -################################### -web: - ## configs for the Service of the web Pods - ## - service: - type: NodePort - -################################### -# Airflow - Worker Configs -################################### -workers: - ## the number of workers Pods to run - ## - replicas: 1 - -################################### -# Airflow - DAGs Configs -################################### -dags: - ## configs for the DAG git repository & sync container - ## - git: - ## url of the git repository - ## - #url: "ssh://git@repo.example.com/my-airflow-dags.git" - #url: "ssh://git@github.com/stevencox/airflow.git" - url: "ssh://git@github.com/stevencox/roger.git" - - ## the branch/tag/sha1 which we clone - ## - ref: main - - ## the name of a pre-created secret containing files for ~/.ssh/ - ## - ## NOTE: - ## - this is ONLY RELEVANT for SSH git repos - ## - the secret commonly includes files: id_rsa, id_rsa.pub, known_hosts - ## - known_hosts is NOT NEEDED if `git.sshKeyscan` is true - ## - secret: airflow-git-keys - - ## the name of the private key file in your `git.secret` - ## - ## NOTE: - ## - this is ONLY RELEVANT for PRIVATE SSH git repos - ## - privateKeyName: id_rsa - - ## the host name of the git repo - ## - ## NOTE: - ## - this is ONLY REQUIRED for SSH git repos - ## - ## EXAMPLE: - ## repoHost: "github.com" - ## - repoHost: "github.com" - - ## the port of the git repo - ## - ## NOTE: - ## - this is ONLY REQUIRED for SSH git repos - ## - repoPort: 22 - - ## configs for the git-sync container - ## - gitSync: - ## enable the git-sync sidecar container - ## - enabled: true - - ## the git sync interval in seconds - ## - refreshTime: 60 - -################################### -# Database - PostgreSQL Chart -################################### -postgresql: - enabled: true - -################################### -# Database - Redis Chart -################################### -redis: - enabled: true diff --git a/bin/deploy b/bin/deploy new file mode 100644 index 00000000..47dd6cba --- /dev/null +++ b/bin/deploy @@ -0,0 +1,119 @@ +#!/usr/bin/env bash +# ---- Kubernetes ------ + +k8s () { + + namespace=${NAMESPACE:-} + release=${RELEASE:-roger} + cluster_domain=${CLUSTER_DOMAIN:-cluster.local} + branch=${WORKING_GIT_BRANCH:-develop} + + help () { + echo " + Usage : ./deploy k8s [sub-command] + + Deploys Roger pipeline on kubernetes along airflow. + + Available sub-commands: + - config : view configuration + - init : Initializes helm dependencies for install. + - start : Runs helm upgrade/install. + - stop : Stops running instance. + - restart : Restarts running instance. + - client : If redis is installed on the system, it will try to connect to + " + + } + config() { + echo " + Configuration for k8s instance. 
+ To modify this values export variables with new values. + eg: export NAMESPACE=my-namespace + + NAMESPACE: ${namespace} + RELEASE: ${release} + CLUSTER_DOMAIN: ${cluster_domain} + WORKING_GIT_BRANCH: ${branch} + " + } + init () { + helm dependency update ../helm + } + start () { + init + helm upgrade --install $release \ + --set redis.clusterDomain=$cluster_domain \ + --set airflow.airflow.config.AIRFLOW__KUBERNETES__GIT_BRANCH=$branch \ + --set airflow.dags.git.ref=$branch \ + --namespace=$namespace \ + ../helm + } + stop () { + helm delete $release \ + --namespace=$namespace + } + restart () { + stop + start + } + status () { + helm --namespace=$namespace status $release + } + client () { + redis-cli -h 127.0.0.1 -p 6379 -a $REDIS_PASSWORD + } + $* +} +#---------End Kubernetes------------------- + +#---------Docker-compose ------------------ + +docker() { + COMPOSE_FILE=./docker_backend/docker-compose.yaml + help () { + echo " + Usage: ./deploy docker [subcommand] + + Run docker based backends. + + Available sub-commands: + config: Print contents of ./.env file + init: Export ./.env file contents as shell variables. + start: Runs docker containers up using ${COMPOSE_FILE}. + stop: Stops running docker containers. + restart: Restarts containers. + " + } + config() { + grep -v "^#" dev.env + } + init() { + export $(config | xargs -0) + } + start() { + init + docker-compose -f ${COMPOSE_FILE} up -d + } + stop() { + init + docker-compose -f ${COMPOSE_FILE} down + } + $* +} + +help () { + echo " + Usage : ./deploy [env-type] [subcommand] + + Deploys roger dependencies in docker / k8s + + env-type: either k8s or docker + + Read below for the subcommands avaible or use + ./deploy [env-type] help . + + " + docker help + k8s help +} +$* \ No newline at end of file diff --git a/bin/dev.env b/bin/dev.env new file mode 100644 index 00000000..1653a62d --- /dev/null +++ b/bin/dev.env @@ -0,0 +1,4 @@ +ROGERENV_DATA__ROOT=~/roger-data +ROGERENV_KGX_DATASET__VERSION=test +ROGERENV_ELASTIC__SEARCH_PASSWORD=changeme +ROGERENV_REDISGRAPH_PASSWORD=changeme \ No newline at end of file diff --git a/bin/docker_backend/docker-compose.yaml b/bin/docker_backend/docker-compose.yaml new file mode 100644 index 00000000..d87c6ae6 --- /dev/null +++ b/bin/docker_backend/docker-compose.yaml @@ -0,0 +1,68 @@ +version: '3.0' + +################################################################################# +## +## A service stack for the Roger pipeline. +## +################################################################################# +services: + + ################################################################################# + ## + ## The OpenAPI endpoint for search. This is the only service to be + ## exposed beyond the internal network. + ## + ################################################################################# + tranql: + image: renciorg/tranql-app:0.35 + depends_on: + - redis + restart: always + networks: + - roger-network + environment: + - REDIS_PASSWORD=$ROGERENV_REDISGRAPH_PASSWORD + entrypoint: /usr/local/bin/gunicorn --workers=2 --bind=0.0.0.0:8001 --name=tranql --timeout=600 tranql.api:app + ports: + - 8001:8001 + volumes: + - ./tranql-schema.yaml:/tranql/tranql/conf/schema.yaml + ################################################################################# + ## + ## A search engine providing scalable indexing and full text search. 
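+  ##
+  ## Illustrative health check once the stack is up (assumes the dev.env
+  ## variables are exported in the calling shell):
+  ##   curl -u elastic:$ROGERENV_ELASTIC__SEARCH_PASSWORD localhost:9200/_cluster/health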
+ ## + ################################################################################# + elasticsearch: + image: docker.elastic.co/elasticsearch/elasticsearch:7.6.1 + networks: + - roger-network + environment: + - ELASTIC_PASSWORD=$ROGERENV_ELASTIC__SEARCH_PASSWORD + - discovery.type=single-node + - xpack.security.enabled=true + volumes: + - ./data/elastic:/bitnami/elasticsearch/data + ports: + - '9200:9200' + - '9300:9300' + + ################################################################################# + ## + ## A memory cache for results of high volume service requests. + ## + ################################################################################# + redis: + image: 'redislabs/redisgraph' + networks: + - roger-network + command: redis-server --requirepass ${ROGERENV_REDISGRAPH_PASSWORD} --loadmodule /usr/lib/redis/modules/redisgraph.so + environment: + - REDIS_DISABLE_COMMANDS=FLUSHDB,FLUSHALL + volumes: + - ./data/redis:/data + ports: + - '6379:6379' + +networks: + roger-network: + driver: bridge diff --git a/bin/docker_backend/tranql-schema.yaml b/bin/docker_backend/tranql-schema.yaml new file mode 100644 index 00000000..965d12c3 --- /dev/null +++ b/bin/docker_backend/tranql-schema.yaml @@ -0,0 +1,12 @@ +schema: + redis: + doc: | + Roger is a knowledge graph built by aggregeting several kgx formatted knowledge graphs from several sources. + url: "redis:" + redis: true + redis_connection_params: + # Host here is the service name in the docker composed container. + host: redis + port: 6379 + # SET USERNAME and PASSWORD + # via ROGER_USERNAME , ROGER_PASSWORD Env vars (i.e capitialize service name) diff --git a/bin/dug_annotate/Makefile b/bin/dug_annotate/Makefile new file mode 100644 index 00000000..34d350ce --- /dev/null +++ b/bin/dug_annotate/Makefile @@ -0,0 +1,44 @@ +########################################################## +## +## +## Annotate files using Dug. +## +## Operations +## +## annotate_and_normalize: Annotates Variable files using entity name resolution service with curies. +## +## create_kgx_files: Creates KGX formatted knowledge graphs from annotation result set. +## +## clean: Delete all data artifacts. +## +## +########################################################## + +# Root +THIS_MAKEFILE_PATH:=$(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)) +THIS_DIR:=$(shell cd $(dir $(THIS_MAKEFILE_PATH));pwd) + +ROGER_HOME=${THIS_DIR}/../.. +CLI_WRAPPER=${ROGER_HOME}/cli.py + +# Override Roger data dir ENV +ANNOTATE_DIR=${ROGERENV_DATA__ROOT}/dug/annotations +KGX_DIR=${ROGERENV_DATA__ROOT}/dug/kgx + +RM=/bin/rm +TIME=/usr/bin/time + +clean: + $(RM) -rf ${ANNOTATE_DIR} + $(RM) -rf ${KGX_DIR} + +get_input_files: + $(TIME) python ${CLI_WRAPPER} -gd + +annotate_and_normalize: + $(TIME) python ${CLI_WRAPPER} -l + +create_kgx_files: + $(TIME) python ${CLI_WRAPPER} -t + +all: get_input_files annotate_and_normalize create_kgx_files diff --git a/bin/dug_indexing/Makefile b/bin/dug_indexing/Makefile new file mode 100644 index 00000000..5d015089 --- /dev/null +++ b/bin/dug_indexing/Makefile @@ -0,0 +1,53 @@ +########################################################## +## +## +## Annotate files using Dug. +## +## Operations +## +## annotate_and_normalize: Annotates Variable files using entity name resolution service with curies. +## +## create_kgx_files: Creates KGX formatted knowledge graphs from annotation result set. +## +## clean: Delete all data artifacts. 
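+##
+## Note: the targets actually defined in this Makefile cover the indexing stage:
+##
+## crawl_concepts: Crawl TranQL to expand annotated concepts.
+##
+## index_variables / index_concepts: Index annotated variables and expanded
+## concepts into Elasticsearch; the validate_indexed_* targets spot-check them.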
+## +## +########################################################## + +# Root +THIS_MAKEFILE_PATH:=$(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)) +THIS_DIR:=$(shell cd $(dir $(THIS_MAKEFILE_PATH));pwd) + +ROGER_HOME=${THIS_DIR}/../.. +CLI_WRAPPER=${ROGER_HOME}/cli.py + +# Override Roger data dir ENV +INDEXING_DIR=${ROGERENV_DATA__ROOT}/dug/expanded_concepts +CRAWL_DIR=${ROGERENV_DATA__ROOT}/dug/crawl + + +RM=/bin/rm +TIME=/usr/bin/time + +clean: + $(RM) -rf ${INDEXING_DIR} + $(RM) -rf ${CRAWL_DIR} + +crawl_concepts: + $(TIME) python ${CLI_WRAPPER} -C + +index_concepts: crawl_concepts + $(TIME) python ${CLI_WRAPPER} -ic + +index_variables: + $(TIME) python ${CLI_WRAPPER} -iv + +validate_indexed_concepts: index_concepts + $(TIME) python ${CLI_WRAPPER} -vc + +validate_indexed_variables: index_variables + $(TIME) python ${CLI_WRAPPER} -vv + +all: validate_indexed_concepts validate_indexed_variables + + diff --git a/bin/roger b/bin/roger index 5d96cde8..4626df88 100755 --- a/bin/roger +++ b/bin/roger @@ -1,53 +1,7 @@ +#!/usr/bin/env bash #set -x set -e -namespace=${NAMESPACE:-scox} -release=redisgraph -image_repository=redislabs/redisgraph -image_tag=edge - -# https://github.com/bitnami/charts/tree/master/bitnami/redis -init () { - helm repo add bitnami https://charts.bitnami.com/bitnami -} -start () { - helm install $release \ - --set image.repository=$image_repository \ - --set image.tag=$image_tag \ - --set redis.command="redis-server" \ - --set redis.args="--loadmodule /usr/lib/redis/modules/redisgraph.so" \ - --set master.command="redis-server --loadmodule /usr/lib/redis/modules/redisgraph.so" \ - --set slave.command="redis-server --loadmodule /usr/lib/redis/modules/redisgraph.so" \ - --namespace=$namespace \ - bitnami/redis -} -start () { - helm install $release \ - --set image.repository=$image_repository \ - --set image.tag=$image_tag \ - --namespace=$namespace \ - bitnami/redis -} -stop () { - helm delete $release \ - --namespace=$namespace -} -restart () { - stop - start -} -status () { - kubectl --namespace=$namespace get pods | grep $release - export REDIS_PASSWORD=$(kubectl get secret --namespace $namespace redisgraph -o jsonpath="{.data.redis-password}" | base64 --decode) -} -client () { - #kubectl port-forward --namespace $namespace svc/redisgraph-master 6380:6379 & - redis-cli -h 127.0.0.1 -p 6380 -a $REDIS_PASSWORD -} -#---------------------------- - - - DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" export ROGER_HOME=$( dirname $DIR ) @@ -56,7 +10,7 @@ export PYTHONPATH=$ROGER_HOME:$ROGER_HOME/../kgx export DB_NAME=test roger () { - python $ROGER_HOME/roger/core.py $* + python $ROGER_HOME/dags/roger/core.py $* } kgx () { diff --git a/bin/roger_graph_build/Makefile b/bin/roger_graph_build/Makefile new file mode 100644 index 00000000..f43c9c8c --- /dev/null +++ b/bin/roger_graph_build/Makefile @@ -0,0 +1,66 @@ +########################################################## +## +## +## Make the Roger database in phases. +## +## Opertions +## +## get: Fetch versioned knowledge graph exchange +## (KGX) formatted data files. +## +## merge: Merge nodes, consolidating duplicates +## and preserving fields. +## +## schema: Identify the all properties in each +## predicate and node type. +## +## tables: Write tabular formatted data for all +## edges and nodes. +## +## install: Bulk load a Redisgraph instance. +## +## validate: Validate database contents. +## +## clean: Delete all data artifacts. 
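+##
+## all: Run get, merge, schema, tables and install in order, then validate.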
+## +## +########################################################## + +# Root of Roger +# Root +THIS_MAKEFILE_PATH:=$(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)) +THIS_DIR:=$(shell cd $(dir $(THIS_MAKEFILE_PATH));pwd) + +ROGER_HOME=${THIS_DIR}/../.. + +# Path to Roger executable +CLI_WRAPPER=${ROGER_HOME}/cli.py + +# Location of data +DATA_ROOT=${ROGER_HOME}/roger/data + +RM=/bin/rm +TIME=/usr/bin/time + +clean: + $(RM) -rf $(DATA_ROOT) + +get: + $(TIME) python ${CLI_WRAPPER} -g + +merge: get + $(TIME) python ${CLI_WRAPPER} -m + +schema: merge + $(TIME) python ${CLI_WRAPPER} -s + +tables: schema + $(TIME) python ${CLI_WRAPPER} -b + +install: tables + $(TIME) python ${CLI_WRAPPER} -i + +validate: + $(TIME) python ${CLI_WRAPPER} -a + +all: install validate \ No newline at end of file diff --git a/cli.py b/cli.py new file mode 100644 index 00000000..be77525a --- /dev/null +++ b/cli.py @@ -0,0 +1,112 @@ +import roger.core.base as RogerUtil +from roger.config import config +from roger.logger import get_logger +from dug_helpers.dug_utils import DugUtil, get_topmed_files, get_dbgap_files, get_sparc_files, get_anvil_files, get_nida_files +import sys +import argparse +import os +import time + + +log = get_logger() + +if __name__ == "__main__": + start = time.time() + log.info(f"Start TIME:{start}") + parser = argparse.ArgumentParser(description='Roger common cli tool.') + """ Common CLI. """ + parser.add_argument('-d', '--data-root', help="Root of data hierarchy", default=None) + + """ Roger CLI. """ + parser.add_argument('-v', '--dataset-version', help="Dataset version.", default="v1.0") + parser.add_argument('-g', '--get-kgx', help="Get KGX objects", action='store_true') + parser.add_argument('-s', '--create-schema', help="Infer schema", action='store_true') + parser.add_argument('-m', '--merge-kgx', help="Merge KGX nodes", action='store_true') + parser.add_argument('-b', '--create-bulk', help="Create bulk load", action='store_true') + parser.add_argument('-i', '--insert', help="Do the bulk insert", action='store_true') + parser.add_argument('-a', '--validate', help="Validate the insert", action='store_true') + + """ Dug Annotation CLI. """ + parser.add_argument('-gd', '--get_dug_input_files', help="Gets input files for annotation", + action="store_true") + parser.add_argument('-l', '--load-and-annotate',help="Annotates and normalizes datasets of varaibles.", + action="store_true") + parser.add_argument('-t', '--make-tagged-kg', help="Creates KGX files from annotated variable datesets.", + action="store_true") + + """ Dug indexing CLI . """ + parser.add_argument('-iv', '--index-variables', help="Index annotated variables to elastic search.", + action="store_true") + parser.add_argument('-C', '--crawl-concepts', help="Crawl tranql and index concepts", + action="store_true") + + parser.add_argument('-ic', '--index-concepts', help="Index expanded concepts to elastic search.", + action="store_true") + + parser.add_argument('-vc', '--validate-concepts', help="Validates indexing of concepts", + action="store_true") + + parser.add_argument('-vv', '--validate-variables', help="Validates indexing of variables", + action="store_true") + + args = parser.parse_args () + + if args.data_root is not None: + data_root = args.data_root + config.data_root = data_root + log.info (f"data root:{data_root}") + + # When all lights are on... 
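+    # Illustrative end-to-end run using the flags defined above, e.g.:
+    #   python cli.py -d ./local_storage -gd -l -t -g -m -s -b -i -a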
+ + # Annotation comes first + if args.get_dug_input_files: + get_topmed_files(config) + get_dbgap_files(config) + get_anvil_files(config) + # get_sparc_files(config) + # get_nida_files(config) + + if args.load_and_annotate: + DugUtil.clear_annotation_cached(config=config) + DugUtil.annotate_db_gap_files(config=config) + DugUtil.annotate_topmed_files(config=config) + DugUtil.annotate_anvil_files(config=config) + if args.make_tagged_kg: + DugUtil.make_kg_tagged(config=config) + + # Roger things + if args.get_kgx: + RogerUtil.get_kgx(config=config) + if args.merge_kgx: + RogerUtil.merge_nodes(config=config) + if args.create_schema: + RogerUtil.create_schema(config=config) + if args.create_bulk: + RogerUtil.create_bulk_load(config=config) + if args.insert: + RogerUtil.bulk_load(config=config) + if args.validate: + RogerUtil.validate(config=config) + RogerUtil.check_tranql(config=config) + + # Back to dug indexing + if args.index_variables: + DugUtil.index_variables(config=config) + + if args.validate_variables: + DugUtil.validate_indexed_variables(config=config) + + if args.crawl_concepts: + DugUtil.crawl_tranql(config=config) + + if args.index_concepts: + DugUtil.index_concepts(config=config) + + if args.validate_concepts: + DugUtil.validate_indexed_concepts(config=config) + + end = time.time() + time_elapsed = end - start + log.info(f"Completion TIME:{time_elapsed}") + + sys.exit (0) diff --git a/dags/__init__.py b/dags/__init__.py new file mode 100644 index 00000000..f0aee1ff --- /dev/null +++ b/dags/__init__.py @@ -0,0 +1 @@ +from ._version import version as __version__ diff --git a/dags/_version.py b/dags/_version.py new file mode 100644 index 00000000..adcf54c7 --- /dev/null +++ b/dags/_version.py @@ -0,0 +1,2 @@ +version = "0.10.4" + diff --git a/dags/annotate_and_index.py b/dags/annotate_and_index.py new file mode 100644 index 00000000..884cd149 --- /dev/null +++ b/dags/annotate_and_index.py @@ -0,0 +1,44 @@ +"""DAG which performs Dug annotate and index operations + +This DAG differes slightly from prior versions of the same functionality in +Roger not only in that the annotation and indexing happen in the same DAG, but +also those tasks are broken out into sub-DAGs organized by dataset. Each dataset +has a subdag for all tasks. +""" + +import os + +from airflow.models import DAG +from airflow.operators.empty import EmptyOperator +from roger.tasks import default_args, create_pipeline_taskgroup + +env_enabled_datasets = os.getenv( + "ROGER_DUG__INPUTS_DATA__SETS", "topmed,anvil").split(",") + +with DAG( + dag_id='annotate_and_index', + default_args=default_args, + schedule_interval=None +) as dag: + init = EmptyOperator(task_id="init", dag=dag) + finish = EmptyOperator(task_id="finish", dag=dag) + + from roger import pipelines + from roger.config import config + envspec = os.getenv("ROGER_DUG__INPUTS_DATA__SETS","topmed:v2.0") + data_sets = envspec.split(",") + pipeline_names = {x.split(':')[0]: x.split(':')[1] for x in data_sets} + for pipeline_class in pipelines.get_pipeline_classes(pipeline_names): + # Only use pipeline classes that are in the enabled datasets list and + # that have a properly defined pipeline_name attribute + + # TODO + # Overriding environment variable just to see if this is working. + # name = getattr(pipeline_class, 'pipeline_name', '*not defined*') + # if not name in env_enabled_datasets: + # continue + + # Do the thing to add the pipeline's subdag to the dag in the right way + # . . . 
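+        # For reference: a spec like "topmed:v2.0,anvil:v1.0" (versions illustrative)
+        # is parsed above into {"topmed": "v2.0", "anvil": "v1.0"}.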
+ + init >> create_pipeline_taskgroup(dag, pipeline_class, config) >> finish diff --git a/dags/dug_helpers/__init__.py b/dags/dug_helpers/__init__.py new file mode 100644 index 00000000..9e28ad28 --- /dev/null +++ b/dags/dug_helpers/__init__.py @@ -0,0 +1,3 @@ +from pathlib import Path + +DUG_DATA_DIR = Path(__file__).parent.resolve() / 'dug_data' diff --git a/dags/dug_helpers/dug_utils.py b/dags/dug_helpers/dug_utils.py new file mode 100644 index 00000000..52db9624 --- /dev/null +++ b/dags/dug_helpers/dug_utils.py @@ -0,0 +1,1019 @@ +import asyncio +import hashlib +import logging +import os +import re +import tarfile +import traceback +from functools import reduce +from io import StringIO +from pathlib import Path +from typing import Union, List + +import requests +from dug.core import get_parser, get_annotator, get_plugin_manager, DugConcept +from dug.core.annotators._base import Annotator +from dug.core.concept_expander import ConceptExpander +from dug.core.crawler import Crawler +from dug.core.factory import DugFactory +from dug.core.parsers import Parser, DugElement +from dug.core.async_search import Search +from dug.core.index import Index + +from roger.config import RogerConfig +from roger.core import storage +from roger.models.biolink import BiolinkModel +from roger.logger import get_logger +from utils.s3_utils import S3Utils + +log = get_logger() + + + +class Dug: + + def __init__(self, config: RogerConfig, to_string=True): + self.config = config + self.bl_toolkit = BiolinkModel() + dug_conf = config.to_dug_conf() + self.element_mapping = config.indexing.element_mapping + self.factory = DugFactory(dug_conf) + self.cached_session = self.factory.build_http_session() + self.event_loop = asyncio.new_event_loop() + if to_string: + self.log_stream = StringIO() + self.string_handler = logging.StreamHandler(self.log_stream) + log.addHandler(self.string_handler) + + self.annotator_name: str = config.annotation.annotator_type + + self.tranqlizer: ConceptExpander = self.factory.build_tranqlizer() + + graph_name = self.config["redisgraph"]["graph"] + source = f"redis:{graph_name}" + self.tranql_queries: dict = self.factory.build_tranql_queries(source) + self.node_to_element_queries: list = self.factory.build_element_extraction_parameters(source) + + indexing_config = config.indexing + self.variables_index = indexing_config.get('variables_index') + self.concepts_index = indexing_config.get('concepts_index') + self.kg_index = indexing_config.get('kg_index') + + self.search_obj: Search = self.factory.build_search_obj([ + self.variables_index, + self.concepts_index, + self.kg_index, + ]) + self.index_obj: Index = self.factory.build_indexer_obj([ + self.variables_index, + self.concepts_index, + self.kg_index, + + ]) + + def __enter__(self): + self.event_loop = asyncio.new_event_loop() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + # close elastic search connection + self.event_loop.run_until_complete(self.search_obj.es.close()) + # close async loop + if self.event_loop.is_running() and not self.event_loop.is_closed(): + self.event_loop.close() + if exc_type or exc_val or exc_tb: + traceback.print_exc() + log.error(f"{exc_val} {exc_val} {exc_tb}") + log.exception("Got an exception") + + def annotate_files(self, parser_name, parsable_files, + output_data_path=None): + """ + Annotates a Data element file using a Dug parser. + :param parser_name: Name of Dug parser to use. + :param parsable_files: Files to parse. + :return: None. 
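+        :param output_data_path: Optional output directory; defaults to
+            storage.dug_annotation_path('') when not provided.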
+ """ + dug_plugin_manager = get_plugin_manager() + parser: Parser = get_parser(dug_plugin_manager.hook, parser_name) + annotator: Annotator = get_annotator(dug_plugin_manager.hook, annotator_name=self.annotator_name, config=self.config.to_dug_conf()) + if not output_data_path: + output_data_path = storage.dug_annotation_path('') + log.info("Parsing files") + for parse_file in parsable_files: + log.debug("Creating Dug Crawler object") + crawler = Crawler( + crawl_file=parse_file, + parser=parser, + annotator=annotator, + tranqlizer='', + tranql_queries=[], + http_session=self.cached_session + ) + + # configure output space. + current_file_name = '.'.join(os.path.basename(parse_file).split('.')[:-1]) + elements_file_path = os.path.join(output_data_path, current_file_name) + elements_file_name = 'elements.pickle' + concepts_file_name = 'concepts.pickle' + + # create an empty elements file. This also creates output dir if it doesn't exist. + log.debug(f"Creating empty file: {elements_file_path}/element_file.json") + storage.write_object({}, os.path.join(elements_file_path, 'element_file.json')) + log.debug(parse_file) + log.debug(parser) + elements = parser(parse_file) + log.debug(elements) + crawler.elements = elements + + # @TODO propose for Dug to make this a crawler class init parameter(??) + crawler.crawlspace = elements_file_path + log.debug(f"Crawler annotator: {crawler.annotator}") + crawler.annotate_elements() + + # Extract out the concepts gotten out of annotation + # Extract out the elements + non_expanded_concepts = crawler.concepts + elements = crawler.elements + + # Write pickles of objects to file + log.info(f"Parsed and annotated: {parse_file}") + elements_out_file = os.path.join(elements_file_path, elements_file_name) + storage.write_object(elements, elements_out_file) + log.info(f"Pickled annotated elements to : {elements_file_path}/{elements_file_name}") + concepts_out_file = os.path.join(elements_file_path, concepts_file_name) + storage.write_object(non_expanded_concepts, concepts_out_file) + log.info(f"Pickled annotated concepts to : {elements_file_path}/{concepts_file_name}") + + def make_edge(self, + subj, + obj, + predicate='biolink:related_to', + predicate_label='related to', + relation='biolink:related_to', + relation_label='related to' + ): + """ + Create an edge between two nodes. + + :param subj: The identifier of the subject. + :param pred: The predicate linking the subject and object. + :param obj: The object of the relation. + :param predicate: Biolink compatible edge type. + :param predicate_label: Edge label. + :param relation: Ontological edge type. + :param relation_label: Ontological edge type label. + :returns: Returns and edge. + """ + edge_id = hashlib.md5(f'{subj}{predicate}{obj}'.encode('utf-8')).hexdigest() + return { + "subject": subj, + "predicate": predicate, + "predicate_label": predicate_label, + "id": edge_id, + "relation": relation, + "relation_label": relation_label, + "object": obj, + "provided_by": "renci.bdc.semanticsearch.annotator" + } + + def convert_to_kgx_json(self, elements, written_nodes=set()): + """ + Given an annotated and normalized set of study variables, + generate a KGX compliant graph given the normalized annotations. + Write that grpah to a graph database. + See BioLink Model for category descriptions. 
https://biolink.github.io/biolink-model/notes.html + """ + graph = { + "nodes": [], + "edges": [] + } + edges = graph['edges'] + nodes = graph['nodes'] + + for index, element in enumerate(elements): + # DugElement means a variable (Study variable...) + if not isinstance(element, DugElement): + continue + study_id = element.collection_id + if study_id not in written_nodes: + nodes.append({ + "id": study_id, + "category": ["biolink:Study"], + "name": study_id + }) + written_nodes.add(study_id) + """ connect the study and the variable. """ + edges.append(self.make_edge( + subj=element.id, + relation_label='part of', + relation='BFO:0000050', + obj=study_id, + predicate='biolink:part_of', + predicate_label='part of')) + edges.append(self.make_edge( + subj=study_id, + relation_label='has part', + relation="BFO:0000051", + obj=element.id, + predicate='biolink:has_part', + predicate_label='has part')) + + """ a node for the variable. Should be BL compatible """ + variable_node = { + "id": element.id, + "name": element.name, + "category": ["biolink:StudyVariable"], + "description": element.description.replace("'", '`').replace('\n', ' ') # bulk loader parsing issue + } + if element.id not in written_nodes: + nodes.append(variable_node) + written_nodes.add(element.id) + + for identifier, metadata in element.concepts.items(): + identifier_object = metadata.identifiers.get(identifier) + # This logic is treating DBGap files. + # First item in current DBGap xml files is a topmed tag, + # This is treated as a DugConcept Object. But since its not + # a concept we get from annotation (?) its never added to + # variable.concepts.items (Where variable is a DugElement object) + # The following logic is trying to extract types, and for the + # aformentioned topmed tag it adds `biolink:InfomrmationContentEntity` + # Maybe a better solution could be adding types on DugConcept objects + # More specifically Biolink compatible types (?) + # + if identifier_object: + category = identifier_object.types + elif identifier.startswith("TOPMED.TAG:"): + category = ["biolink:InformationContentEntity"] + else: + continue + if identifier not in written_nodes: + if isinstance(category, str): + bl_element = self.bl_toolkit.toolkit.get_element(category) + category = [bl_element.class_uri or bl_element.slot_uri] + nodes.append({ + "id": identifier, + "category": category, + "name": metadata.name + }) + written_nodes.add(identifier) + # related to edge + edges.append(self.make_edge( + subj=element.id, + obj=identifier + )) + # related to edge + edges.append(self.make_edge( + subj=identifier, + obj=element.id)) + return graph + + def make_tagged_kg(self, elements): + """ Make a Translator standard knowledge graph representing + tagged study variables. + :param variables: The variables to model. + :param tags: The tags characterizing the variables. + :returns: Returns dictionary with nodes and edges modeling a Translator/Biolink KG. + """ + graph = { + "nodes": [], + "edges": [] + } + edges = graph['edges'] + nodes = graph['nodes'] + studies = {} + + """ Create graph elements to model tags and their + links to identifiers gathered by semantic tagging. """ + tag_map = {} + # @TODO extract this into config or maybe dug ?? 
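+        # Only DugConcept objects with this type become tag nodes below; study variables are then added via convert_to_kgx_json.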
+ topmed_tag_concept_type = "TOPMed Phenotype Concept" + nodes_written = set() + for tag in elements: + if not (isinstance(tag, DugConcept) and tag.type == topmed_tag_concept_type): + continue + tag_id = tag.id + tag_map[tag_id] = tag + nodes.append({ + "id": tag_id, + "name": tag.name, + "description": tag.description.replace("'", "`"), + "category": ["biolink:InformationContentEntity"] + }) + """ Link ontology identifiers we've found for this tag via nlp. """ + for identifier, metadata in tag.identifiers.items(): + if isinstance(metadata.types, str): + bl_element = self.bl_toolkit.toolkit.get_element(metadata.types) + category = [bl_element.class_uri or bl_element.slot_uri] + else: + category = metadata.types + synonyms = metadata.synonyms if metadata.synonyms else [] + nodes.append({ + "id": identifier, + "name": metadata.label, + "category": category, + "synonyms": synonyms + }) + nodes_written.add(identifier) + edges.append(self.make_edge( + subj=tag_id, + obj=identifier)) + edges.append(self.make_edge( + subj=identifier, + obj=tag_id)) + + concepts_graph = self.convert_to_kgx_json(elements, written_nodes=nodes_written) + graph['nodes'] += concepts_graph['nodes'] + graph['edges'] += concepts_graph['edges'] + + return graph + + def index_elements(self, elements_file): + log.info(f"Indexing {elements_file}...") + elements = storage.read_object(elements_file) + count = 0 + total = len(elements) + # Index Annotated Elements + log.info(f"found {len(elements)} from elements files.") + for element in elements: + count += 1 + # Only index DugElements as concepts will be indexed differently in next step + if not isinstance(element, DugConcept): + # override data-type with mapping values + if element.type.lower() in self.element_mapping: + element.type = self.element_mapping[element.type.lower()] + self.index_obj.index_element(element, index=self.variables_index) + percent_complete = (count / total) * 100 + if percent_complete % 10 == 0: + log.info(f"{percent_complete} %") + log.info(f"Done indexing {elements_file}.") + + def validate_indexed_elements(self, elements_file): + elements = [x for x in storage.read_object(elements_file) if not isinstance(x, DugConcept)] + # Pick ~ 10 % + sample_size = int(len(elements) * 0.1) + test_elements = elements[:sample_size] # random.choices(elements, k=sample_size) + log.info(f"Picked {len(test_elements)} from {elements_file} for validation.") + for element in test_elements: + # Pick a concept + concepts = [element.concepts[curie] for curie in element.concepts if element.concepts[curie].name] + + if len(concepts): + # Pick the first concept + concept = concepts[0] + curie = concept.id + search_term = re.sub(r'[^a-zA-Z0-9_\ ]+', '', concept.name) + log.debug(f"Searching for Concept: {curie} and Search term: {search_term}") + all_elements_ids = self._search_elements(curie, search_term) + present = element.id in all_elements_ids + if not present: + log.error(f"Did not find expected variable {element.id} in search result.") + log.error(f"Concept id : {concept.id}, Search term: {search_term}") + raise Exception(f"Validation exception - did not find variable {element.id} " + f"from {str(elements_file)}" + f"when searching variable index with" + f" Concept ID : {concept.id} using Search Term : {search_term} ") + else: + log.info( + f"{element.id} has no concepts annotated. Skipping validation for it." 
+ ) + + def _search_elements(self, curie, search_term): + response = self.event_loop.run_until_complete(self.search_obj.search_vars_unscored( + concept=curie, + query=search_term + )) + ids_dict = [] + if 'total_items' in response: + if response['total_items'] == 0: + log.error(f"No search elements returned for variable search: {self.variables_index}.") + log.error(f"Concept id : {curie}, Search term: {search_term}") + raise Exception(f"Validation error - Did not find {curie} for" + f"Search term: {search_term}") + else: + del response['total_items'] + for element_type in response: + all_elements_ids = [e['id'] for e in + reduce(lambda x, y: x + y['elements'], response[element_type], [])] + ids_dict += all_elements_ids + return ids_dict + + def crawl_concepts(self, concepts, data_set_name): + """ + Adds tranql KG to Concepts, terms grabbed from KG are also added as search terms + :param concepts: + :param data_set_name: + :return: + """ + crawl_dir = storage.dug_crawl_path('crawl_output') + output_file_name = os.path.join(data_set_name, 'expanded_concepts.pickle') + extracted_dug_elements_file_name = os.path.join(data_set_name, 'extracted_graph_elements.pickle') + output_file = storage.dug_expanded_concepts_path(output_file_name) + extracted_output_file = storage.dug_expanded_concepts_path(extracted_dug_elements_file_name) + Path(crawl_dir).mkdir(parents=True, exist_ok=True) + extracted_dug_elements = [] + log.debug("Creating Dug Crawler object") + crawler = Crawler( + crawl_file="", + parser=None, + annotator=None, + tranqlizer=self.tranqlizer, + tranql_queries=self.tranql_queries, + http_session=self.cached_session, + ) + crawler.crawlspace = crawl_dir + counter = 0 + total = len(concepts) + for concept_id, concept in concepts.items(): + counter += 1 + try: + crawler.expand_concept(concept) + concept.set_search_terms() + concept.set_optional_terms() + except Exception as e: + log.error(concept) + raise e + for query in self.node_to_element_queries: + log.info(query) + casting_config = query['casting_config'] + tranql_source = query['tranql_source'] + dug_element_type = query['output_dug_type'] + new_elements = crawler.expand_to_dug_element( + concept=concept, + casting_config=casting_config, + dug_element_type=dug_element_type, + tranql_source=tranql_source + ) + log.debug("extracted:") + log.debug(str(list([el.get_searchable_dict() for el in new_elements]))) + extracted_dug_elements += new_elements + concept.clean() + percent_complete = int((counter / total) * 100) + if percent_complete % 10 == 0: + log.info(f"{percent_complete}%") + storage.write_object(obj=concepts, path=output_file) + storage.write_object(obj=extracted_dug_elements, path=extracted_output_file) + + def index_concepts(self, concepts): + log.info("Indexing Concepts") + total = len(concepts) + count = 0 + for concept_id, concept in concepts.items(): + count += 1 + self.index_obj.index_concept(concept, index=self.concepts_index) + # Index knowledge graph answers for each concept + for kg_answer_id, kg_answer in concept.kg_answers.items(): + self.index_obj.index_kg_answer( + concept_id=concept_id, + kg_answer=kg_answer, + index=self.kg_index, + id_suffix=kg_answer_id + ) + percent_complete = int((count / total) * 100) + if percent_complete % 10 == 0: + log.info(f"{percent_complete} %") + log.info("Done Indexing concepts") + + def validate_indexed_concepts(self, elements, concepts): + """ + Validates linked concepts are searchable + :param elements: Annotated dug elements + :param concepts: Crawled (expanded) concepts + 
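        :raises Exception: when validation fails, i.e. a sampled concept's search terms do not return the expected variables from the index.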
:return: + """ + # 1 . Find concepts with KG <= 10% of all concepts, + # <= because we might have no results for some concepts from tranql + sample_concepts = {key: value for key, value in concepts.items() if value.kg_answers} + if len(concepts) == 0: + log.info(f"No Concepts found.") + return + log.info( + f"Found only {len(sample_concepts)} Concepts with Knowledge graph out of {len(concepts)}. {(len(sample_concepts) / len(concepts)) * 100} %") + # 2. pick elements that have concepts in the sample concepts set + sample_elements = {} + for element in elements: + if isinstance(element, DugConcept): + continue + for concept in element.concepts: + # add elements that have kg + if concept in sample_concepts: + sample_elements[concept] = sample_elements.get(concept, set()) + sample_elements[concept].add(element.id) + + # Time for some validation + for curie in concepts: + concept = concepts[curie] + if not len(concept.kg_answers): + continue + search_terms = [] + for key in concept.kg_answers: + kg_object = concept.kg_answers[key] + search_terms += kg_object.get_node_names() + search_terms += kg_object.get_node_synonyms() + # reduce(lambda x,y: x + y, [[node.get("name")] + node.get("synonyms", []) + # for node in concept.kg_answers["knowledge_graph"]["nodes"]], []) + # validation here is that for any of these nodes we should get back + # the variable. + # make unique + search_terms_cap = 10 + search_terms = list(set(search_terms))[:search_terms_cap] + log.debug(f"Using {len(search_terms)} Search terms for concept {curie}") + for search_term in search_terms: + # avoids elastic failure due to some reserved characters + # 'search_phase_execution_exception', 'token_mgr_error: Lexical error ... + search_term = re.sub(r'[^a-zA-Z0-9_\ ]+', '', search_term) + + searched_element_ids = self._search_elements(curie, search_term) + + if curie not in sample_elements: + log.error(f"Did not find Curie id {curie} in Elements.") + log.error(f"Concept id : {concept.id}, Search term: {search_term}") + raise Exception(f"Validation error - Did not find {element.id} for" + f" Concept id : {concept.id}, Search term: {search_term}") + else: + present = bool(len([x for x in sample_elements[curie] if x in searched_element_ids])) + if not present: + log.error(f"Did not find expected variable {element.id} in search result.") + log.error(f"Concept id : {concept.id}, Search term: {search_term}") + raise Exception(f"Validation error - Did not find {element.id} for" + f" Concept id : {concept.id}, Search term: {search_term}") + + def clear_index(self, index_id): + exists = self.search_obj.es.indices.exists(index=index_id) + if exists: + log.info(f"Deleting index {index_id}") + response = self.event_loop.run_until_complete(self.search_obj.es.indices.delete(index=index_id)) + log.info(f"Cleared Elastic : {response}") + log.info("Re-initializing the indicies") + self.index_obj.init_indices() + + def clear_variables_index(self): + self.clear_index(self.variables_index) + + def clear_kg_index(self): + self.clear_index(self.kg_index) + + def clear_concepts_index(self): + self.clear_index(self.concepts_index) + + +class DugUtil(): + + @staticmethod + def clear_annotation_cached(config=None, to_string=False): + with Dug(config, to_string=to_string) as dug: + annotation_path = storage.dug_annotation_path("") + storage.clear_dir(annotation_path) + # Clear http session cache + if config.annotation.clear_http_cache: + dug.cached_session.cache.clear() + + @staticmethod + def annotate_db_gap_files(config=None, to_string=False, 
input_data_path=None, output_data_path=None): + with Dug(config, to_string=to_string) as dug: + if not input_data_path: + files = storage.dug_dd_xml_objects( + input_data_path=input_data_path) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) + parser_name = "DbGaP" + dug.annotate_files(parser_name=parser_name, + parsable_files=files, + output_data_path=output_data_path) + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + @staticmethod + def annotate_anvil_files(config=None, to_string=False, + input_data_path=None, output_data_path=None): + with Dug(config, to_string=to_string) as dug: + if not input_data_path: + files = storage.dug_anvil_objects( + input_data_path=input_data_path) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) + parser_name = "Anvil" + dug.annotate_files(parser_name=parser_name, + parsable_files=files, + output_data_path=output_data_path) + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + @staticmethod + def annotate_cancer_commons_files(config=None, to_string=False, + input_data_path=None, + output_data_path=None): + with Dug(config, to_string=to_string) as dug: + if not input_data_path: + files = storage.dug_crdc_objects( + input_data_path=input_data_path) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) + parser_name = "crdc" + dug.annotate_files(parser_name=parser_name, + parsable_files=files, + output_data_path=output_data_path) + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + @staticmethod + def annotate_kids_first_files(config=None, to_string=False, + input_data_path=None, output_data_path=None): + with Dug(config, to_string=to_string) as dug: + if not input_data_path: + files = storage.dug_kfdrc_objects( + input_data_path=input_data_path) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) + parser_name = "kfdrc" + dug.annotate_files(parser_name=parser_name, + parsable_files=files, + output_data_path=output_data_path) + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + @staticmethod + def annotate_nida_files(config=None, to_string=False, + input_data_path=None, output_data_path=None): + with Dug(config, to_string=to_string) as dug: + if not input_data_path: + files = storage.dug_nida_objects( + input_data_path=input_data_path) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) + parser_name = "NIDA" + dug.annotate_files(parser_name=parser_name, + parsable_files=files, + output_data_path=output_data_path) + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + @staticmethod + def annotate_sparc_files(config=None, to_string=False, + input_data_path=None, output_data_path=None): + with Dug(config, to_string=to_string) as dug: + if not input_data_path: + files = storage.dug_sparc_objects( + input_data_path=input_data_path) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) + parser_name = "SciCrunch" + dug.annotate_files(parser_name=parser_name, + parsable_files=files, + output_data_path=output_data_path) + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + @staticmethod + def annotate_sprint_files(config=None, to_string=False, + input_data_path=None, output_data_path=None): + with Dug(config, to_string=to_string) as dug: + if not input_data_path: + files = 
storage.dug_sprint_objects( + input_data_path=input_data_path) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) + parser_name = "SPRINT" + dug.annotate_files(parser_name=parser_name, + parsable_files=files, + output_data_path=output_data_path) + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + @staticmethod + def annotate_topmed_files(config=None, to_string=False, + input_data_path=None, output_data_path=None): + with Dug(config, to_string=to_string) as dug: + if not input_data_path: + files = storage.dug_topmed_objects( + input_data_path=None) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) + parser_name = "TOPMedTag" + log.info(files) + dug.annotate_files(parser_name=parser_name, + parsable_files=files, + output_data_path=output_data_path) + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + @staticmethod + def annotate_bacpac_files(config=None, to_string=False, + input_data_path=None, output_data_path=None): + + log.info(f"Input data path is: {input_data_path}") + with Dug(config, to_string=to_string) as dug: + files = storage.dug_bacpac_objects( + input_data_path=input_data_path) + + parser_name = "BACPAC" + log.info(files) + dug.annotate_files(parser_name=parser_name, + parsable_files=files, + output_data_path=output_data_path) + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + + @staticmethod + def annotate_heal_study_files(config=None, to_string=False, + input_data_path=None, output_data_path=None): + with Dug(config, to_string=to_string) as dug: + if not input_data_path: + files = storage.dug_heal_study_objects( + input_data_path=None) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) + + parser_name = "heal-studies" + log.info(files) + dug.annotate_files(parser_name=parser_name, + parsable_files=files, + output_data_path=output_data_path) + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + + @staticmethod + def annotate_heal_research_program_files(config=None, to_string=False, + input_data_path=None, + output_data_path=None): + with Dug(config, to_string=to_string) as dug: + if not input_data_path: + files = storage.dug_heal_research_program_objects( + input_data_path=None) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) + parser_name = "heal-research" + log.info(files) + dug.annotate_files(parser_name=parser_name, + parsable_files=files, + output_data_path=output_data_path) + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + @staticmethod + def make_kg_tagged(config=None, to_string=False, input_data_path=None, output_data_path=None): + with Dug(config, to_string=to_string) as dug: + output_base_path = output_data_path + if not output_data_path: + output_base_path = storage.dug_kgx_path("") + storage.clear_dir(output_base_path) + log.info("Starting building KGX files") + if not input_data_path: + elements_files = storage.dug_elements_objects() + else: + import glob + glob_pattern = str(input_data_path / "**" / 'elements.pickle') + elements_files = glob.glob(glob_pattern, recursive=True) + log.info(f"making kgx files for the following pickles: {elements_files}") + for file in elements_files: + elements = storage.read_object(file) + if "topmed_" in file: + kg = dug.make_tagged_kg(elements) + else: + kg = dug.convert_to_kgx_json(elements) + dug_base_file_name = 
file.split(os.path.sep)[-2] + output_file_path = os.path.join(output_base_path, dug_base_file_name + '_kgx.json') + storage.write_object(kg, output_file_path) + log.info(f"Wrote {len(kg['nodes'])} nodes and {len(kg['edges'])} edges, to {output_file_path}.") + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + @staticmethod + def index_variables(config=None, to_string=False): + with Dug(config, to_string=to_string) as dug: + dug.clear_variables_index() + elements_object_files = storage.dug_elements_objects() + for file in elements_object_files: + dug.index_elements(file) + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + @staticmethod + def index_extracted_elements(config=None, to_string=False): + with Dug(config, to_string=to_string) as dug: + elements_object_files = storage.dug_extracted_elements_objects() + for file in elements_object_files: + dug.index_elements(file) + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + @staticmethod + def index_concepts(config=None, to_string=False): + with Dug(config=config, to_string=to_string) as dug: + # These are concepts that have knowledge graphs from tranql + # clear out concepts and kg indicies from previous runs + dug.clear_concepts_index() + dug.clear_kg_index() + expanded_concepts_files = storage.dug_expanded_concept_objects() + for file in expanded_concepts_files: + concepts = storage.read_object(file) + dug.index_concepts(concepts=concepts) + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + @staticmethod + def validate_indexed_variables(config=None, to_string=False): + with Dug(config, to_string=to_string) as dug: + elements_object_files = storage.dug_elements_objects() + for elements_object_file in elements_object_files: + log.info(f"Validating {elements_object_file}") + dug.validate_indexed_elements(elements_object_file) + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + @staticmethod + def crawl_tranql(config=None, to_string=False): + log.info(config.dict) + with Dug(config, to_string=to_string) as dug: + concepts_files = storage.dug_concepts_objects() + crawl_dir = storage.dug_crawl_path('crawl_output') + log.info(f'Clearing crawl output dir {crawl_dir}') + storage.clear_dir(crawl_dir) + expanded_concepts_dir = storage.dug_expanded_concepts_path("") + log.info(f'Clearing expanded concepts dir: {expanded_concepts_dir}') + storage.clear_dir(expanded_concepts_dir) + log.info(f'Crawling Dug Concepts, found {len(concepts_files)} file(s).') + for file in concepts_files: + data_set = storage.read_object(file) + original_variables_dataset_name = os.path.split(os.path.dirname(file))[-1] + dug.crawl_concepts(concepts=data_set, + data_set_name=original_variables_dataset_name) + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + @staticmethod + def validate_indexed_concepts(config=None, to_string=False): + with Dug(config, to_string=to_string) as dug: + get_data_set_name = lambda file: os.path.split(os.path.dirname(file))[-1] + expanded_concepts_files_dict = { + get_data_set_name(file): file for file in storage.dug_expanded_concept_objects() + } + annotated_elements_files_dict = { + get_data_set_name(file): file for file in storage.dug_elements_objects() + } + try: + assert len(expanded_concepts_files_dict) == len(annotated_elements_files_dict) + except: + log.error("Files Annotated Elements files and Expanded concepts files, should be pairs") + 
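                # Work out which side of the pairing is missing so the error log points at the failing upstream task.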
if len(expanded_concepts_files_dict) > len(annotated_elements_files_dict): + log.error("Some Annotated Elements files (from load_and_annotate task) are missing") + else: + log.error("Some Expanded Concepts files (from crawl task) are missing") + log.error(f"Annotated Datasets : {list(annotated_elements_files_dict.keys())}") + log.error(f"Expanded Concepts Datasets: {list(expanded_concepts_files_dict.keys())}") + exit(-1) + for data_set_name in annotated_elements_files_dict: + log.debug(f"Reading concepts and elements for dataset {data_set_name}") + elements_file_path = annotated_elements_files_dict[data_set_name] + concepts_file_path = expanded_concepts_files_dict[data_set_name] + dug_elements = storage.read_object(elements_file_path) + dug_concepts = storage.read_object(concepts_file_path) + log.debug(f"Read {len(dug_elements)} elements, and {len(dug_concepts)} Concepts") + log.info(f"Validating {data_set_name}") + dug.validate_indexed_concepts(elements=dug_elements, concepts=dug_concepts) + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + +class FileFetcher: + + def __init__( + self, + remote_host: str, + remote_dir: Union[str, Path], + local_dir: Union[str, Path] = "." + ): + self.remote_host = remote_host + self.remote_dir = remote_dir.rstrip("/") if isinstance(remote_dir, str) else str(remote_dir.as_posix()) + self.local_dir = Path(local_dir).resolve() + + def __call__(self, remote_file_path: Union[str, Path]) -> Path: + remote_path = self.remote_dir + "/" + remote_file_path + local_path = self.local_dir / remote_file_path + url = f"{self.remote_host}{remote_path}" + log.debug(f"Fetching {url}") + try: + response = requests.get(url, allow_redirects=True) + except Exception as e: + log.error(f"Unexpected {e.__class__.__name__}: {e}") + raise RuntimeError(f"Unable to fetch {url}") + else: + log.debug(f"Response: {response.status_code}") + if response.status_code == 200: + with local_path.open('wb') as file_obj: + file_obj.write(response.content) + return local_path + else: + log.debug(f"Unable to fetch {url}: {response.status_code}") + raise RuntimeError(f"Unable to fetch {url}") + + +def get_versioned_files(config: RogerConfig, data_format, output_file_path, data_store="s3", unzip=False): + """ + Fetches a dug inpu data files to input file directory + """ + meta_data = storage.read_relative_object("../../metadata.yaml") + output_dir: Path = storage.dug_input_files_path(output_file_path) + # clear dir + storage.clear_dir(output_dir) + data_sets = config.dug_inputs.data_sets + log.info(f"dataset: {data_sets}") + pulled_files = [] + s3_utils = S3Utils(config.s3_config) + for data_set in data_sets: + data_set_name, current_version = data_set.split(':') + for item in meta_data["dug_inputs"]["versions"]: + if item["version"] == current_version and item["name"] == data_set_name and item["format"] == data_format: + if data_store == "s3": + for filename in item["files"]["s3"]: + log.info(f"Fetching {filename}") + output_name = filename.split('/')[-1] + output_path = output_dir / output_name + s3_utils.get( + str(filename), + str(output_path), + ) + if unzip: + log.info(f"Unzipping {output_path}") + tar = tarfile.open(str(output_path)) + tar.extractall(path=output_dir) + pulled_files.append(output_path) + else: + for filename in item["files"]["stars"]: + log.info(f"Fetching {filename}") + # fetch from stars + remote_host = config.annotation_base_data_uri + fetch = FileFetcher( + remote_host=remote_host, + remote_dir=current_version, + local_dir=output_dir) + 
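                        # FileFetcher instances are callable; invoking one downloads the remote file and returns its local Path.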
output_path = fetch(filename) + if unzip: + log.info(f"Unzipping {output_path}") + tar = tarfile.open(str(output_path)) + tar.extractall(path=output_dir) + pulled_files.append(output_path) + return [str(filename) for filename in pulled_files] + + +def get_dbgap_files(config: RogerConfig, to_string=False) -> List[str]: + return get_versioned_files(config, 'dbGaP', 'db_gap', data_store=config.dug_inputs.data_source, unzip=True) + + +def get_nida_files(config: RogerConfig, to_string=False) -> List[str]: + return get_versioned_files(config, "nida", "nida", data_store=config.dug_inputs.data_source, unzip=True) + + +def get_sparc_files(config: RogerConfig, to_string=False) -> List[str]: + return get_versioned_files(config, "sparc", "sparc", data_store=config.dug_inputs.data_source, unzip=True) + + +def get_anvil_files(config: RogerConfig, to_string=False) -> List[str]: + return get_versioned_files(config, "anvil", "anvil", data_store=config.dug_inputs.data_source, unzip=True) + + +def get_kids_first_files(config: RogerConfig, to_string=False) -> List[str]: + return get_versioned_files(config, "kfdrc", "kfdrc", data_store=config.dug_inputs.data_source, unzip=True) + + +def get_cancer_data_commons_files(config: RogerConfig, to_string=False) -> List[str]: + return get_versioned_files(config, "crdc", "crdc", data_store=config.dug_inputs.data_source, unzip=True) + + +def get_sprint_files(config: RogerConfig, to_string=False) -> List[str]: + return get_versioned_files(config, "sprint", "sprint", data_store=config.dug_inputs.data_source, unzip=True) + +def get_bacpac_files(config: RogerConfig, to_string=False) -> List[str]: + return get_versioned_files(config, "bacpac", "bacpac", data_store=config.dug_inputs.data_source, unzip=True) + +def get_topmed_files(config: RogerConfig, to_string=False) -> List[str]: + return get_versioned_files(config, "topmed", "topmed", data_store=config.dug_inputs.data_source, unzip=False) + +def get_heal_study_files(config: RogerConfig, to_string=False) -> List[str]: + return get_versioned_files(config, "heal-studies", "heal-study-imports", data_store=config.dug_inputs.data_source, unzip=True) + +def get_heal_research_program_files(config: RogerConfig, to_string=False) -> List[str]: + return get_versioned_files(config, "heal-research", "heal-research-programs", data_store=config.dug_inputs.data_source, unzip=True) diff --git a/dags/knowledge_graph_build.py b/dags/knowledge_graph_build.py new file mode 100644 index 00000000..de28aa78 --- /dev/null +++ b/dags/knowledge_graph_build.py @@ -0,0 +1,102 @@ +# -*- coding: utf-8 -*- +# + +""" +An Airflow workflow for the Roger Translator KGX data pipeline. +""" + +from airflow.models import DAG +from airflow.operators.empty import EmptyOperator +import roger +from roger.tasks import default_args, create_python_task +from roger.config import config + +""" Build the workflow's tasks and DAG. """ +with DAG( + dag_id='knowledge_graph_build', + default_args=default_args, + schedule_interval=None +) as dag: + + """ Build the workflow tasks. """ + intro = EmptyOperator(task_id='Intro') + + # Merge nodes needs inputs from two sources + # 1. baseline and/or CDE KGX files from LakeFS (External repo) + # 2. 
Infer which local kgx files are needed based on dug_inputs and grab them from the current repo + + # build the annotate and index pipeline output locations + #lakefs://yk-heal/main/annotate_and_index/crdc_dataset_pipeline_task_group.make_kgx_crdc/ + working_repo = config.lakefs_config.repo + branch = config.lakefs_config.branch + kgx_repos = config.kgx.data_sets + input_repos = [{ + 'name': repo.split(':')[0], + 'branch': repo.split(':')[1], + 'path': '*' + } for repo in kgx_repos] + + # Figure out a way to extract paths + get_path_on_lakefs = lambda d: f"annotate_and_index/{d}_dataset_pipeline_task_group.make_kgx_{d}/" + + + for dataset in config.dug_inputs.data_sets: + dataset_name = dataset.split(":")[0] + # add datasets from the other pipeline + input_repos.append( + { + 'name': working_repo, + 'branch': branch, + 'path': get_path_on_lakefs(dataset_name) + } + ) + + merge_nodes = create_python_task (dag, name="MergeNodes", + a_callable=roger.merge_nodes, + external_repos=input_repos + ) + + # The rest of these guys can just operate on the local lakefs repo/branch + # we need to add input dir and output dir similar to what we did for dug tasks + + create_nodes_schema = create_python_task(dag, + name="CreateNodesSchema", + a_callable=roger.create_nodes_schema + ) + create_edges_schema = create_python_task(dag, + name="CreateEdgesSchema", + a_callable=roger.create_edges_schema) + + create_bulk_load_nodes = create_python_task(dag, + name="CreateBulkLoadNodes", + a_callable=roger.create_bulk_nodes) + create_bulk_load_edges = create_python_task(dag, + name="CreateBulkLoadEdges", + a_callable=roger.create_bulk_edges) + bulk_load = create_python_task(dag, + name="BulkLoad", + a_callable=roger.bulk_load, + no_output_files=True) + check_tranql = create_python_task(dag, + name="CheckTranql", + a_callable=roger.check_tranql, + no_output_files=True) + validate = create_python_task(dag, + name="Validate", + a_callable=roger.validate, + no_output_files=True) + + + """ Build the DAG. 
""" + merge_nodes.set_upstream(intro) + create_nodes_schema.set_upstream(merge_nodes) + create_edges_schema.set_upstream(merge_nodes) + create_bulk_load_nodes.set_upstream(create_nodes_schema) + create_bulk_load_nodes.set_upstream(merge_nodes) + create_bulk_load_edges.set_upstream(create_edges_schema) + create_bulk_load_edges.set_upstream(merge_nodes) + bulk_load.set_upstream(create_bulk_load_nodes) + bulk_load.set_upstream(create_bulk_load_edges) + validate.set_upstream(bulk_load) + check_tranql.set_upstream(bulk_load) + diff --git a/dags/metadata.yaml b/dags/metadata.yaml new file mode 100644 index 00000000..0cedb6a0 --- /dev/null +++ b/dags/metadata.yaml @@ -0,0 +1,206 @@ +kgx: + versions: + - files: + - biolink-v1.0.json + - ctd-v1.0.json + - gtopdb-v1.0.json + - hetio-v1.0.json + - hgnc-v1.0.json + - hmdb-v1.0.json + - kegg-v1.0.json + - mychem-v1.0.json + - ontological-hierarchy-v1.0.json + - panther-v1.0.json + - foodb-v1.0.json + - pharos-v1.0.json + - intact-v1.0.json + - human-goa-v1.0.json + - uberongraph-v1.0.json + - viral-proteome-v1.0.json + version: v1.0 + name: baseline-graph + format: json + - files: + - biolink-v2.0.json + - ctd-v2.0.json + - gtopdb-v2.0.json + - hetio-v2.0.json + - hgnc-v2.0.json + - hmdb-v2.0.json + - kegg-v2.0.json + - mychem-v2.0.json + - ontological-hierarchy-v2.0.json + - panther-v2.0.json + - foodb-v2.0.json + - pharos-v2.0.json + - intact-v2.0.json + - human-goa-v2.0.json + - uberongraph-v2.0.json + - viral-proteome-v2.0.json + version: v2.0 + name: baseline-graph + format: json + - files: + - heal/sparc/curation-export-processed.json + version: v2.0 + name: sparc-kgx + format: json + - files: + - Biolink_edges_v3.0.jsonl + - Biolink_nodes_v3.0.jsonl + - CTD_edges_v3.0.jsonl + - CTD_nodes_v3.0.jsonl + - DrugCentral_edges_v3.0.jsonl + - DrugCentral_nodes_v3.0.jsonl + - GtoPdb_edges_v3.0.jsonl + - GtoPdb_nodes_v3.0.jsonl + - Hetio_edges_v3.0.jsonl + - Hetio_nodes_v3.0.jsonl + - HGNC_edges_v3.0.jsonl + - HGNC_nodes_v3.0.jsonl + - HMDB_edges_v3.0.jsonl + - HMDB_nodes_v3.0.jsonl + - HumanGOA_edges_v3.0.jsonl + - HumanGOA_nodes_v3.0.jsonl + - IntAct_edges_v3.0.jsonl + - IntAct_nodes_v3.0.jsonl + - OntologicalHierarchy_edges_v3.0.jsonl + - OntologicalHierarchy_nodes_v3.0.jsonl + - PANTHER_edges_v3.0.jsonl + - PANTHER_nodes_v3.0.jsonl + - PHAROS_edges_v3.0.jsonl + - PHAROS_nodes_v3.0.jsonl + - UberGraph_edges_v3.0.jsonl + - UberGraph_nodes_v3.0.jsonl + version: v3.0 + name: baseline-graph + format: jsonl + - version: test + files: + - panther.json + name: test + - version: v3.0 + name: cde-graph + format: jsonl + files: + - cde/annotated_edges_v3.0.jsonl + - cde/annotated_nodes_v3.0.jsonl + - version: v4.0 + name: baseline-graph + format: jsonl + files: + - baseline-4.0/edges_v4.0.jsonl + - baseline-4.0/nodes_v4.0.jsonl + - version: v4.0 + name: cde-graph + format: jsonl + files: + - cde/annotated_edges_v4.0.jsonl + - cde/annotated_nodes_v4.0.jsonl + - version: v5.0 + name: baseline-graph + format: jsonl + files: + - baseline-5.0/edges_v5.0.jsonl + - baseline-5.0/nodes_v5.0.jsonl + - version: v5.0 + name: cde-graph + format: jsonl + files: + - cde/annotated_edges_v5.0.jsonl + - cde/annotated_nodes_v5.0.jsonl +dug_inputs: + versions: + - name: bdc + version: v1.0 + files: + s3: + - "bdc/v1.0/bdc_dbgap_data_dicts.tar.gz" + stars: + - "bdc_dbgap_data_dicts.tar.gz" + format: dbGaP + - name: bdc + version: v2.0 + files: + s3: + - "bdc/v2.0/bdc_dbgap_data_dicts.tar.gz" + stars: + - "bdc_dbgap_data_dicts.tar.gz" + format: dbGaP + - name: bdc + version: v3.0 + 
files: + s3: + - "bdc/v3.0/bdc_dbgap_data_dicts.tar.gz" + format: dbGaP + - name: nida + version: v1.0 + files: + s3: + - "nida/v1.0/nida-12studies.tar.gz" + stars: + - "nida-12studies.tar.gz" + format: nida + - name: sparc + version: v1.0 + files: + s3: + - "sparc/v1.0/sparc-dbgap-xml-formatted.tar.gz" + stars: + - "sparc-dbgap-xml-formatted.tar.gz" + format: sparc + - name: topmed + version: v2.0 + files: + s3: + - "topmed/v2.0/topmed_tags_v2.0.json" + - "topmed/v2.0/topmed_variables_v2.0.csv" + stars: + - topmed_variables_v2.0.csv + - topmed_tags_v2.0.json + format: topmed + - name: anvil + version: v1.0 + files: + s3: + - "bdc/v1.0/anvil_dbgap_data_dicts.tar.gz" + stars: + - "anvil_dbgap_data_dicts.tar.gz" + format: anvil + - name: kfdrc + version: v1.0 + files: + s3: + - "bdc/v1.0/KFDRC.tar.gz" + format: kfdrc + - name: crdc + version: v1.0 + files: + s3: + - "bdc/v1.0/CRDC.tar.gz" + format: crdc + - name: sprint + version: v1.0 + files: + s3: + - "sprint/v1.0/StanfordSPRINT_DataDictionary_2020-12-16.tar.gz" + format: sprint + - name: bacpac + version: v1.0 + files: + s3: + - "heal-datasets/bacpac/bacpac_baseline_do_measures.tar.gz" + format: bacpac + - name: heal-studies + version: v1.0 + files: + s3: + - heal-datasets/ingest-8-23/heal_studies.tar.gz + - heal-datasets/ingest-8-23/heal_mds_import.tar.gz + format: heal-studies + - name: heal-research-programs + version: v1.0 + files: + s3: + - heal-datasets/ingest-8-23/heal_research_programs.tar.gz + format: heal-research diff --git a/dags/roger/__init__.py b/dags/roger/__init__.py new file mode 100644 index 00000000..e950b109 --- /dev/null +++ b/dags/roger/__init__.py @@ -0,0 +1,20 @@ +"Roger: an automated graph data curation pipeline." + +from roger.core.base import ( + Roger, + roger_cli, + get_kgx, + create_schema, + create_edges_schema, + create_nodes_schema, + merge_nodes, + create_bulk_load, + create_bulk_nodes, + create_bulk_edges, + bulk_load, + validate, + check_tranql, +) + +if __name__ == "__main__": + roger_cli() diff --git a/dags/roger/components/__init__.py b/dags/roger/components/__init__.py new file mode 100644 index 00000000..49314f47 --- /dev/null +++ b/dags/roger/components/__init__.py @@ -0,0 +1 @@ +"Data conversion utilities" diff --git a/dags/roger/components/data_conversion.py b/dags/roger/components/data_conversion.py new file mode 100644 index 00000000..46dd61fe --- /dev/null +++ b/dags/roger/components/data_conversion.py @@ -0,0 +1,71 @@ +"Data conversion utility methods" + +from typing import Any + + +_type_map = { + list.__name__: { + 'priority': 0, + 'constructor': lambda x: list([x]) + }, + str.__name__: { + 'priority': 1, + 'constructor': lambda x: str(x) + }, + bool.__name__: { + 'priority': 2, + 'constructor': lambda x: True if x else False + }, + float.__name__: { + 'priority': 2, + 'constructor': lambda x: float(x), + }, + int.__name__: { + 'priority': 2, + 'constructor': lambda x: int(x) + }, + type(None).__name__: { + 'priority': 3, + 'constructor': lambda x: '', + } +} + +def cast(value: Any, to_type: str): + """ + Parses a value to dest type. + :param value: value to parse + :param to_type: destination type + :return: parsed value + """ + if to_type not in _type_map: + raise TypeError( + f'Type {to_type} not found in conversion map. 
' + f'Available types are {_type_map.keys()}') + dest_type_constructor = _type_map[to_type]['constructor'] + return dest_type_constructor(value) + +def compare_types(data_type: str, data_type_2: str): + """ + Of two python types selects the one we would like to upcast to. + :param data_type: + :param data_type_2: + :return: + """ + assert data_type in _type_map, ( + f"Unrecognised type {data_type} From types:" + f"{list(_type_map.keys())}") + + assert data_type_2 in _type_map, ( + f"Unrecognised type {data_type} From types: " + f"{list(_type_map.keys())}") + + d1_val = _type_map[data_type]['priority'] + d2_val = _type_map[data_type_2]['priority'] + + if data_type != data_type_2 and d1_val == d2_val: + # For float int and bool have same priority + # treat them as strings. + d1_val = (d1_val - 1) + data_type = str.__name__ + + return data_type if d1_val < d2_val else data_type_2 diff --git a/dags/roger/components/data_conversion_utils.py b/dags/roger/components/data_conversion_utils.py new file mode 100644 index 00000000..f6f60eb0 --- /dev/null +++ b/dags/roger/components/data_conversion_utils.py @@ -0,0 +1,69 @@ +from typing import Any + + +class TypeConversionUtil: + + type_map = { + list.__name__: { + 'priority': 0, + 'constructor': lambda x: list([x]) + }, + str.__name__: { + 'priority': 1, + 'constructor': lambda x: str(x) + }, + bool.__name__: { + 'priority': 2, + 'constructor': lambda x: True if x else False + }, + float.__name__: { + 'priority': 2, + 'constructor': lambda x: float(x), + }, + int.__name__: { + 'priority': 2, + 'constructor': lambda x: int(x) + }, + type(None).__name__: { + 'priority': 3, + 'constructor': lambda x: '', + } + } + + @staticmethod + def cast(value: Any, to_type: str): + """ + Parses a value to dest type. + :param value: value to parse + :param to_type: destination type + :return: parsed value + """ + if to_type not in TypeConversionUtil.type_map: + raise TypeError(f'Type {to_type} not found in conversion map. Available types are {TypeConversionUtil.type_map.keys()}') + dest_type_constructor = TypeConversionUtil.type_map[to_type]['constructor'] + return dest_type_constructor(value) + + @staticmethod + def compare_types(data_type: str, data_type_2: str): + """ + Of two python types selects the one we would like to upcast to. + :param data_type: + :param data_type_2: + :return: + """ + assert data_type in TypeConversionUtil.type_map, f"Unrecognised type {data_type} From types:" \ + f"{list(TypeConversionUtil.type_map.keys())}" + + assert data_type_2 in TypeConversionUtil.type_map, f"Unrecognised type {data_type} From types: " \ + f"{list(TypeConversionUtil.type_map.keys())}" + + d1_val = TypeConversionUtil.type_map[data_type]['priority'] + d2_val = TypeConversionUtil.type_map[data_type_2]['priority'] + + if data_type != data_type_2 and d1_val == d2_val: + # For float int and bool have same priority + # treat them as strings. 
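+            # Lowering d1_val by one gives it str's priority, so the comparison below selects str.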
+ d1_val = (d1_val - 1) + data_type = str.__name__ + + return data_type if d1_val < d2_val else data_type_2 diff --git a/dags/roger/config/__init__.py b/dags/roger/config/__init__.py new file mode 100644 index 00000000..71111f39 --- /dev/null +++ b/dags/roger/config/__init__.py @@ -0,0 +1,399 @@ +import json +import os +import warnings +from dataclasses import dataclass, field +from pathlib import Path +from typing import Dict, Optional, List + +import yaml +from dug.config import Config as DugConfig +from flatten_dict import flatten, unflatten + +from ._base import DictLike +from .s3_config import S3Config + +CONFIG_FILENAME = Path(__file__).parent.resolve() / "config.yaml" + +@dataclass +class RedisConfig(DictLike): + username: str = "" + password: str = "" + host: str = "redis" + graph: str = "test" + port: int = 6379 + + def __post_init__(self): + self.port = int(self.port) + + +@dataclass +class LakefsConfig(DictLike): + host: str + access_key_id: str + secret_access_key: str + branch: str + repo: str + enabled: bool = False + + def __post_init__(self): + if isinstance(self.enabled, str): + self.enabled = self.enabled.lower() == "true" + + + +@dataclass +class LoggingConfig(DictLike): + level: str = "DEBUG" + format: str = '[%(name)s][%(filename)s][%(lineno)d][%(funcName)20s] %(levelname)s: %(message)s' + + +@dataclass +class KgxConfig(DictLike): + biolink_model_version: str = "1.5.0" + merge_db_temp_dir: str = "workspace" + data_sets: List = field(default_factory=lambda: ['baseline-graph:v5.0']) + + def __post_init__(self): + # Convert strings to list. In cases where this is passed as env variable with a single value + # cast it to a list. eg ROGER_KGX_DATA__SET="spark,baseline-data" could be converted to + # config.kgx.data_set = ["spark", "baseline-data"] + self.data_sets = [data_set.strip(" ") for data_set in self.data_sets.split(",")] \ + if isinstance(self.data_sets, str) else self.data_sets + + +@dataclass +class DugInputsConfig(DictLike): + data_source: str = 'stars' + data_sets: List = field(default_factory=lambda: ['topmed', 'bdc']) + + def __post_init__(self): + # Convert strings to list. In cases where this is passed as env variable with a single value + # cast it to a list. 
eg ROGER_KGX_DATA__SET="spark,baseline-data" could be converted to + # config.kgx.data_set = ["spark", "baseline-data"] + self.data_sets = [data_set.strip(" ") for data_set in self.data_sets.split(",")] \ + if isinstance(self.data_sets, str) else self.data_sets + + +@dataclass +class BulkLoaderConfig(DictLike): + separator: str = "0x1E" + enforce_schema: bool = False + skip_invalid_nodes: bool = False + skip_invalid_edges: bool = False + quote: int = 0 + max_token_count: int = 1024 + max_buffer_size: int = 2048 + max_token_size: int = 500 + index: list = field(default_factory=list) + full_text_index: list = field(default_factory=list) + + +@dataclass +class AnnotationConfig(DictLike): + annotator_type: str = "monarch" + annotator_args: dict = field( + default_factory=lambda: { + "monarch": { + "url": "https://api.monarchinitiative.org/api/nlp/annotate/entities?min_length=4&longest_only=false&include_abbreviation=false&include_acronym=false&include_numbers=false&content=" + }, + "sapbert": { + "classification_url": "https://med-nemo.apps.renci.org/annotate/", + "annotator_url": "https://babel-sapbert.apps.renci.org/annotate/", + "score_threshold": 0.8, + "bagel": { + "enabled": False, + "url": "https://bagel.apps.renci.org/group_synonyms_openai", + "prompt": "bagel/ask_classes", + "llm_args": { + "llm_model_name": "gpt-4o-2024-05-13", + "organization": "", + "access_key": "", + "llm_model_args": { + "top_p": 0, + "temperature": 0.1 + } + } + } + }, + } + ) + normalizer: str = "https://nodenormalization-sri.renci.org/get_normalized_nodes?curie=" + synonym_service: str = "https://onto.renci.org/synonyms/" + ontology_metadata: str = "https://api.monarchinitiative.org/api/bioentity/" + clear_http_cache: bool = False + preprocessor: dict = field(default_factory=lambda: + { + "debreviator": { + "BMI": "body mass index" + }, + "stopwords": "the", + } + ) + + ontology_greenlist: List[str] = field(default_factory=lambda: [ + "PATO", "CHEBI", "MONDO", "UBERON", "HP", "MESH", "UMLS" + ]) + + def __post_init__(self): + self.annotator_args["sapbert"]["bagel"]["enabled"] = self.annotator_args["sapbert"]["bagel"][ + "enabled"].lower() == "true" + + +@dataclass +class IndexingConfig(DictLike): + variables_index: str = "variables_index" + concepts_index: str = "concepts_index" + kg_index: str = "kg_index" + tranql_min_score: float = 0.2 + excluded_identifiers: List[str] = field(default_factory=lambda: [ + "CHEBI:17336" + ]) + + queries: dict = field(default_factory=lambda: { + "disease": ["disease", "phenotypic_feature"], + "pheno": ["phenotypic_feature", "disease"], + "anat": ["disease", "anatomical_entity"], + "chem_to_disease": ["chemical_substance", "disease"], + "phen_to_anat": ["phenotypic_feature", "anatomical_entity"], + "anat_to_disease": ["anatomical_entity", "disease"], + "anat_to_pheno": ["anatomical_entity", "phenotypic_feature"], + }) + tranql_endpoint: str = "http://tranql-service/tranql/query?dynamic_id_resolution=true&asynchronous=false" + # by default skips node to element queries + node_to_element_queries: dict = field(default_factory=lambda: {}) + element_mapping: str = "" + def __post_init__(self): + # convert element mapping to dict + if self.element_mapping and len(self.element_mapping.split(',')): + final_element_mapping = {} + for mapping in self.element_mapping.split(','): + if not mapping: + continue + original_name = mapping.split(':')[0].lower().strip() + final_name = mapping.split(':')[1].strip() + final_element_mapping[original_name] = final_name + self.element_mapping = 
final_element_mapping + node_to_el_enabled = True if str(self.node_to_element_queries.get("enabled")).lower() == "true" else False + final_node_to_element_queries = {} + if node_to_el_enabled: + for key in filter(lambda k: k != "enabled", self.node_to_element_queries.keys()): + final_node_to_element_queries[key] = self.node_to_element_queries[key] + self.node_to_element_queries = final_node_to_element_queries + +@dataclass +class ElasticsearchConfig(DictLike): + host: str = "elasticsearch" + username: str = "elastic" + password: str = "" + nboost_host: str = "" + scheme: str = "http" + ca_path: str = "" + + + +class RogerConfig(DictLike): + + OS_VAR_PREFIX = "ROGER_" + + def __init__(self, **kwargs): + self.redisgraph = RedisConfig(**kwargs.pop('redisgraph', {})) + self.logging = LoggingConfig(**kwargs.pop('logging', {})) + self.kgx = KgxConfig(**kwargs.pop('kgx', {})) + self.dug_inputs = DugInputsConfig(**kwargs.pop('dug_inputs', {})) + self.bulk_loader = BulkLoaderConfig(**kwargs.pop('bulk_loader', {})) + self.annotation = AnnotationConfig(**kwargs.pop('annotation', {})) + self.indexing = IndexingConfig(**kwargs.pop('indexing', {})) + self.elasticsearch = ElasticsearchConfig(**kwargs.pop('elasticsearch')) + self.s3_config = S3Config(**kwargs.pop('s3', {})) + + self.data_root: str = kwargs.pop("data_root", "") + self.dug_data_root: str = kwargs.pop("dug_data_root", "") + self.kgx_base_data_uri: str = kwargs.pop("kgx_base_data_uri", "") + self.annotation_base_data_uri: str = kwargs.pop("annotation_base_data_uri", "") + self.validation = kwargs.pop("validation") + self.dag_run = kwargs.pop('dag_run', None) + self.lakefs_config = LakefsConfig(**kwargs.pop("lakefs_config")) + + def to_dug_conf(self) -> DugConfig: + return DugConfig( + elastic_host=self.elasticsearch.host, + elastic_password=self.elasticsearch.password, + elastic_username=self.elasticsearch.username, + elastic_scheme=self.elasticsearch.scheme, + elastic_ca_path=self.elasticsearch.ca_path, + redis_host=self.redisgraph.host, + redis_password=self.redisgraph.password, + redis_port=self.redisgraph.port, + nboost_host=self.elasticsearch.nboost_host, + preprocessor=self.annotation.preprocessor, + annotator_type=self.annotation.annotator_type, + annotator_args=self.annotation.annotator_args, + normalizer={ + 'url': self.annotation.normalizer, + }, + synonym_service={ + 'url': self.annotation.synonym_service, + }, + ontology_helper={ + 'url': self.annotation.ontology_metadata, + }, + tranql_exclude_identifiers=self.indexing.excluded_identifiers, + tranql_queries=self.indexing.queries, + concept_expander={ + 'url': self.indexing.tranql_endpoint, + 'min_tranql_score': self.indexing.tranql_min_score, + }, + ontology_greenlist=self.annotation.ontology_greenlist, + node_to_element_queries=self.indexing.node_to_element_queries, + ) + + @property + def dict(self): + output = {} + for key, value in self.__dict__.items(): + if hasattr(value, '__dict__'): + output[key] = value.__dict__ + else: + output[key] = value + return output + + @classmethod + def factory(cls, file_path: str): + file_path = Path(file_path).resolve() + with file_path.open() as config_file: + file_data = yaml.load(config_file, Loader=yaml.FullLoader) + + override_data = cls.get_override_data(cls.OS_VAR_PREFIX) + + combined_data = cls.merge_dicts(file_data, override_data) + + return RogerConfig(**combined_data) + + @staticmethod + def merge_dicts(dict_a, dict_b): + flat_a = flatten(dict_a, reducer='dot') + flat_b = flatten(dict_b, reducer='dot') + flat_a.update(flat_b) + 
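        # Values from dict_b win on duplicate (dotted) keys; the merged flat dict is then re-nested.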
return unflatten(flat_a, 'dot') + + @staticmethod + def get_override_data(prefix): + override_data = {} + os_var_keys = os.environ.keys() + keys_of_interest = filter(lambda x: x.startswith(prefix), os_var_keys) + for key in keys_of_interest: + value = os.environ.get(key) + var_name = key.replace(prefix, "", 1) + var_name = var_name.lstrip("_") + var_name = var_name.replace("__", "~") + var_name = var_name.replace("_", ".") + var_name = var_name.replace("~", "_") + var_name = var_name.lower() + override_data[var_name] = value + return unflatten(override_data, 'dot') + + +class Config: + """ + Singleton config wrapper + """ + __instance__: Optional[Dict] = None + os_var_prefix = "ROGERENV_" + + def __init__(self, file_name: str): + if not Config.__instance__: + Config.__instance__ = Config.read_config_file(file_name=file_name) + os_var_keys = os.environ.keys() + keys_of_interest = [x for x in os_var_keys if x.startswith(Config.os_var_prefix)] + for key in keys_of_interest: + new_key = key.replace(Config.os_var_prefix, "") + value = os.environ[key] + new_dict = Config.os_var_to_dict(new_key, value) + try: + Config.update(new_dict) + except ValueError as e: + warnings.warn(f"{e} encountered trying to assign string from " + f"OS variable `{key}` to a dictionary object." + f"Please specify inner keys.") + + @staticmethod + def os_var_to_dict(var_name, value): + var_name = var_name.replace("__", "~") + var_name = var_name.replace("_", ".") + var_name = var_name.replace("~", "_") + var_name = var_name.lower() + m = {var_name: value} + result = unflatten(m, "dot") + return result + + @staticmethod + def read_config_file(file_name: str): + return yaml.load(open(file_name), Loader=yaml.FullLoader) + + def __getattr__(self, item): + """ + Proxies calls to instance dict. + Note: dict.update is overridden to do partial updates. + Refer to Config.update method. + :param item: method called + :return: proxied method + """ + if item == 'update': + # overrides default dict update method + return self.update + return getattr(Config.__instance__, item) + + def __getitem__(self, item): + """ + Makes config object subscriptable + :param item: key to lookup + :return: value stored in key + """ + return self.__instance__.get(item) + + @staticmethod + def update(new_value: Dict): + """ + Updates dictionary partially. 
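+        Only the leaf keys present in new_value are overwritten; every other key keeps its existing value.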
+ Given a config {'name': {'first': 'name', 'last': 'name'}} + and a partial update {'name': {'first': 'new name'} } + result would be {'name': {'first': 'new name', 'last': 'name'}} + :param new_value: parts to update + :return: updated dict + """ + config_flat = flatten(Config.__instance__) + new_value_flat = flatten(new_value) + config_flat.update(new_value_flat) + Config.__instance__ = unflatten(config_flat) + return Config.__instance__ + + def __str__(self): + flat = flatten(Config.__instance__) + for k in flat: + if 'PASSWORD' in k or 'password' in k or 'key' in k.lower(): + flat[k] = '******' + flat = unflatten(flat) + result = json.dumps(flat) + return f"""{result}""" + + +def get_default_config(file_name: str = CONFIG_FILENAME) -> RogerConfig: + """ + Get config as a dictionary + + Parameters + ---------- + file_name: str + The filename with all the configuration + + Returns + ------- + dict + A dictionary containing all the entries from the config YAML + + """ + config_instance = RogerConfig.factory(file_name) + return config_instance + + +config: RogerConfig = get_default_config() diff --git a/dags/roger/config/_base.py b/dags/roger/config/_base.py new file mode 100644 index 00000000..77309666 --- /dev/null +++ b/dags/roger/config/_base.py @@ -0,0 +1,11 @@ +class DictLike: + def __getitem__(self, item): + if not hasattr(self, item): + raise KeyError(item) + return getattr(self, item) + + def __setitem__(self, key, value): + setattr(self, key, value) + + def get(self, key, default=None): + return getattr(self, key, default) \ No newline at end of file diff --git a/dags/roger/config/config.yaml b/dags/roger/config/config.yaml new file mode 100644 index 00000000..c407555f --- /dev/null +++ b/dags/roger/config/config.yaml @@ -0,0 +1,178 @@ +redisgraph: + username: "" + password: "weak" + host: localhost + graph: test + port: 6379 + +logging: + level: DEBUG + format: '[%(name)s][%(filename)s][%(lineno)d][%(funcName)20s] %(levelname)s: %(message)s' + +data_root: roger/data + +kgx_base_data_uri: https://stars.renci.org/var/kgx_data/ +annotation_base_data_uri: https://stars.renci.org/var/dug/ + +kgx: + biolink_model_version: v3.1.2 + merge_db_temp_dir: workspace + data_sets: + - baseline-graph:v5.0 + +dug_inputs: + data_source: s3 + data_sets: + - topmed:v1.0 + - bdc:v1.0 + - anvil:v1.0 + +#https://github.com/RedisGraph/redisgraph-bulk-loader/blob/master/redisgraph_bulk_loader/bulk_insert.py#L43 +bulk_loader: + separator: 0x1E + enforce_schema: False + skip_invalid_nodes: False + skip_invalid_edges: False + quote: 0 + max_token_count: 1024 + max_buffer_size: 2048 + max_token_size: 500 + index: [] + full_text_index: [] + +annotation: + clear_http_cache: false + annotator_type: sapbert + annotator_args: + monarch: + url: "https://api.monarchinitiative.org/api/nlp/annotate/entities?min_length=4&longest_only=false&include_abbreviation=false&include_acronym=false&include_numbers=false&content=" + sapbert: + classification_url: "https://med-nemo.apps.renci.org/annotate/" + annotator_url: "https://sap-qdrant.apps.renci.org/annotate/" + score_threshold: 0.8 + bagel: + enabled: false + url: "http://localhost:9099/group_synonyms_openai" + prompt: "bagel/ask_classes" + llm_args: + llm_model_name: "gpt-4o-2024-05-13" + organization: + access_key: + llm_model_args: + top_p: 0 + temperature: 0.1 + normalizer: "https://nodenormalization-dev.apps.renci.org/get_normalized_nodes?conflate=false&description=true&curie=" + synonym_service: "https://name-resolution-sri.renci.org/reverse_lookup" + 
ontology_metadata: "https://api.monarchinitiative.org/api/bioentity/" + + preprocessor: + debreviator: + BMI: "body mass index" + stopwords: "the" + ontology_greenlist: ["PATO", "CHEBI", "MONDO", "UBERON", "HP", "MESH", "UMLS"] + +indexing: + # colon seperated mappings list by comma + # eg : dbgap:Non-HEAL Studies,bacpac:HEAL Research Programs + element_mapping: "" + variables_index: "variables_index" + concepts_index: "concepts_index" + kg_index: "kg_index" + tranql_min_score: 0.2 + excluded_identifiers: + - "CHEBI:17336" + queries: + "disease": ["disease", "phenotypic_feature"] + "pheno": ["phenotypic_feature", "disease"] + "anat": ["disease", "anatomical_entity"] + "chem_to_disease": ["chemical_entity", "disease"] + "small_molecule_to_disease": ["small_molecule", "disease"] + "chemical_mixture_to_disease": ["chemical_mixture", "disease"] + "phen_to_anat": ["phenotypic_feature", "anatomical_entity"] + tranql_endpoint: "http://tranql-service/tranql/query?dynamic_id_resolution=true&asynchronous=false" + node_to_element_queries: + enabled: false + cde: + node_type: biolink:Publication + curie_prefix: "HEALCDE" + list_field_choose_first: + - "files" + attribute_mapping: + name: "name" + desc: "summary" + collection_name: "cde_category" + collection_id: "cde_category" + action: "files" + +elasticsearch: + host: localhost + username: elastic + password: "12345" + nboost_host: "" + scheme: "http" + ca_path: "" + +validation: + queries: + count_nodes: + name: "Count Nodes" + query: "MATCH (a) RETURN COUNT(a)" + count_edges: + name: "Count Edges" + query: "MATCH (a)-[e]-(b) RETURN COUNT(e)" + connectivity: + name: TOPMED Connectivity + query: "MATCH (a { id : '$var' })--(b) RETURN a.category, b.id" + args: + - var: TOPMED.TAG:8 + - var: TOPMED.VAR:phv00000484.v1.p10 + - var: TOPMED.VAR:phv00000487.v1.p10 + - var: TOPMED.VAR:phv00000496.v1.p10 + - var: TOPMED.VAR:phv00000517.v1.p10 + - var: TOPMED.VAR:phv00000518.v1.p10 + - var: TOPMED.VAR:phv00000528.v1.p10 + - var: TOPMED.VAR:phv00000529.v1.p10 + - var: TOPMED.VAR:phv00000530.v1.p10 + - var: TOPMED.VAR:phv00000531.v1.p10 + count_connected_nodes: + name: Count Connected Nodes + query: "MATCH (a)-[e]-(b) RETURN count(a), count(b)" + query_by_type: + name: Query by Type + query: "MATCH (a:gene)-[e]-(b) WHERE 'chemical_substance' IN b.category RETURN count(distinct(a)), count(distinct(b))" + smiles_values: + name: Query Chemicals with smiles that look like arrays + query: "Match (a: chemical_substance { simple_smiles: '$var' }) RETURN a.id" + args: + - var: "[Os+6]" + - var: "[SiH2]" + - var: "[CH]" + - var: "[S-2]" + - var: "[Ti+4]" + - var: "[P-3]" + - var: "[Ca+2]" + - var: "[Au+3]" + - var: "[TeH2]" + - var: "[Pb]" + - var: "[B+]" + - var: "[AsH]" + - var: "[O-][I+2]([O-])[O-]" + - var: "[He+]" + - var: "[Mo+6]" + - var: "[N-]=[N+]=[N-]" + - var: "[Ag+]" + - var: "[Zn+2]" + - var: "[C-]#[O+]" +s3: + host: "" + bucket: "" + access_key: "" + secret_key: "" + +lakefs_config: + enabled: false + access_key_id: "" + secret_access_key: "" + host: "" + branch: "" + repo: "" diff --git a/dags/roger/config/dev-config.yaml b/dags/roger/config/dev-config.yaml new file mode 100644 index 00000000..bece11a8 --- /dev/null +++ b/dags/roger/config/dev-config.yaml @@ -0,0 +1,118 @@ +redisgraph: + username: "" + password: "" + host: redis + graph: test + port: 6379 + +logging: + level: DEBUG + format: '[%(name)s][%(filename)s][%(funcName)20s] %(levelname)s: %(message)s' + +data_root: "/Users/schreepc/Projects/helxplatform/roger/roger/test/data" +dug_data_root: 
dug_helpers/dug_data/topmed_data +base_data_uri: https://stars.renci.org/var/kgx_data/trapi-1.0/ +kgx: + biolink_model_version: test + +#https://github.com/RedisGraph/redisgraph-bulk-loader/blob/master/redisgraph_bulk_loader/bulk_insert.py#L43 +bulk_loader: + separator: 0x1E + enforce_schema: False + skip_invalid_nodes: False + skip_invalid_edges: False + quote: 0 + max_token_count: 1024 + max_buffer_size: 2048 + max_token_size: 500 + index: [] + full_text_index: [] + +annotation: + annotator: "https://api.monarchinitiative.org/api/nlp/annotate/entities?min_length=4&longest_only=false&include_abbreviation=false&include_acronym=false&include_numbers=false&content=" + normalizer: "https://nodenormalization-sri.renci.org/get_normalized_nodes?curie=" + synonym_service: "https://onto.renci.org/synonyms/" + ontology_metadata: "https://api.monarchinitiative.org/api/ontology/term/" + # The following are neo4j params that would not be used + # need to remove them from annotator constructor. + db_url: "" + username: "" + password: "" + +indexing: + variables_index: "variables_index" + concepts_index: "concepts_index" + kg_index: "kg_index" + tranql_min_score: 0.2 + excluded_identifiers: + - "CHEBI:17336" + queries: + "disease": ["disease", "phenotypic_feature"] + "pheno": ["phenotypic_feature", "disease"] + "anat": ["disease", "anatomical_entity"] + "chem_to_disease": ["chemical_substance", "disease"] + "phen_to_anat": ["phenotypic_feature", "anatomical_entity"] + "anat_to_disease": ["anatomical_entity", "disease"] + "anat_to_pheno": ["anatomical_entity", "phenotypic_feature"] + tranql_endpoint: "http://tranql-service/tranql/query?dynamic_id_resolution=true&asynchronous=false" + +elasticsearch: + host: elasticsearch + username: elastic + # temporary + password: "13431" + nboost_host: "" + + + +validation: + queries: + count_nodes: + name: "Count Nodes" + query: "MATCH (a) RETURN COUNT(a)" + count_edges: + name: "Count Edges" + query: "MATCH (a)-[e]-(b) RETURN COUNT(e)" + connectivity: + name: TOPMED Connectivity + query: "MATCH (a { id : '$var' })--(b) RETURN a.category, b.id" + args: + - var: TOPMED.TAG:8 + - var: TOPMED.VAR:phv00000484.v1.p10 + - var: TOPMED.VAR:phv00000487.v1.p10 + - var: TOPMED.VAR:phv00000496.v1.p10 + - var: TOPMED.VAR:phv00000517.v1.p10 + - var: TOPMED.VAR:phv00000518.v1.p10 + - var: TOPMED.VAR:phv00000528.v1.p10 + - var: TOPMED.VAR:phv00000529.v1.p10 + - var: TOPMED.VAR:phv00000530.v1.p10 + - var: TOPMED.VAR:phv00000531.v1.p10 + count_connected_nodes: + name: Count Connected Nodes + query: "MATCH (a)-[e]-(b) RETURN count(a), count(b)" + query_by_type: + name: Query by Type + query: "MATCH (a:gene)-[e]-(b) WHERE 'chemical_substance' IN b.category RETURN count(distinct(a)), count(distinct(b))" + smiles_values: + name: Query Chemicals with smiles that look like arrays + query: "Match (a: chemical_substance { simple_smiles: '$var' }) RETURN a.id" + args: + - var: "[Os+6]" + - var: "[SiH2]" + - var: "[CH]" + - var: "[S-2]" + - var: "[Ti+4]" + - var: "[P-3]" + - var: "[Ca+2]" + - var: "[Au+3]" + - var: "[TeH2]" + - var: "[Pb]" + - var: "[B+]" + - var: "[AsH]" + - var: "[O-][I+2]([O-])[O-]" + - var: "[He+]" + - var: "[Mo+6]" + - var: "[N-]=[N+]=[N-]" + - var: "[Ag+]" + - var: "[Zn+2]" + - var: "[C-]#[O+]" diff --git a/dags/roger/config/s3_config.py b/dags/roger/config/s3_config.py new file mode 100644 index 00000000..41fcccab --- /dev/null +++ b/dags/roger/config/s3_config.py @@ -0,0 +1,11 @@ +from dataclasses import dataclass + +from ._base import DictLike + + +@dataclass +class 
S3Config(DictLike): + host: str = "" + bucket: str = "" + access_key: str = "" + secret_key: str = "" \ No newline at end of file diff --git a/dags/roger/core/__init__.py b/dags/roger/core/__init__.py new file mode 100644 index 00000000..5a5c5e90 --- /dev/null +++ b/dags/roger/core/__init__.py @@ -0,0 +1,4 @@ +"Core roger modules, now broken out into a submodule" + +from roger.core.enums import SchemaType, FileFormat +from roger.core.bulkload import BulkLoad diff --git a/dags/roger/core/base.py b/dags/roger/core/base.py new file mode 100644 index 00000000..7ba9409a --- /dev/null +++ b/dags/roger/core/base.py @@ -0,0 +1,217 @@ +"Core Roger object and utilities" + +import argparse +import sys +from io import StringIO +import logging + +from roger.config import get_default_config as get_config +from roger.logger import get_logger +from roger.core.bulkload import BulkLoad +from roger.models.kgx import KGXModel +from roger.models.biolink import BiolinkModel + +log = get_logger() + +class Roger: + """ Consolidate Roger functionality for a cleaner interface. """ + + def __init__(self, to_string=False, config=None): + """ Initialize. + :param to_string: Log to str, available as self.log_stream.getvalue() + after execution completes. + """ + self.has_string_handler = to_string + if not config: + config = get_config() + self.config = config + if to_string: + # Add a stream handler to enable to_string. + self.log_stream = StringIO() + self.string_handler = logging.StreamHandler (self.log_stream) + log.addHandler (self.string_handler) + log.debug("config is %s", config.kgx.biolink_model_version) + self.biolink = BiolinkModel (config.kgx.biolink_model_version) + self.kgx = KGXModel (self.biolink, config=config) + self.bulk = BulkLoad (self.biolink, config=config) + + def __enter__(self): + """ Implement Python's Context Manager interface. """ + return self + + def __exit__(self, exception_type, exception_value, traceback): + """ Implement Python's Context Manager interface. We use this finalizer + to detach the stream handler appended in the constructor. + :param exception_type: Type of exception, if one occurred. + :param exception_value: The exception, if one occurred. + :param traceback: The stack trace explaining the exception. + """ + if exception_type or exception_value or traceback: + log.error (msg="Error:", + exc_info=(exception_type, exception_value, traceback)) + if self.has_string_handler: + log.removeHandler (self.string_handler) + +# interfaces abstracting Roger's inner workings to make it easier to +# incorporate into external tools like workflow engines. 
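+# A minimal, hypothetical usage sketch of this pattern (assumes `config` is a
+# RogerConfig obtained from get_config()); log output can be captured because
+# to_string=True attaches a StringIO handler:
+#
+#     with Roger(to_string=True, config=config) as roger:
+#         roger.kgx.get(dataset_version="v1.0")
+#         captured_log = roger.log_stream.getvalue()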
+ +def get_kgx (to_string=False, config=None): + "get KGX dataset" + output = None + log.debug("Getting KGX method called.") + with Roger (to_string, config=config) as roger: + dataset_version=config.get('kgx', {}).get('dataset_version') + log.debug("dataset_version is %s", dataset_version) + roger.kgx.get(dataset_version=dataset_version) + output = roger.log_stream.getvalue() if to_string else None + return output + +def create_schema(to_string=False, config=None): + "Create noders and edges schemata" + o1 = create_nodes_schema(to_string=to_string, config=config) + o2 = create_edges_schema(to_string=to_string, config=config) + output = (o1 + o2 ) if to_string else None + return output + +def create_edges_schema(to_string=False, config=None, input_data_path=None, output_data_path=None): + "Create edges schema on KGX object" + output = None + with Roger(to_string, config=config) as roger: + roger.kgx.create_edges_schema( + input_data_path=input_data_path, + output_data_path=output_data_path + ) + output = roger.log_stream.getvalue() if to_string else None + return output + +def create_nodes_schema(to_string=False, config=None, input_data_path=None, output_data_path=None): + "Create nodes schema on KGX object" + output = None + with Roger(to_string, config=config) as roger: + roger.kgx.create_nodes_schema(input_data_path=input_data_path, + output_data_path=output_data_path) + output = roger.log_stream.getvalue() if to_string else None + return output + +def merge_nodes(to_string=False, config=None, input_data_path=None, output_data_path=None): + "Run KGX merge" + output = None + with Roger (to_string, config=config) as roger: + roger.kgx.merge(input_path=input_data_path, output_path=output_data_path) + output = roger.log_stream.getvalue () if to_string else None + return output + +def create_bulk_load(to_string=False, config=None, input_data_path=None, output_data_path=None): + "Generate bulk load files" + o1 = create_bulk_nodes(to_string=to_string, config=config) + o2 = create_bulk_edges(to_string=to_string, config=config) + output = (o1 + o2) if to_string else None + return output + +def create_bulk_nodes(to_string=False, config=None, input_data_path=None, output_data_path=None): + "Generate bulk node CSV file" + output = None + with Roger(to_string, config=config) as roger: + log.info("input path: %s", input_data_path) + log.info("output path: %s", output_data_path) + roger.bulk.create_nodes_csv_file(input_data_path, output_data_path) + output = roger.log_stream.getvalue() if to_string else None + return output + +def create_bulk_edges(to_string=False, config=None, input_data_path=None, output_data_path=None): + "Create bulk edges CSV file" + output = None + with Roger(to_string, config=config) as roger: + roger.bulk.create_edges_csv_file(input_data_path, output_data_path) + output = roger.log_stream.getvalue() if to_string else None + return output + +def bulk_load(to_string=False, config=None, input_data_path=None, output_data_path=None): + "Run bulk load insert process" + output = None + with Roger (to_string, config=config) as roger: + roger.bulk.insert(input_data_path=input_data_path) + output = roger.log_stream.getvalue () if to_string else None + return output + +def validate (to_string=False, config=None, input_data_path=None, output_data_path=None): + "Run bulk validate process" + output = None + with Roger (to_string, config=config) as roger: + roger.bulk.validate() + output = roger.log_stream.getvalue () if to_string else None + return output + +def 
check_tranql(to_string=False, config=None, input_data_path=None, output_data_path=None): + "Tranql server smoke check" + output = None + with Roger(to_string, config=config) as roger: + roger.bulk.wait_for_tranql() + output = roger.log_stream.getvalue() if to_string else None + return output + +def roger_cli(): + " Roger CLI. " + parser = argparse.ArgumentParser(description='Roger') + parser.add_argument('-v', + '--dataset-version', + help="Dataset version.", + default="v1.0") + parser.add_argument('-d', + '--data-root', + help="Root of data hierarchy", + default=None) + parser.add_argument('-g', + '--get-kgx', + help="Get KGX objects", + action='store_true') + parser.add_argument('-l', + '--load-kgx', + help="Load via KGX", + action='store_true') + parser.add_argument('-s', + '--create-schema', + help="Infer schema", + action='store_true') + parser.add_argument('-m', + '--merge-kgx', + help="Merge KGX nodes", + action='store_true') + parser.add_argument('-b', + '--create-bulk', + help="Create bulk load", + action='store_true') + parser.add_argument('-i', + '--insert', + help="Do the bulk insert", + action='store_true') + parser.add_argument('-a', + '--validate', + help="Validate the insert", + action='store_true') + args = parser.parse_args () + + biolink = BiolinkModel () + kgx = KGXModel (biolink) + bulk = BulkLoad (biolink) + if args.data_root is not None: + config = get_config() + data_root = args.data_root + config.update({'data_root': data_root}) + log.info("data root: %s", data_root) + if args.get_kgx: + kgx.get (dataset_version=args.dataset_version) + if args.load_kgx: + kgx.load () + if args.merge_kgx: + kgx.merge () + if args.create_schema: + kgx.create_schema () + if args.create_bulk: + bulk.create () + if args.insert: + bulk.insert () + if args.validate: + bulk.validate () + + sys.exit (0) diff --git a/dags/roger/core/bulkload.py b/dags/roger/core/bulkload.py new file mode 100644 index 00000000..a8ddbc15 --- /dev/null +++ b/dags/roger/core/bulkload.py @@ -0,0 +1,432 @@ +"Bulk loader for Roger" + +import os +import glob +import shutil +from collections import defaultdict +from functools import reduce +from string import Template +import time + +import requests +import redis +from falkordb_bulk_loader.bulk_insert import bulk_insert + +from roger.config import get_default_config as get_config +from roger.logger import get_logger +from roger.core.redis_graph import RedisGraph +from roger.core.enums import SchemaType +from roger.models.biolink import BiolinkModel +from roger.components.data_conversion import cast +from roger.core import storage + +log = get_logger() + +class BulkLoad: + """ Tools for creating a Redisgraph bulk load dataset. """ + def __init__(self, biolink, config=None): + self.biolink = biolink + if not config: + config = get_config() + self.config = config + separator = self.config.get('bulk_loader',{}).get('separator', '|') + self.separator =(chr(separator) if isinstance(separator, int) + else separator) + + def create (self): + """Used in the CLI on args.create_bulk""" + self.create_nodes_csv_file() + self.create_edges_csv_file() + + def create_nodes_csv_file(self, input_data_path=None, output_data_path=None): + # clear out previous data + bulk_path = storage.bulk_path("nodes", output_data_path) + if os.path.exists(bulk_path): + shutil.rmtree(bulk_path) + categories_schema = storage.read_schema (SchemaType.CATEGORY, input_data_path) + state = defaultdict(lambda: None) + log.info(f"processing nodes") + """ Write node data for bulk load. 
""" + + categories = defaultdict(lambda: []) + category_error_nodes = set() + merged_nodes_file = storage.merged_objects('nodes', input_data_path) + counter = 1 + for node in storage.json_line_iter(merged_nodes_file): + if not node.get('category'): + category_error_nodes.add(node['id']) + node['category'] = [BiolinkModel.root_type] + index = self.biolink.get_leaf_class(node['category']) + categories[index].append(node) + if category_error_nodes: + log.error( + f"some nodes didn't have category assigned. " + f"KGX file has errors. " + f"Nodes {len(category_error_nodes)}. " + f"They will be typed {BiolinkModel.root_type}. " + f"Showing first 10: {list(category_error_nodes)[:10]}.") + # flush every 100K + if counter % 100_000 == 0: + self.write_bulk(storage.bulk_path("nodes", output_data_path), + categories, categories_schema, + state=state, is_relation=False) + # reset variables. + category_error_nodes = set() + categories = defaultdict(lambda: []) + counter += 1 + # write back if any thing left. + if len(categories): + self.write_bulk(storage.bulk_path("nodes", output_data_path), + categories, categories_schema, + state=state, is_relation=False) + + def create_edges_csv_file(self, input_data_path=None, output_data_path=None): + """ Write predicate data for bulk load. """ + # Clear out previous data + bulk_path = storage.bulk_path("edges", output_data_path) + if os.path.exists(bulk_path): + shutil.rmtree(bulk_path) + predicates_schema = storage.read_schema(SchemaType.PREDICATE, input_data_path) + predicates = defaultdict(lambda: []) + edges_file = storage.merged_objects('edges', input_data_path) + counter = 1 + state = {} + for edge in storage.json_line_iter(edges_file): + predicates[edge['predicate']].append(edge) + # write out every 100K , to avoid large predicate dict. 
+ if counter % 100_000 == 0: + self.write_bulk( + storage.bulk_path("edges", output_data_path),predicates, predicates_schema, + state=state, is_relation=True) + predicates = defaultdict(lambda : []) + counter += 1 + # if there are some items left (if loop ended before counter reached the + # specified value) + if len(predicates): + self.write_bulk(storage.bulk_path("edges", output_data_path), predicates, + predicates_schema,state=state, is_relation=True) + + @staticmethod + def create_redis_schema_header(attributes: dict, is_relation=False): + """Creates col headers for csv to be used by redis bulk loader + + Column headers are generated by assigning redis types + :param attributes: dict of data labels with values as python type strs + :param separator: CSV separator + :return: list of attrs, each item is attributeLabel:redisGraphDataType + """ + redis_type_conversion_map = { + 'str': 'STRING', + 'float': 'FLOAT', # Do we need to handle double + 'int': 'INT', + 'bool': 'BOOL', + 'list': 'ARRAY' + } + col_headers = [] + def format_for_redis(label, typ): + return f'{label}:{typ}' + for attribute, attribute_type in attributes.items(): + col_headers.append(format_for_redis( + attribute, redis_type_conversion_map[attribute_type])) + # Note this two fields are only important to bulk loader + # they will not be members of the graph + # https://github.com/RedisGraph/redisgraph-bulk-loader/tree/master#input-schemas + if is_relation: + col_headers.append('internal_start_id:START_ID') + col_headers.append('internal_end_id:END_ID') + # replace id:STRING with id:ID + col_headers.append('id:ID') + col_headers = list(filter(lambda x: x != 'id:STRING', col_headers)) + return col_headers + + @staticmethod + def group_items_by_attributes_set(objects: list, processed_object_ids: set): + """ Groups items into a dictionary + + The keys the output dictionary are sets of attributes set for all + items accessed in that key. + + Eg.: + { set(id,name,category): [{id:'xx0',name:'bbb', 'category':['type']}.... + {id:'xx1', name:'bb2', category: ['type1']}] } + :param objects: list of nodes or edges + :param processed_object_ids: ids to skip since they are processed. + :return: dictionary grouping based on set attributes + """ + clustered_by_set_values = {} + improper_keys = set() + def value_set_test(val): + "Converted from lambda function, is this just 'if x:'?" + if (val is not None and val != [] and val != ''): + return True + return False + for obj in objects: + # redis bulk loader needs columns not to include ':' + # till backticks are implemented we should avoid these. + def key_filter(key): + # Make sure no colons in key names + return ':' not in key + keys_with_values = frozenset( + [k for k in obj.keys() + if value_set_test(obj[k]) and key_filter(k)]) + for key in [k for k in obj.keys() if obj[k] and not key_filter(k)]: + improper_keys.add(key) + # group by attributes that have values. # Why? + # Redis bulk loader has one issue + # imagine we have: + # + #{'name': 'x'} , {'name': 'y', 'is_metabolite': true} + # + # we have a common schema name:STRING,is_metabolite: + # + # BOOL values `x,` and `y,true` + # + # but x not having value for is_metabolite is not handled well, + # redis bulk loader says we should give it default if we were to + # enforce schema but due to the nature of the data assigning + # defaults is very not an option. hence grouping data into several + # csv's might be the right way (?) 
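+ # Each distinct frozenset of populated keys becomes its own group here;
+ # write_bulk later emits one CSV part file per group, so no column ever
+ # needs a default value.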
+ if obj['id'] not in processed_object_ids: + val_list = clustered_by_set_values.get(keys_with_values, []) + val_list.append(obj) + clustered_by_set_values[keys_with_values] = val_list + return clustered_by_set_values, improper_keys + + def write_bulk(self, bulk_path, obj_map, schema, state={}, + is_relation=False): + """ Write a bulk load group of objects. + :param bulk_path: Path to the bulk loader object to write. + :param obj_map: A map of biolink type to list of objects. + :param schema: The schema (nodes or predicates) containing identifiers. + :param state: Track state of already written objs to avoid duplicates. + """ + + os.makedirs (bulk_path, exist_ok=True) + processed_objects_id = state.get('processed_id', set()) + called_x_times = state.get('called_times', 0) + called_x_times += 1 + for key, objects in obj_map.items (): + if len(objects) == 0: + continue + try: + all_keys = schema[key] + except Exception as e: + log.error(f"{key} not in {schema.keys()} " ) + raise Exception("error") from e + """ Make all objects conform to the schema. """ + clustered_by_set_values, improper_redis_keys = ( + self.group_items_by_attributes_set(objects, + processed_objects_id)) + + if improper_redis_keys: + log.warning( + "The following keys were skipped since they include " + "conflicting `:` that would cause errors while bulk " + "loading to redis. [%s]", str(improper_redis_keys)) + for index, set_attributes in enumerate( + clustered_by_set_values.keys()): + items = clustered_by_set_values[set_attributes] + # When parted files are saved let the file names be collected + # here + state['file_paths'] = state.get('file_paths', {}) + state['file_paths'][key] = state['file_paths'].get(key, {}) + out_file = state['file_paths'][key][set_attributes] = ( + state['file_paths'].get(key, {}).get(set_attributes, '')) + + # When calling write bulk , lets say we have processed some + # chemicals from file 1 and we start processing file 2 if we are + # using just index then we might (rather will) end up adding + # records to the wrong file so we need this to be unique as + # possible by adding called_x_times , if we already found + # out-file from state obj we are sure that the schemas match. + + # biolink: is not valid name so we need to remove : + file_key = key.replace(':', '~') + + out_file = ( + f"{bulk_path}/{file_key}.csv-{index}-{called_x_times}" + if not out_file + else out_file) + # store back file name + state['file_paths'][key][set_attributes] = out_file + new_file = not os.path.exists(out_file) + keys_for_header = {x: all_keys[x] for x in all_keys + if x in set_attributes} + redis_schema_header = self.create_redis_schema_header( + keys_for_header, is_relation) + with open(out_file, "a", encoding='utf-8') as stream: + if new_file: + state['file_paths'][key][set_attributes] = out_file + log.info(f" --creating {out_file}") + stream.write(self.separator.join(redis_schema_header)) + stream.write("\n") + else: + log.info(f" --appending to {out_file}") + + # Write fields, skipping duplicate objects. + for obj in items: + oid = str(obj['id']) + if oid in processed_objects_id: + continue + processed_objects_id.add(oid) + + # Add ID / START_ID / END_ID depending + internal_id_fields = { + 'internal_id': obj['id'] + } + if is_relation: + internal_id_fields.update({ + 'internal_start_id': obj['subject'], + 'internal_end_id': obj['object'] + }) + obj.update(internal_id_fields) + values = [] + + # uses redis schema header to preserve order when + # writing lines out. 
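+ # Hypothetical illustration: with a header of name:STRING|id:ID, the row
+ # for {'id': 'X1', 'name': 'aspirin'} is written as "aspirin<sep>X1";
+ # values follow the header's column order and are cast to the schema type
+ # when the stored value's type does not match.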
+ for column_name in redis_schema_header: + # last key is the type + obj_key = ':'.join(column_name.split(':')[:-1]) + value = obj[obj_key] + + if obj_key not in internal_id_fields: + current_type = type(value).__name__ + expected_type = all_keys[obj_key] + # cast it if it doesn't match type in schema + # keys i.e all_keys + value = ( + cast(obj[obj_key], all_keys[obj_key]) + if expected_type != current_type + else value) + # escape quotes . + values.append(str(value).replace("\"", "\\\"")) + s = self.separator.join(values) + stream.write(s) + stream.write("\n") + state['processed_id'] = processed_objects_id + state['called_times'] = called_x_times + + def insert (self, input_data_path=None): + redisgraph = self.config.redisgraph + nodes = sorted(glob.glob (storage.bulk_path ("**/nodes/**.csv*", input_data_path), recursive=True)) + edges = sorted(glob.glob (storage.bulk_path ("**/edges/**.csv*", input_data_path), recursive=True)) + graph = redisgraph['graph'] + log.info(f"bulk loading \n nodes: {nodes} \n edges: {edges}") + + try: + log.info (f"deleting graph {graph} in preparation for bulk load.") + db = self.get_redisgraph() + db.redis_graph.delete () + except redis.exceptions.ResponseError: + log.info("no graph to delete") + + log.info ("bulk loading graph: %s", str(graph)) + args = [] + if len(nodes) > 0: + bulk_path_root = glob.glob(storage.bulk_path('**/nodes', path=input_data_path), recursive=True)[0] + os.path.sep + nodes_with_type = [] + collect_labels = set() + for x in nodes: + """ + These lines prep nodes bulk load by: + 1) appending to labels 'biolink.' + 2) combine labels to create a multilabel redis node i.e. "biolink.OrganismalEntity:biolink.SubjectOfInvestigation" + """ + file_name_type_part = x.replace(bulk_path_root, '').split('.')[0].split('~')[1] + all_labels = "biolink." + file_name_type_part + ":" + ":".join([f'biolink.{v.lstrip("biolink:")}' for v in self.biolink.toolkit.get_ancestors("biolink:" + file_name_type_part, reflexive=False, formatted=True )] ) + collect_labels.add("biolink." 
+ file_name_type_part) + for v in self.biolink.toolkit.get_ancestors("biolink:" + file_name_type_part, reflexive=False, + formatted=True): + collect_labels.add(f'biolink.{v.lstrip("biolink:")}') + nodes_with_type.append(f"{all_labels} {x}") + args.extend(("-N " + " -N ".join(nodes_with_type)).split()) + if len(edges) > 0: + bulk_path_root = glob.glob(storage.bulk_path('**/edges', path=input_data_path), recursive=True)[0] + os.path.sep + edges_with_type = [f"biolink.{x.replace(bulk_path_root, '').strip(os.path.sep).split('.')[0].split('~')[1]} {x}" + for x in edges] + # Edge label now no longer has 'biolink:' + args.extend(("-R " + " -R ".join(edges_with_type)).split()) + args.extend([f"--separator={self.separator}"]) + args.extend([f"--server-url=redis://:{redisgraph['password']}@{redisgraph['host']}:{redisgraph['port']}"]) + args.extend(['--enforce-schema']) + args.extend(['-e']) + for lbl in collect_labels: + args.extend([f'-i `{lbl}`:id', f'-f {lbl}:name', f'-f {lbl}:synonyms']) + args.extend([f"{redisgraph['graph']}"]) + """ standalone_mode=False tells click not to sys.exit() """ + log.debug(f"Calling bulk_insert with extended args: {args}") + try: + bulk_insert(args, standalone_mode=False) + # self.add_indexes() + except Exception as e: + log.error(f"Unexpected {e.__class__.__name__}: {e}") + raise + + def add_indexes(self): + redis_connection = self.get_redisgraph() + all_labels = redis_connection.query( + "Match (c) return distinct labels(c)").result_set + all_labels = reduce(lambda x, y: x + y, all_labels, []) + id_index_queries = [ + f'CREATE INDEX on :`{label}`(id)' for label in all_labels + ] + name_index_queries = ( + "CALL db.labels() YIELD label " + "CALL db.idx.fulltext.createNodeIndex(label, 'name', 'synonyms')") + + for query in id_index_queries: + redis_connection.query(query=query) + redis_connection.query(query=name_index_queries) + log.info(f"Indexes created for {len(all_labels)} labels.") + + def get_redisgraph(self): + return RedisGraph( + host=self.config.redisgraph.host, + port=self.config.redisgraph.port, + password=self.config.redisgraph.password, + graph=self.config.redisgraph.graph, + ) + + def validate(self): + + db = self.get_redisgraph() + validation_queries = self.config.get( + 'validation', {}).get('queries', []) + for key, query in validation_queries.items (): + text = query['query'] + name = query['name'] + args = query.get('args', [{}]) + for arg in args: + start = storage.current_time_in_millis () + instance = Template (text).safe_substitute (arg) + db.query (instance) + duration = storage.current_time_in_millis () - start + log.info (f"Query {key}:{name} ran in {duration}ms: {instance}") + + def wait_for_tranql(self): + retry_secs = 3 + tranql_endpoint = self.config.indexing.tranql_endpoint + log.info(f"Contacting {tranql_endpoint}") + graph_name = self.config["redisgraph"]["graph"] + test_query = "SELECT disease-> phenotypic_feature " \ + f"FROM 'redis:{graph_name}'" \ + f"WHERE disease='MONDO:0004979'" + is_done_loading = False + try: + while not is_done_loading: + response = requests.post(tranql_endpoint, data=test_query) + response_code = response.status_code + response = response.json() + is_done_loading = "message" in response and response_code == 200 + if is_done_loading: + break + else: + log.info(f"Tranql responsed with response: {response}") + log.info(f"Retrying in {retry_secs} secs...") + time.sleep(retry_secs) + except ConnectionError as e: + # convert exception to be more readable. 
+ raise ConnectionError( + f"Attempting to contact {tranql_endpoint} " + f"failed due to connection error. " + f"Please check status of Tranql server.") from e diff --git a/dags/roger/core/enums.py b/dags/roger/core/enums.py new file mode 100644 index 00000000..b44323af --- /dev/null +++ b/dags/roger/core/enums.py @@ -0,0 +1,15 @@ +"Enums for Roger" + +from enum import Enum + +class SchemaType(Enum): + """ High level semantic metatdata concepts. + Categories are classes in an ontological model like Biolink. + Predicates are links between nodes. """ + CATEGORY = "category" + PREDICATE = "predicate" + +class FileFormat(Enum): + """ File formats this module knows about. """ + JSON = "json" + YAML = "yaml" diff --git a/roger/roger_db.py b/dags/roger/core/redis_graph.py similarity index 87% rename from roger/roger_db.py rename to dags/roger/core/redis_graph.py index 75b94e6b..ca65ddce 100644 --- a/roger/roger_db.py +++ b/dags/roger/core/redis_graph.py @@ -1,18 +1,22 @@ -import logging +import copy + import redis -from redisgraph import Node, Edge, Graph -from redis.exceptions import ResponseError -from roger.roger_util import get_config, get_logger +# from redisgraph import Node, Edge, Graph +# https://redis-py.readthedocs.io/en/v4.5.1/redismodules.html#redisgraph-commands +from redis.commands.graph.node import Node +from redis.commands.graph.edge import Edge + +from roger.logger import get_logger logger = get_logger () class RedisGraph: """ Graph abstraction over RedisGraph. A thin wrapper but provides us some options. """ - def __init__(self, host='localhost', port=6379, graph='default'): + def __init__(self, host='localhost', port=6379, graph='default', password=''): """ Construct a connection to Redis Graph. """ - self.r = redis.Redis(host=host, port=port) - self.redis_graph = Graph(graph, self.r) + self.r = redis.Redis(host=host, port=port, password=password) + self.redis_graph = self.r.graph(graph) def add_node (self, identifier=None, label=None, properties=None): """ Add a node with the given label and properties. """ @@ -57,7 +61,7 @@ def commit (self): def query (self, query): """ Query and return result set. """ result = self.redis_graph.query(query) - result.pretty_print() + print(result) return result def delete (self): @@ -84,4 +88,4 @@ def test (): rg.delete () # rg.query ("""MATCH (a { id : 'chemical_substance' }) RETURN a""") -#test () +#test () \ No newline at end of file diff --git a/dags/roger/core/storage.py b/dags/roger/core/storage.py new file mode 100644 index 00000000..b4869758 --- /dev/null +++ b/dags/roger/core/storage.py @@ -0,0 +1,502 @@ +""" utils for roger + +This is home to the utilities that were formerly in dags/roger/core.py:Util +""" + +import os +import glob +import time +import pathlib +import pickle +import shutil +import yaml +import orjson as json +import requests +from urllib.request import urlretrieve +from pathlib import Path + +from roger.logger import get_logger +from roger.config import get_default_config as get_config +from roger.core import SchemaType + +log = get_logger() +config = get_config() + +data_dir_env_value = os.getenv("ROGER_DATA_DIR") + +if data_dir_env_value is None: + ROGER_DATA_DIR = Path(__file__).parent.resolve() / 'data' +else: + ROGER_DATA_DIR = Path(data_dir_env_value) + + +def current_time_in_millis(): + """ + Get current time in milliseconds. + + Returns + ------- + int + Time in milliseconds + + """ + return int(round(time.time() * 1000)) + +# A just do it approach to getting data. +def read_file(path): + """ Read a file. 
+ :param path: Path to a file. + """ + text = None + with open(path, "r", encoding='utf-8') as stream: + text = stream.read() + return text + +def read_url(url): + """ Read data from a URL. + :param url: The URL to read. """ + return requests.get(url, timeout=60).text + +def read_data(path): + """ Read data from a URL or File. HTTP(S) is the only supported protocol. + :param path: A URL or file path. """ + text = None + if is_web(path): + text = read_url(path) + else: + text = read_file(path) + return text + +def read_object(path, key=None): + """ Read on object from a path. + :param path: A URL or file path. + Supports YAML and JSON depending on extension. + :param key: A configuration key. This is prepended to the path if present. + :raises ValueError: If the key is not in the configuration. """ + if key is not None: + prefix = config[key] + path = f"{prefix}/{path}" if is_web(prefix) \ + else os.path.join (prefix, path) + obj = None + if path.endswith(".yaml") or path.endswith (".yml"): + obj = yaml.safe_load (read_data (path)) + elif path.endswith(".json"): + obj = json.loads (read_data (path)) + elif path.endswith(".pickle"): + with open(file=path, mode="rb") as stream: + obj = pickle.load(stream) + elif path.endswith(".jsonl") or path.endswith('.txt'): + obj = read_data(path) + return obj + +def is_web (uri): + """ The URI is a web URI (starts with http or https). + :param uri: A URI """ + return uri.startswith("http://") or uri.startswith ("https://") + +def write_object (obj, path, key=None): + """ Write an object to a path. YAML and JSON supported based on extension. + :param obj: The object to write. + :param path: The path to write to. + :param key: The configuration key to prepend to the path. + """ + # Prepend a prefix from the configuration file if a key is given. + if key is not None: + prefix = config[key] + path = (f"{prefix}/{path}" if is_web(prefix) + else os.path.join (prefix, path)) + + # Ensure the directory to be written to exists. + dirname = os.path.dirname (path) + if not os.path.exists (dirname): + os.makedirs (dirname, exist_ok=True) + + # Write the file in the specified format. + if path.endswith (".yaml") or path.endswith (".yml"): + with open(path, 'w') as outfile: + yaml.dump (obj, outfile) + elif path.endswith (".json"): + with open (path, "w", encoding='utf-8') as stream: + stream.write(str(json.dumps (obj, option=json.OPT_INDENT_2).decode('utf-8'))) + elif path.endswith(".pickle"): + with open (path, "wb") as stream: + pickle.dump(obj, file=stream) + elif path.endswith(".jsonl") or path.endswith('.txt'): + with open (path, "w", encoding="utf-8") as stream: + stream.write(obj) + else: + # Raise an exception if invalid. + raise ValueError (f"Unrecognized extension: {path}") + +def mkdir(path, is_dir=False): + directory = os.path.dirname(path) if not is_dir else path + if not os.path.exists(directory): + os.makedirs(directory) + +def remove(path): + if os.path.exists(path): + if os.path.isdir(path): + shutil.rmtree(path) + else: + os.remove(path) + +def clear_dir(path): + remove(path) + mkdir(path, is_dir=True) + +###################### +# Path methods + +def kgx_path(name): + """ Form a KGX object path. + :path name: Name of the KGX object. """ + return str(ROGER_DATA_DIR / "kgx" / name) + +def kgx_objects(format_="json", path=None): + """ A list of KGX objects. 
""" + kgx_pattern = kgx_path(f"**.{format_}") + if path: + kgx_pattern = f"{path}/**/*.{format_}" + return sorted(glob.glob (kgx_pattern, recursive=True)) + +def merge_path(name, path: Path=None): + """ Form a merged KGX object path. + :path name: Name of the merged KGX object. """ + if path is None: + # create output dir + if not os.path.exists(ROGER_DATA_DIR / 'merge'): + os.makedirs(ROGER_DATA_DIR / 'merge') + return str(ROGER_DATA_DIR / 'merge' / name) + if not os.path.exists(path): + os.makedirs(path) + + return str(path.joinpath(name)) + +def merged_objects(file_type, path=None): + """ A list of merged KGX objects. """ + if not path: + merged_pattern = merge_path(f"**/{file_type}.jsonl") + else: + merged_pattern = merge_path(f"**/{file_type}.jsonl", path=path) + # this thing should always return one edges or nodes file (based on file_type) + try: + return sorted(glob.glob(merged_pattern, recursive=True))[0] + except IndexError: + raise ValueError(f"Could not find merged KGX of type {file_type} in {merged_pattern}") + + +def schema_path(name, path=None): + """ Path to a schema object. + :param name: Name of the object to get a path for. """ + if not path: + return str(ROGER_DATA_DIR / 'schema' / name) + return str (path / 'schema' / name) + +def bulk_path(name, path=None): + """ Path to a bulk load object. + :param name: Name of the object. """ + if not path: + return str(ROGER_DATA_DIR / 'bulk' / name) + else: + return str(path / name) + +def metrics_path(name): + """ + Path to write metrics to + :param name: + :return: + """ + return str(ROGER_DATA_DIR / "metrics" / name) + +def dug_kgx_path(name): + return str(ROGER_DATA_DIR / "dug" / "kgx" / name) + +def dug_annotation_path(name): + return str(ROGER_DATA_DIR / "dug" / "annotations" / name) + +def dug_expanded_concepts_path(name): + return str(ROGER_DATA_DIR / 'dug' / 'expanded_concepts' / name) + +def dug_expanded_concept_objects(data_path=None, format="pickle"): + "Return a list of files containing expaneded concept objects" + if data_path: + file_pattern = os.path.join(data_path, '**', f'expanded_concepts.{format}') + else: + file_pattern = dug_expanded_concepts_path( + os.path.join('*',f'expanded_concepts.{format}')) + return sorted(glob.glob(file_pattern, recursive=True)) + +def dug_extracted_elements_objects(data_path=None, format="txt"): + if data_path: + file_pattern = os.path.join(data_path, '**', f'extracted_graph_elements.{format}') + else: + file_pattern = dug_expanded_concepts_path( + os.path.join('*', f'extracted_graph_elements.{format}')) + return sorted(glob.glob(file_pattern, recursive=True)) + +def dug_crawl_path(name): + return str(ROGER_DATA_DIR / 'dug' / 'crawl' / name) + +def dug_kgx_objects(): + """ A list of dug KGX objects. """ + dug_kgx_pattern = dug_kgx_path("**.json") + return sorted(glob.glob(dug_kgx_pattern)) + +def dug_concepts_objects(data_path, format="pickle"): + """ A list of dug annotation Objects. """ + if not data_path: + concepts_file_path = dug_annotation_path( + os.path.join('*',f'concepts.{format}')) + else: + concepts_file_path = os.path.join( + data_path, '**', f'concepts.{format}') + return sorted(glob.glob(concepts_file_path, recursive=True)) + +def dug_elements_objects(data_path=None, format='pickle'): + """ A list of dug annotation Objects. 
""" + if not data_path: + concepts_file_pattern = dug_annotation_path( + os.path.join('*', f'elements.{format}')) + else: + concepts_file_pattern = os.path.join( + data_path, '**', f'elements.{format}') + return sorted(glob.glob(concepts_file_pattern, recursive=True)) + +def dug_input_files_path(name) -> pathlib.Path: + path = ROGER_DATA_DIR / "dug" / "input_files" / name + if not path.exists(): + log.info(f"Input file path: {path} does not exist, creating") + path.mkdir(parents=True, exist_ok=True) + else: + log.info(f"Input file path: {path} already exists") + return path + +def dug_topmed_objects(input_data_path=None): + "Return list of TOPMed source files" + if not input_data_path: + input_data_path = str(dug_input_files_path('topmed')) + topmed_file_pattern = os.path.join(input_data_path, "topmed_*.csv") + return sorted(glob.glob(topmed_file_pattern)) + +def dug_anvil_path(): + """Anvil source files""" + return dug_input_files_path('anvil') + +def dug_sprint_path(): + """Anvil source files""" + return dug_input_files_path('sprint') + +def dug_bacpac_path(): + """Anvil source files""" + return dug_input_files_path('bacpac') + +def dug_heal_mds_path(): + """HEAL MDS source files""" + return dug_input_files_path('heal-mds-imports') + +def dug_heal_research_program_path(): + """HEAL research programs source files""" + return dug_input_files_path('heal-research-programs') + +def dug_heal_study_path(): + """HEAL study source files""" + return dug_input_files_path('heal-study-imports') + +def dug_crdc_path(): + """Anvil source files""" + return dug_input_files_path('crdc') + +def dug_kfdrc_path(): + """Anvil source files""" + return dug_input_files_path('kfdrc') + +def dug_nida_objects(input_data_path=None): + "Return list of NIDA source files" + if not input_data_path: + input_data_path = str(dug_input_files_path('nida')) + nida_file_pattern = os.path.join(input_data_path, "NIDA-*.xml") + return sorted(glob.glob(nida_file_pattern)) + +def dug_sparc_objects(input_data_path=None): + if not input_data_path: + input_data_path = str(dug_input_files_path('sparc')) + file_pattern = os.path.join(input_data_path, "scicrunch/*.xml") + return sorted(glob.glob(file_pattern)) + +def dug_anvil_objects(input_data_path=None): + if not input_data_path: + input_data_path = dug_anvil_path() + files = get_files_recursive( + lambda file_name: ( + not file_name.startswith('GapExchange_') + and file_name.endswith('.xml')), + input_data_path) + return sorted([str(f) for f in files]) + +def dug_sprint_objects(input_data_path=None): + if not input_data_path: + input_data_path = dug_sprint_path() + files = get_files_recursive( + lambda file_name: file_name.endswith('.xml'), input_data_path) + return sorted([str(f) for f in files]) + +def dug_bacpac_objects(input_data_path=None): + "Return list of BACPAC source files" + if not input_data_path: + input_data_path = dug_bacpac_path() + files = get_files_recursive( + lambda file_name: file_name.endswith('.xml'), input_data_path) + return sorted([str(f) for f in files]) + +def dug_crdc_objects(input_data_path=None): + if not input_data_path: + input_data_path = dug_crdc_path() + files = get_files_recursive( + lambda file_name: ( + not file_name.startswith('GapExchange_') + and file_name.endswith('.xml')), + input_data_path) + return sorted([str(f) for f in files]) + +def dug_heal_study_objects(input_data_path=None): + "Return list of HEAL study source files" + if not input_data_path: + input_data_path = dug_heal_study_path() + files = get_files_recursive(lambda file_name : 
file_name.endswith('.xml'), + input_data_path) + return sorted([str(f) for f in files]) + +def dug_heal_research_program_objects(input_data_path=None): + "Return list of HEAL research program source files" + if not input_data_path: + input_data_path = dug_heal_research_program_path() + files = get_files_recursive(lambda file_name : file_name.endswith('.xml'), + input_data_path) + return sorted([str(f) for f in files]) + +def dug_kfdrc_objects(input_data_path=None): + if not input_data_path: + input_data_path = dug_kfdrc_path() + files = get_files_recursive( + lambda file_name: ( + not file_name.startswith('GapExchange_') + and file_name.endswith('.xml')), + input_data_path) + return sorted([str(f) for f in files]) + + +def dug_dd_xml_path(): + """ Topmed source files""" + return dug_input_files_path('db_gap') + +def get_files_recursive(file_name_filter, current_dir): + file_paths = [] + for child in current_dir.iterdir(): + if child.is_dir(): + file_paths += get_files_recursive(file_name_filter, child) + continue + if not file_name_filter(child.name): + continue + else: + file_paths += [child] + return file_paths + +def dug_dd_xml_objects(input_data_path=None): + if not input_data_path: + input_data_path = dug_dd_xml_path() + files = get_files_recursive( + lambda file_name: ( + not file_name.startswith('._') + and file_name.endswith('.xml')), + input_data_path) + return sorted([str(f) for f in files]) + +def copy_file_to_dir(file_location, dir_name): + return shutil.copy(file_location, dir_name) + +def read_schema (schema_type: SchemaType, path=None): + """ Read a schema object. + :param schema_type: Schema type of the object to read. """ + if path is not None: + path = path / '**' + location = glob.glob(schema_path (f"{schema_type.value}-schema.json", path=path), recursive=True)[0] + return read_object (location) + +def get_uri (path, key): + """ Build a URI. + :param path: The path of an object. + :param key: The key of a configuration value to prepend to the object. """ + # Incase config has http://..../ or http://... remove / and add back to + # avoid double http://...// + root_url = config[key].rstrip('/') + return f"{root_url}/{path}" + +def get_relative_path (path): + return os.path.join (os.path.dirname (__file__), path) + +def read_relative_object (path): + return read_object (get_relative_path(path)) + +def trunc(text, limit): + return ('..' 
+ text[-limit-2:]) if len(text) > limit else text + + + +def json_line_iter(jsonl_file_path): + f = open(file=jsonl_file_path, mode='r', encoding='utf-8') + for line in f: + yield json.loads(line) + f.close() + +def jsonl_iter(file_name): + # iterating over jsonl files + with open(file_name) as stream: + for line in stream: + # yield on line at time + yield json.loads(line) + +def json_iter(json_file,entity_key): + with open(json_file) as stream: + data = json.loads(stream.read()) + return data[entity_key] + +def downloadfile(thread_num, inputq, doneq): + url = "" + t0 = 0 + pct = 0 + + def downloadprogress(blocknumber, readsize, totalfilesize): + nonlocal thread_num + nonlocal url, t0, pct + blocks_expected = ( + int(totalfilesize/readsize) + + (1 if totalfilesize%readsize != 0 else 0)) + t1 = int(current_time_in_millis()/1000) + elapsed_delta = t1 - t0 + pct = int(100 * blocknumber / blocks_expected) + if elapsed_delta >= 30: # every n seconds + log.info(f"thread-{thread_num} {pct}% of size:{totalfilesize} " + f"({blocknumber}/{blocks_expected}) url:{url}") + t0 = t1 + + num_files_processed = 0 + while inputq.empty() is False: + t0 = int(current_time_in_millis()/1000) + url, dst = inputq.get() + num_files_processed += 1 + log.info(f"thread-{thread_num} downloading {url}") + try: + path, httpMessage = urlretrieve( + url, dst, reporthook=downloadprogress) + if pct < 100: + httpMessageKeys = httpMessage.keys() + log.info(f"thread-{thread_num} urlretrieve path:'{path}' " + f"http-keys:{httpMessageKeys} " + f"httpMessage:'{httpMessage.as_string()}") + except Exception as e: + log.error(f"thread-{thread_num} downloadfile excepton: {e}") + continue + log.info(f"thread-{thread_num} downloaded {dst}") + doneq.put((thread_num,num_files_processed)) + log.info(f"thread-{thread_num} done!") + return diff --git a/dags/roger/logger.py b/dags/roger/logger.py new file mode 100644 index 00000000..04e47897 --- /dev/null +++ b/dags/roger/logger.py @@ -0,0 +1,34 @@ +import logging +import sys +from typing import Optional +from roger.config import get_default_config + +logger: Optional[logging.Logger] = None + + +def get_logger(name: str = 'roger') -> logging.Logger: + """ + Get an instance of logger. 
+ + Parameters + ---------- + name: str + The name of logger + + Returns + ------- + logging.Logger + An instance of logging.Logger + + """ + global logger + if logger is None: + config = get_default_config() + logger = logging.getLogger(name) + handler = logging.StreamHandler(sys.stdout) + formatter = logging.Formatter(config['logging']['format']) + handler.setFormatter(formatter) + logger.addHandler(handler) + logger.setLevel(config['logging']['level']) + logger.propagate = True + return logger diff --git a/dags/roger/models/__init__.py b/dags/roger/models/__init__.py new file mode 100644 index 00000000..3a994ff8 --- /dev/null +++ b/dags/roger/models/__init__.py @@ -0,0 +1,4 @@ +"Data models for Roger" + +from roger.models.kgx import KGXModel +from roger.models.biolink import BiolinkModel diff --git a/dags/roger/models/biolink.py b/dags/roger/models/biolink.py new file mode 100644 index 00000000..43102188 --- /dev/null +++ b/dags/roger/models/biolink.py @@ -0,0 +1,48 @@ +"Biolink data model for Roger" + +from bmt import Toolkit +from roger.logger import get_logger + +log = get_logger() + +class BiolinkModel: + "Biolink data model for Roger" + root_type = 'biolink:NamedThing' + + def __init__(self, bl_version='v3.1.2'): + self.bl_url = (f'https://raw.githubusercontent.com/biolink' + f'/biolink-model/{bl_version}/biolink-model.yaml') + log.info("bl_url is %s", self.bl_url) + self.toolkit = Toolkit() + + def find_biolink_leaves(self, biolink_concepts): + """Given list of concepts, returns leaves minus any parent concepts + :param biolink_concepts: list of biolink concepts + :return: leave concepts. + """ + ancestry_set = set() + all_concepts = set(biolink_concepts) + unknown_elements = set() + + for x in all_concepts: + current_element = self.toolkit.get_element(x) + if not current_element: + unknown_elements.add(x) + ancestors = set(self.toolkit.get_ancestors( + x, mixin=True, reflexive=False, formatted=True)) + ancestry_set = ancestry_set.union(ancestors) + leaf_set = all_concepts - ancestry_set - unknown_elements + return leaf_set + + def get_leaf_class (self, names): + """ Return the leaf classes in the provided list of names. """ + leaves = list(self.find_biolink_leaves(names)) + return leaves[0] + + def get_label(self, class_name): + "Return the label for the given class name" + element = self.toolkit.get_element(class_name) + if element: + name = element.name + return name + return class_name.replace("biolink:", "").replace("_", " ") diff --git a/dags/roger/models/kgx.py b/dags/roger/models/kgx.py new file mode 100644 index 00000000..e35ab07e --- /dev/null +++ b/dags/roger/models/kgx.py @@ -0,0 +1,503 @@ +"KGX data model for Roger" + +import os +import time +import queue +from itertools import chain +import threading +from collections import defaultdict +from xxhash import xxh64_hexdigest +import orjson as json +import ntpath +from kg_utils.merging import DiskGraphMerger +from kg_utils.constants import * + +from roger.config import get_default_config +from roger.logger import get_logger +from roger.components.data_conversion import compare_types +from roger.core import storage +from roger.models.biolink import BiolinkModel +from roger.core.enums import SchemaType + +log = get_logger() + +class KGXModel: + """ Abstractions for transforming KGX formatted data. 
+ + KGX stands for Knowledge Graph Exchange + """ + def __init__(self, biolink=None, config=None): + if not config: + config = get_default_config() + self.config = config + + # We need a temp director for the DiskGraphMerger + self.temp_directory = storage.merge_path( + self.config.kgx.merge_db_temp_dir) + log.debug(f"Setting temp_directory to : {self.temp_directory}") + isExist = os.path.exists(self.temp_directory) + if not isExist: + os.makedirs(self.temp_directory) + + self.merger = DiskGraphMerger(temp_directory=self.temp_directory, + chunk_size=5_000_000) + self.biolink_version = self.config.kgx.biolink_model_version + log.debug(f"Trying to get biolink version : {self.biolink_version}") + if biolink is None: + self.biolink = BiolinkModel(self.biolink_version) + else: + self.biolink = biolink + self.enable_metrics = self.config.get('enable_metrics', False) + + def get_kgx_json_format(self, files: list, dataset_version: str): + """Gets Json formatted kgx files. + + These files have a the following structure: + {"nodes": [{"id":"..."},...], "edges": [{"id":...},...}] } + + Parameters + ---------- + files : list of file names + dataset_version : dataset version from dataset meta-data information + + Returns None + ------- + + """ + file_tuple_q = queue.Queue() + thread_done_q = queue.Queue() + for nfile, file_name in enumerate(files): + # file_url or skip + file_name = dataset_version + "/" + file_name + file_url = storage.get_uri(file_name, "kgx_base_data_uri") + subgraph_basename = os.path.basename(file_name) + subgraph_path = storage.kgx_path(subgraph_basename) + if os.path.exists(subgraph_path): + log.info(f"cached kgx: {subgraph_path}") + continue + log.debug("#{}/{} to get: {}".format( + nfile+1, len(files), file_url)) + # folder + dirname = os.path.dirname (subgraph_path) + if not os.path.exists (dirname): + os.makedirs (dirname, exist_ok=True) + # add to queue + file_tuple_q.put((file_url,subgraph_path)) + + # start threads for each file download + threads = [] + for thread_num in range(len(files)): # len(files) + th = threading.Thread( + target=storage.downloadfile, + args=(thread_num, file_tuple_q, thread_done_q)) + th.start() + threads.append(th) + + # wait for each thread to complete + for nwait in range(len(threads)): + thread_num, num_files_processed = thread_done_q.get() + th = threads[thread_num] + th.join() + log.info(f"#{nwait+1}/{len(threads)} joined: " + f"thread-{thread_num} processed: " + f"{num_files_processed} file(s)") + + all_kgx_files = [] + for nfile, file_name in enumerate(files): + start = storage.current_time_in_millis() + file_name = dataset_version + "/" + file_name + file_url = storage.get_uri(file_name, "kgx_base_data_uri") + subgraph_basename = os.path.basename(file_name) + subgraph_path = storage.kgx_path(subgraph_basename) + all_kgx_files.append(subgraph_path) + if os.path.exists(subgraph_path): + log.info(f"cached kgx: {subgraph_path}") + continue + log.info ("#{}/{} read: {}".format(nfile+1, len(files), file_url)) + subgraph = storage.read_object(file_url) + storage.write_object(subgraph, subgraph_path) + total_time = storage.current_time_in_millis() - start + edges = len(subgraph['edges']) + nodes = len(subgraph['nodes']) + log.info( + "#{}/{} edges:{:>7} nodes: {:>7} time:{:>8} wrote: {}".format( + nfile+1, len(files), edges, nodes, + total_time/1000, subgraph_path)) + return all_kgx_files + + def get_kgx_jsonl_format(self, files, dataset_version): + """gets pairs of jsonl formatted kgx files. + + Files is expected to have all the pairs. 
+ + I.e if kgx_1_nodes.jsonl exists its expected that kgx_1_edges.jsonl + exists in the same path. + File names should have strings *nodes*.jsonl and *edges*.jsonl. + Parameters + ---------- + files + dataset_version + + Returns + ------- + + """ + # make a paired list + paired_up = [] + log.info(f"getting {files}") + for file_name in files: + if "nodes" in file_name: + paired_up.append( + [file_name, file_name.replace('nodes', 'edges')]) + error = False + # validate that all pairs exist + if len(files) / 2 != len(paired_up): + log.error("Error paired up kgx jsonl files don't match " + "list of files specified in metadata.yaml") + error = True + for pairs in paired_up: + if pairs[0] not in files: + log.error( + f"{pairs[0]} not in original list " + f"of files from metadata.yaml") + error = True + if pairs[1] not in files: + error = True + log.error( + f"{pairs[1]} not in original list " + f"of files from metadata.yaml") + if error: + raise Exception("Metadata.yaml has inconsistent jsonl files") + + file_tuple_q = queue.Queue() + thread_done_q = queue.Queue() + for npairs, pairs in enumerate(paired_up): + for npair, p in enumerate(pairs): + file_name = dataset_version + "/" + p + file_url = storage.get_uri(file_name, "kgx_base_data_uri") + subgraph_basename = os.path.basename(file_name) + subgraph_path = storage.kgx_path(subgraph_basename) + if os.path.exists(subgraph_path): + log.info(f"skip cached kgx: {subgraph_path}") + continue + log.info ("#{}.{}/{} read: {}".format( + npairs+1, npair+1, len(paired_up), file_url)) + # folder + dirname = os.path.dirname (subgraph_path) + if not os.path.exists (dirname): + os.makedirs (dirname, exist_ok=True) + # add to queue + file_tuple_q.put((file_url,subgraph_path)) + + # start threads for each file download + threads = [] + for thread_num in range(file_tuple_q.qsize()): + th = threading.Thread( + target=storage.downloadfile, + args=(thread_num, file_tuple_q, thread_done_q)) + th.start() + threads.append(th) + + # wait for each thread to complete + for nwait in range(len(threads)): + thread_num, num_files_processed = thread_done_q.get() + th = threads[thread_num] + th.join() + log.info(f"#{nwait+1}/{len(threads)} joined: " + f"thread-{thread_num} processed: " + f"{num_files_processed} file(s)") + + all_kgx_files = [] + for pairs in paired_up: + nodes = 0 + edges = 0 + start = storage.current_time_in_millis() + for p in pairs: + file_name = dataset_version + "/" + p + file_url = storage.get_uri(file_name, "kgx_base_data_uri") + subgraph_basename = os.path.basename(file_name) + subgraph_path = storage.kgx_path(subgraph_basename) + all_kgx_files.append(subgraph_path) + if os.path.exists(subgraph_path): + log.info(f"cached kgx: {subgraph_path}") + continue + data = storage.read_object(file_url) + storage.write_object(data, subgraph_path) + if "edges" in p: + edges = len(data.split('\n')) + else: + nodes = len(data.split('\n')) + total_time = storage.current_time_in_millis() - start + log.info( + "wrote {:>45}: edges:{:>7} nodes: {:>7} time:{:>8}".format( + storage.trunc(subgraph_path, 45), edges, nodes, total_time)) + return all_kgx_files + + def get (self, dataset_version = "v1.0"): + """ Read metadata for KGX files and downloads them locally. + :param dataset_version: Data version to operate on. 
+ """ + metadata = storage.read_relative_object ("../../metadata.yaml") + data_set_list = self.config.kgx.data_sets + kgx_files_remote = [] + for item in metadata['kgx']['versions']: + if (item['version'] == dataset_version and + item['name'] in data_set_list): + log.info(f"Getting KGX dataset {item['name']}, " + f"version {item['version']}") + if item['format'] == 'json': + kgx_files_remote += self.get_kgx_json_format( + item['files'], item['version']) + elif item['format'] == 'jsonl': + kgx_files_remote += self.get_kgx_jsonl_format( + item['files'], item['version']) + else: + raise ValueError( + f"Unrecognized format in metadata.yaml: " + f"{item['format']}, valid formats are `json` " + f"and `jsonl`.") + # Fetchs kgx generated from Dug Annotation workflow. + new_files = self.fetch_dug_kgx() + kgx_files_remote + all_files_in_dir = ( + storage.kgx_objects("json") + + storage.kgx_objects("jsonl")) + files_to_remove = [x for x in all_files_in_dir + if x not in new_files] + if len(files_to_remove): + log.info( + "Found some old files to remove from kgx dir : %s", + files_to_remove) + for file in files_to_remove: + storage.remove(file) + log.info("removed %s", file) + log.info("Done.") + + + + def fetch_dug_kgx(self): + """ + Copies files from dug output dir to roger kgx dir. + :return: + """ + dug_kgx_files = storage.dug_kgx_objects() + all_kgx_files = [] + log.info("Copying dug KGX files to %s. Found %d kgx files to copy.", + storage.kgx_path(''), len(dug_kgx_files)) + for file in dug_kgx_files: + file_name = ntpath.basename(file) + dest = storage.kgx_path(file_name) + all_kgx_files.append(dest) + storage.write_object({}, dest) + log.info(f"Copying from {file} to {dest}.") + storage.copy_file_to_dir(file, dest) + log.info("Done copying dug KGX files.") + return all_kgx_files + + def create_nodes_schema(self, input_data_path=None, output_data_path=None): + """ + Extracts schema for nodes based on biolink leaf types + :return: + """ + + category_schemas = defaultdict(lambda: None) + category_error_nodes = set() + merged_nodes_file = storage.merged_objects("nodes", input_data_path) + log.info(f"Processing : {merged_nodes_file}") + counter = 0 + for node in storage.json_line_iter(merged_nodes_file): + # Debuging code + if counter % 10000 == 0: + log.info(f"Processing node : {node} counter : {counter}") + counter += 1 + + if not node.get('category'): + category_error_nodes.add(node['id']) + node['category'] = [BiolinkModel.root_type] + + # Get all leaf types of this node + node_types = list( + self.biolink.find_biolink_leaves(node['category'])) + # pick the fist one to work on + node_type = node_types[0] + + + # make sure it is defined in the final dict + category_schemas[node_type] = category_schemas.get(node_type, {}) + + # compute full list of attributes and the value types of the + # attributes for that type. + for k in node.keys(): + current_type = type(node[k]).__name__ + if k not in category_schemas[node_type]: + category_schemas[node_type][k] = current_type + else: + previous_type = category_schemas[node_type][k] + category_schemas[node_type][k] = compare_types( + previous_type, current_type) + + # copy over final result to every other leaf type + for tp in node_types: + category_schemas[tp] = category_schemas[node_type] + + + if len(category_error_nodes): + log.warning(f"some nodes didn't have category assigned. " + f"KGX file has errors." + f"Nodes {len(category_error_nodes)}." + f"Showing first 10: {list(category_error_nodes)[:10]}." 
+ f"These will be treated as {BiolinkModel.root_type}.") + + # Write node schemas. + self.write_schema(category_schemas, SchemaType.CATEGORY, output_path=output_data_path) + + def create_edges_schema(self, input_data_path=None, output_data_path=None): + """ + Create unified schema for all edges in an edges jsonl file. + :return: + """ + predicate_schemas = defaultdict(lambda: None) + merged_edges_file = storage.merged_objects("edges", input_data_path) + """ Infer predicate schemas. """ + for edge in storage.json_line_iter(merged_edges_file): + predicate = edge['predicate'] + predicate_schemas[predicate] = predicate_schemas.get(predicate, + {}) + for k in edge.keys(): + current_type = type(edge[k]).__name__ + if k not in predicate_schemas[predicate]: + predicate_schemas[predicate][k] = current_type + else: + previous_type = predicate_schemas[predicate][k] + predicate_schemas[predicate][k] = compare_types( + previous_type, current_type) + self.write_schema(predicate_schemas, SchemaType.PREDICATE, output_path=output_data_path) + + def create_schema (self): + """Determine the schema of each type of object. + + We have to do this to make it possible to write tabular data. Need to + know all possible columns in advance and correct missing fields. + """ + if self.schema_up_to_date(): + log.info (f"schema is up to date.") + return + + self.create_nodes_schema() + self.create_edges_schema() + + def schema_up_to_date (self): + return storage.is_up_to_date ( + source=storage.kgx_objects(), + targets=[ + storage.schema_path ( + f"{SchemaType.PREDICATE.value}-schema.json"), + storage.schema_path ( + f"{SchemaType.PREDICATE.value}-schema.json") + ]) + + def write_schema(self, schema, schema_type: SchemaType ,output_path=None): + """ Output the schema file. + + :param schema: Schema to get keys from. + :param schema_type: Type of schema to write. 
+ """ + file_name = storage.schema_path (f"{schema_type.value}-schema.json", output_path) + log.info("writing schema: %s", file_name) + dictionary = { k : v for k, v in schema.items () } + storage.write_object (dictionary, file_name) + + def merge(self, input_path=None, output_path=None): + """ This version uses the disk merging from the kg_utils module """ + + metrics = {} + start = time.time() + + log.info(f"Input path = {input_path}, Output path = {output_path}") + + if input_path: + json_format_files = storage.kgx_objects("json", input_path) + jsonl_format_files = storage.kgx_objects("jsonl", input_path) + else: + json_format_files = storage.kgx_objects("json") + jsonl_format_files = storage.kgx_objects("jsonl") + + # Create lists of the nodes and edges files in both json and jsonl + # formats + jsonl_node_files = {file for file in jsonl_format_files + if "node" in file.split('/')[-1]} + jsonl_edge_files = {file for file in jsonl_format_files + if "edge" in file.split('/')[-1]} + log.info(f"Jsonl edge files : {jsonl_edge_files}") + log.info(f"Jsonl node files : {jsonl_node_files}") + + # Create all the needed iterators and sets thereof + jsonl_node_iterators = [storage.jsonl_iter(file_name) + for file_name in jsonl_node_files] + jsonl_edge_iterators = [storage.jsonl_iter(file_name) + for file_name in jsonl_edge_files] + json_node_iterators = [storage.json_iter(file_name, 'nodes') + for file_name in json_format_files] + json_edge_iterators = [storage.json_iter(file_name, 'edges') + for file_name in json_format_files] + all_node_iterators = json_node_iterators + jsonl_node_iterators + all_edge_iterators = json_edge_iterators + jsonl_edge_iterators + + # chain the iterators together + node_iterators = chain(*all_node_iterators) + edge_iterators = chain(*all_edge_iterators) + + # now do the merge + self.merger.merge_nodes(node_iterators) + merged_nodes = self.merger.get_merged_nodes_jsonl() + + + self.merger.merge_edges(edge_iterators) + merged_edges = self.merger.get_merged_edges_jsonl() + + write_merge_metric = {} + t = time.time() + start_nodes_jsonl = time.time() + + + nodes_file_path = storage.merge_path("nodes.jsonl", output_path) + + # stream out nodes to nodes.jsonl file + with open(nodes_file_path, 'w') as stream: + for nodes in merged_nodes: + stream.write(nodes) + + time_difference = time.time() - start_nodes_jsonl + log.info("writing nodes took : %s", str(time_difference)) + write_merge_metric['nodes_writing_time'] = time_difference + start_edge_jsonl = time.time() + + # stream out edges to edges.jsonl file + edges_file_path = storage.merge_path("edges.jsonl", output_path) + with open(edges_file_path, 'w') as stream: + for edges in merged_edges: + edges = json.loads(edges) + # Add an id field for the edges as some of the downstream + # processing expects it. 
+ edges['id'] = xxh64_hexdigest( + edges['subject'] + edges['predicate'] + + edges['object'] + + edges.get("biolink:primary_knowledge_source", "")) + keys_to_del = set() + for key in edges: + if key.startswith('biolink:'): + keys_to_del.add(key) + for k in keys_to_del: + edges[k.replace('biolink:', '')] = edges[k] + del edges[k] + stream.write(json.dumps(edges).decode('utf-8') + '\n') + + write_merge_metric['edges_writing_time'] = time.time() - start_edge_jsonl + log.info(f"writing edges took: {time.time() - start_edge_jsonl}") + write_merge_metric['total_time'] = time.time() - t + metrics['write_jsonl'] = write_merge_metric + metrics['total_time'] = time.time() - start + log.info(f"total took: {time.time() - start}") + if self.enable_metrics: + metricsfile_path = storage.metrics_path('merge_metrics.yaml') + storage.write_object(metrics, metricsfile_path) + diff --git a/dags/roger/pipelines/README.md b/dags/roger/pipelines/README.md new file mode 100644 index 00000000..e77e6a29 --- /dev/null +++ b/dags/roger/pipelines/README.md @@ -0,0 +1,99 @@ +# Building custom Dug data pipelines + +The pipelines submodule is where data pipelines can be defined for specific data +sets with specific, custom behaviors for each one. In previous versions of the +code, customizations for each pipeline were spread across several modules. With +this instantiation, the customizations for each data set pipeline are +consolidated into a single overridden subclass of the DataPipeline class. + +## What the base pipeline does + +The function `roger.tasks.create_pipeline_taskgroup`, when called with the given +data pipeline class, will emit an Airflow task group with the following +structure. If Airflow is not being used, another executor should use a similarly +structured set of calls and dependencies to ensure that the task pipeline +executes fully and in order. + +```mermaid +graph TD; + annotate-->index_variables; + annotate-->validate_index_variables; + index_variables-->validate_index_variables; + annotate-->make_kg_tagged; + annotate-->crawl_tranql; + annotate-->index_concepts; + crawl_tranql-->validate_index_concepts; + index_concepts-->validate_index_concepts; + annotate-->validate_index_concepts; +``` +The pipeline steps are briefly described below + +### annotate + +By default, `annotate` will call the `get_objects` method to collect a list of +parsable files. For each of these files, a Dug Crawler object will be created +which will apply the parser returned by the pipeline class's `get_parser_name` +method. (This by default will return `parser_name` if it's defined, or will fall +back to `pipeline_name`.) The results will be written to `elements.json` and +`concepts.json` as appropriate. + +### index_variables + +This will load the `elements.json` files from `annotate` and pass them to the +indexer built from a DugFactory object. (This is sending them to ElasticSearch +for indexing under the hood.) + +### make_kg_tagged + +All `elements.json` files will be loaded, and based on the annotations, a +Translator-compliant knowledge graph will be written to a `_kgx.json` file. + +### index_concepts + +The `concepts.json` files are read and submitted to ElasticSearch using the +indexer object derived from the embedded DugFactory object. + +### validate_index_concepts + +Concepts from `concepts.json` are double-checked to ensure that the ES indexing +process actually worked. 
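+
+If Airflow is not being used, the same ordering can be reproduced with plain
+method calls on a pipeline instance. The sketch below is illustrative only: it
+assumes `get_default_config()` returns a fully populated `RogerConfig`, uses the
+BACPAC pipeline (defined in the next section) as a stand-in, and relies on the
+context-manager support in `roger/pipelines/base.py:DugPipeline`.
+
+```python
+from roger.config import get_default_config
+from roger.pipelines.bacpac import BacPacPipeline
+
+# Sequential stand-in for the Airflow task group: each call mirrors one node
+# of the diagram above and runs only after the steps it depends on.
+with BacPacPipeline(get_default_config()) as pipeline:
+    pipeline.annotate()                    # parse and annotate source files
+    pipeline.index_variables()             # send elements.json to ElasticSearch
+    pipeline.validate_indexed_variables()  # spot-check the variables index
+    pipeline.make_kg_tagged()              # write *_kgx.json knowledge graphs
+    pipeline.crawl_tranql()                # expand concepts via TranQL
+    pipeline.index_concepts()              # send expanded concepts to ElasticSearch
+    pipeline.validate_indexed_concepts()   # spot-check the concepts index
+```
+
+Note that actually running these steps requires the ElasticSearch, Redis, and
+TranQL services configured for Roger to be reachable; the sketch only
+illustrates the call order that an alternative orchestrator should preserve.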
+ +## Defining a basic pipeline, with no customizations + +Simple pipelines, such as that for the BACPAC dataset, need very little +customization. All pipelines must define a `pipeline_name`, which will be used +as the default value for a number of other parameters if they are not +defined. In the case of BACPAC, a difference in case means that both the +`pipeline_name` and the `parser_name` need to be defined. + +```python +from roger.pipelines import DugPipeline + +class BacPacPipeline(DugPipeline): + "Pipeline for BACPAC data set" + pipeline_name = "bacpac" + parser_name = "BACPAC" +``` + +This is the full extent of the code needed to adapt the DugPipeline object to +BACPAC. Other data sets have more specific customizations that need more custom +code or variables defined. + +## More extensive customization + +Because the base pipeline (defined in `roger/pipelines/base.py:DugPipeline`) is +inherited as a subclass for customizing, effectively any part of the pipeline +that isn't part of Dug proper can be overriden. Here are some common +customizations that are expected to be necessary for many parts of the process: + +### get_objects + +The `get_objects` method by default looks in the `input_data_path` that is +passed to it, and if that is None, loads the default from the `ROGER_DATA_DIR` +environment variable. By default, it reads all files with the `.xml` extension +recursively anywhere in that directory or its subdirectories. + +One example customization is the anvil data pipeline, which additionally +excludes any file that starts with 'GapExchange_'. Any overriden method should +accept an optional `input_data_path` parameter and return a list of files, +sorted in the order that they should be processed. diff --git a/dags/roger/pipelines/__init__.py b/dags/roger/pipelines/__init__.py new file mode 100644 index 00000000..d6664b8e --- /dev/null +++ b/dags/roger/pipelines/__init__.py @@ -0,0 +1,28 @@ +"Modules for individual datasets" + +import pkgutil +from pathlib import Path +import importlib + +from .base import DugPipeline + +def get_pipeline_classes(pipeline_names_dict): + """Return a list of all defined pipeline classes + """ + + base_path = Path(__file__).resolve().parent + + for (_, mod_name, _) in pkgutil.iter_modules([base_path]): + if mod_name == 'base': + continue + + # No need to actuall get the module symbol, once it's imported, it will + # show up below in __subclasses__. 
+ importlib.import_module(f"{__name__}.{mod_name}") + pipeline_list = [] + + for subclass in DugPipeline.__subclasses__(): + if getattr(subclass, 'pipeline_name') and getattr(subclass, 'pipeline_name') in pipeline_names_dict.keys(): + subclass.input_version = pipeline_names_dict[getattr(subclass, 'pipeline_name')] + pipeline_list.append(subclass) + return pipeline_list diff --git a/dags/roger/pipelines/anvil.py b/dags/roger/pipelines/anvil.py new file mode 100644 index 00000000..baa82c05 --- /dev/null +++ b/dags/roger/pipelines/anvil.py @@ -0,0 +1,24 @@ +"Pipeline for anvil data" + +from roger.pipelines import DugPipeline +from roger.core import storage + +class AnvilPipeline(DugPipeline): + "Pipeline for Anvil data set" + pipeline_name = 'anvil' + parser_name = 'Anvil' + + def get_objects(self, input_data_path=None): + """Retrieve anvil objects + + This code is imported from roger.core.storage.dug_anvil_objects + """ + if not input_data_path: + input_data_path = storage.dug_input_files_path( + self.files_dir) + files = storage.get_files_recursive( + lambda file_name: ( + not file_name.startswith('GapExchange_') + and file_name.endswith('.xml')), + input_data_path) + return sorted([str(f) for f in files]) diff --git a/dags/roger/pipelines/bacpac.py b/dags/roger/pipelines/bacpac.py new file mode 100644 index 00000000..495ba3b9 --- /dev/null +++ b/dags/roger/pipelines/bacpac.py @@ -0,0 +1,8 @@ +"Pipeline for BACPAC data" + +from roger.pipelines import DugPipeline + +class BacPacPipeline(DugPipeline): + "Pipeline for BACPAC data set" + pipeline_name = "bacpac" + parser_name = "BACPAC" diff --git a/dags/roger/pipelines/base.py b/dags/roger/pipelines/base.py new file mode 100644 index 00000000..e108443e --- /dev/null +++ b/dags/roger/pipelines/base.py @@ -0,0 +1,994 @@ +"Base class for implementing a dataset annotate, crawl, and index pipeline" + +import os +import asyncio +from io import StringIO +import logging +import re +import hashlib +import traceback +from functools import reduce +from pathlib import Path +import tarfile +from typing import Union +import jsonpickle + +import requests + +from dug.core import get_parser, get_annotator, get_plugin_manager, DugConcept +from dug.core.concept_expander import ConceptExpander +from dug.core.crawler import Crawler +from dug.core.factory import DugFactory +from dug.core.parsers import Parser, DugElement +from dug.core.annotators import Annotator +from dug.core.async_search import Search +from dug.core.index import Index + +from roger.config import RogerConfig +from roger.core import storage +from roger.models.biolink import BiolinkModel +from roger.logger import get_logger + +from utils.s3_utils import S3Utils + +log = get_logger() + +class PipelineException(Exception): + "Exception raised from DugPipeline and related classes" + +def make_edge(subj, + obj, + predicate='biolink:related_to', + predicate_label='related to', + relation='biolink:related_to', + relation_label='related to' + ): + """Create an edge between two nodes. + + :param subj: The identifier of the subject. + :param pred: The predicate linking the subject and object. + :param obj: The object of the relation. + :param predicate: Biolink compatible edge type. + :param predicate_label: Edge label. + :param relation: Ontological edge type. + :param relation_label: Ontological edge type label. + :returns: Returns and edge. 
+ """ + edge_id = hashlib.md5( + f'{subj}{predicate}{obj}'.encode('utf-8')).hexdigest() + return { + "subject": subj, + "predicate": predicate, + "predicate_label": predicate_label, + "id": edge_id, + "relation": relation, + "relation_label": relation_label, + "object": obj, + "provided_by": "renci.bdc.semanticsearch.annotator" + } + +class FileFetcher: + """A basic remote file fetcher class + """ + + def __init__( + self, + remote_host: str, + remote_dir: Union[str, Path], + local_dir: Union[str, Path] = "." + ): + self.remote_host = remote_host + if isinstance(remote_dir, str): + self.remote_dir = remote_dir.rstrip("/") + else: + self.remote_dir = str(remote_dir.as_posix()) + self.local_dir = Path(local_dir).resolve() + + def __call__(self, remote_file_path: Union[str, Path]) -> Path: + remote_path = self.remote_dir + "/" + remote_file_path + local_path = self.local_dir / remote_file_path + url = f"{self.remote_host}{remote_path}" + log.debug("Fetching %s", url) + try: + response = requests.get(url, allow_redirects=True, timeout=60) + except Exception as e: + log.error("Unexpected %s: %s", e.__class__.__name__, str(e)) + raise RuntimeError(f"Unable to fetch {url}") from e + + log.debug("Response: %d", response.status_code) + if response.status_code != 200: + log.debug("Unable to fetch %s: %d", url, response.status_code) + raise RuntimeError(f"Unable to fetch {url}") + + with local_path.open('wb') as file_obj: + file_obj.write(response.content) + return local_path + +class DugPipeline(): + "Base class for dataset pipelines" + + pipeline_name = None + unzip_source = True + input_version = "" + + def __init__(self, config: RogerConfig, to_string=False): + "Set instance variables and check to make sure we're overriden" + if not self.pipeline_name: + raise PipelineException( + "Subclass must at least define pipeline_name as class var") + self.config = config + self.bl_toolkit = BiolinkModel() + dug_conf = config.to_dug_conf() + self.element_mapping = config.indexing.element_mapping + self.factory = DugFactory(dug_conf) + self.cached_session = self.factory.build_http_session() + self.event_loop = asyncio.new_event_loop() + self.log_stream = StringIO() + if to_string: + self.string_handler = logging.StreamHandler(self.log_stream) + log.addHandler(self.string_handler) + self.s3_utils = S3Utils(self.config.s3_config) + + self.tranqlizer: ConceptExpander = self.factory.build_tranqlizer() + + graph_name = self.config["redisgraph"]["graph"] + source = f"redis:{graph_name}" + self.tranql_queries: dict = self.factory.build_tranql_queries(source) + self.node_to_element_queries: list = ( + self.factory.build_element_extraction_parameters(source)) + + indexing_config = config.indexing + self.variables_index = indexing_config.get('variables_index') + self.concepts_index = indexing_config.get('concepts_index') + self.kg_index = indexing_config.get('kg_index') + + self.search_obj: Search = self.factory.build_search_obj([ + self.variables_index, + self.concepts_index, + self.kg_index, + ]) + self.index_obj: Index = self.factory.build_indexer_obj([ + self.variables_index, + self.concepts_index, + self.kg_index, + + ]) + + def __enter__(self): + self.event_loop = asyncio.new_event_loop() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + # close elastic search connection + self.event_loop.run_until_complete(self.search_obj.es.close()) + # close async loop + if self.event_loop.is_running() and not self.event_loop.is_closed(): + self.event_loop.close() + if exc_type or exc_val or exc_tb: + 
traceback.print_exc() + log.error("%s %s %s", exc_val, exc_val, exc_tb) + log.exception("Got an exception") + + def get_data_format(self): + """Access method for data_format parameter + + Defaults to pipeline_name unless self.data_format is set. This method + can also be overriden + """ + return getattr(self, 'data_format', self.pipeline_name) + + def get_files_dir(self): + """Access method for files_dir parameter + + Defaults to pipeline_name unless self.files_dir is set. This method can + also be overriden. + """ + return getattr(self, 'files_dir', self.pipeline_name) + + def get_parser_name(self): + """Access method for parser_name + + Defaults to pipeline_name unless self.parser_name is set. This method + can also be overriden. + """ + return getattr(self, 'parser_name', self.pipeline_name) + + def get_annotator_name(self): + """ + Access method for annotator_name + Defaults to annotator_monarch unless specified using annotation.annotator_type in the configuration file. + """ + return self.config.annotation.annotator_type + + + def get_parser(self): + dug_plugin_manager = get_plugin_manager() + parser: Parser = get_parser(dug_plugin_manager.hook, + self.get_parser_name()) + return parser + + def get_annotator(self): + dug_plugin_manager = get_plugin_manager() + annotator: Annotator = get_annotator( + dug_plugin_manager.hook, + self.get_annotator_name(), + self.config.to_dug_conf() + ) + return annotator + + def init_annotator(self, max_retries=5, base_delay=1, max_delay=10): + attempt = 0 + while attempt < max_retries: + try: + log.info("Initializing annotator") + annotator = self.get_annotator() + return annotator # success + except Exception as e: + attempt += 1 + if attempt == max_retries: + log.error("Max retries reached when creating annotator. Failing with error: %s", e) + raise + delay = min(base_delay * (2 ** (attempt - 1)), max_delay) + delay += random.uniform(0, 1) # add jitter + log.warning("Error occurred: %s. Retrying in %.2f seconds...", e, delay) + time.sleep(delay) + + + def annotate_files(self, parsable_files, output_data_path=None): + """ + Annotates a Data element file using a Dug parser. + :param parser_name: Name of Dug parser to use. + :param parsable_files: Files to parse. + :return: None. + """ + if not output_data_path: + output_data_path = storage.dug_annotation_path('') + log.info("Parsing files") + log.info("Intializing parser") + parser = self.get_parser() + log.info("Done intializing parser") + annotator = self.init_annotator() + log.info("Done intializing annotator") + for _, parse_file in enumerate(parsable_files): + log.debug("Creating Dug Crawler object on parse_file %s at %d of %d", + parse_file, _ , len(parsable_files)) + crawler = Crawler( + crawl_file=parse_file, + parser=parser, + annotator=annotator, + tranqlizer='', + tranql_queries=[], + http_session=self.cached_session + ) + + # configure output space. + current_file_name = '.'.join( + os.path.basename(parse_file).split('.')[:-1]) + elements_file_path = os.path.join( + output_data_path, current_file_name) + elements_file = os.path.join(elements_file_path, 'elements.txt') + concepts_file = os.path.join(elements_file_path, 'concepts.txt') + + # Use the specified parser to parse the parse_file into elements. + log.debug("Parser is %s", str(parser)) + elements = parser(parse_file) + log.debug("Parsed elements: %s", str(elements)) + + # This inserts the list of elements into the crawler where + # annotate_elements expects to find it. 
Maybe in some future version + # of Dug this could be a parameter instead of an attribute? + crawler.elements = elements + + # @TODO propose for Dug to make this a crawler class init param(??) + crawler.crawlspace = elements_file_path + log.debug("Crawler annotator: %s", str(crawler.annotator)) + crawler.annotate_elements() + + # Extract out the concepts gotten out of annotation + # Extract out the elements + non_expanded_concepts = crawler.concepts + # The elements object will have been modified by annotate_elements, + # so we want to make sure to catch those modifications. + elements = crawler.elements + + # Write pickles of objects to file + log.info("Parsed and annotated: %s", parse_file) + + storage.write_object(jsonpickle.encode(elements, indent=2), elements_file) + log.info("Serialized annotated elements to : %s", elements_file) + + storage.write_object(jsonpickle.encode(non_expanded_concepts, indent=2), concepts_file) + log.info("Serialized annotated concepts to : %s", concepts_file) + + def convert_to_kgx_json(self, elements, written_nodes=None): + """ + Given an annotated and normalized set of study variables, + generate a KGX compliant graph given the normalized annotations. + Write that grpah to a graph database. + See BioLink Model for category descriptions. + https://biolink.github.io/biolink-model/notes.html + """ + if written_nodes is None: + written_nodes = set() + graph = { + "nodes": [], + "edges": [] + } + edges = graph['edges'] + nodes = graph['nodes'] + + for _, element in enumerate(elements): + # DugElement means a variable (Study variable...) + if not isinstance(element, DugElement): + continue + study_id = element.collection_id + study_link = element.collection_action + study_desc = element.collection_desc + study_name = element.collection_name or element.collection_id + + + if study_id not in written_nodes: + nodes.append({ + "id": study_id, + "category": ["biolink:Study"], + "name": study_name, + "url": study_link, + "description": study_desc + }) + written_nodes.add(study_id) + + # connect the study and the variable. + edges.append(make_edge( + subj=element.id, + relation_label='part of', + relation='BFO:0000050', + obj=study_id, + predicate='biolink:part_of', + predicate_label='part of')) + edges.append(make_edge( + subj=study_id, + relation_label='has part', + relation="BFO:0000051", + obj=element.id, + predicate='biolink:has_part', + predicate_label='has part')) + + # a node for the variable. Should be BL compatible + variable_node = { + "id": element.id, + "name": element.name, + "category": ["biolink:StudyVariable"], + # bulk loader parsing issue + "description": ( + element.description.replace("'", '`').replace('\n', ' ')) + } + if element.id not in written_nodes: + nodes.append(variable_node) + written_nodes.add(element.id) + + for identifier, metadata in element.concepts.items(): + identifier_object = metadata.identifiers.get(identifier) + # This logic is treating DBGap files. + # First item in current DBGap xml files is a topmed tag, + # This is treated as a DugConcept Object. But since its not + # a concept we get from annotation (?) its never added to + # variable.concepts.items (Where variable is a DugElement obj) + # The following logic is trying to extract types, and for the + # aformentioned topmed tag it adds + # `biolink:InfomrmationContentEntity` + # Maybe a better solution could be adding types on + # DugConcept objects + # More specifically Biolink compatible types (?) 
+ # + if identifier_object: + category = identifier_object.types + elif identifier.startswith("TOPMED.TAG:"): + category = ["biolink:InformationContentEntity"] + else: + continue + if identifier not in written_nodes: + if isinstance(category, str): + bl_element = self.bl_toolkit.toolkit.get_element( + category) + category = [bl_element.class_uri or bl_element.slot_uri] + nodes.append({ + "id": identifier, + "category": category, + "name": metadata.name + }) + written_nodes.add(identifier) + # related to edge + edges.append(make_edge( + subj=element.id, + obj=identifier + )) + # related to edge + edges.append(make_edge( + subj=identifier, + obj=element.id)) + return graph + + def make_tagged_kg(self, elements): + """ Make a Translator standard knowledge graph representing + tagged study variables. + :param variables: The variables to model. + :param tags: The tags characterizing the variables. + :returns: dict with nodes and edges modeling a Translator/Biolink KG. + """ + graph = { + "nodes": [], + "edges": [] + } + edges = graph['edges'] + nodes = graph['nodes'] + + # Create graph elements to model tags and their + # links to identifiers gathered by semantic tagging + tag_map = {} + # @TODO extract this into config or maybe dug ?? + topmed_tag_concept_type = "TOPMed Phenotype Concept" + nodes_written = set() + for tag in elements: + if not (isinstance(tag, DugConcept) + and tag.type == topmed_tag_concept_type): + continue + tag_id = tag.id + tag_map[tag_id] = tag + nodes.append({ + "id": tag_id, + "name": tag.name, + "description": tag.description.replace("'", "`"), + "category": ["biolink:InformationContentEntity"] + }) + + # Link ontology identifiers we've found for this tag via nlp. + for identifier, metadata in tag.identifiers.items(): + if isinstance(metadata.types, str): + bl_element = self.bl_toolkit.toolkit.get_element( + metadata.types) + category = [bl_element.class_uri or bl_element.slot_uri] + else: + category = metadata.types + synonyms = metadata.synonyms if metadata.synonyms else [] + nodes.append({ + "id": identifier, + "name": metadata.label, + "category": category, + "synonyms": synonyms + }) + nodes_written.add(identifier) + edges.append(make_edge( + subj=tag_id, + obj=identifier)) + edges.append(make_edge( + subj=identifier, + obj=tag_id)) + + concepts_graph = self.convert_to_kgx_json(elements, + written_nodes=nodes_written) + graph['nodes'] += concepts_graph['nodes'] + graph['edges'] += concepts_graph['edges'] + + return graph + + def index_elements(self, elements_file): + "Submit elements_file to ElasticSearch for indexing " + log.info("Indexing %s...", str(elements_file)) + elements =jsonpickle.decode(storage.read_object(elements_file)) + count = 0 + total = len(elements) + # Index Annotated Elements + log.info("found %d from elements files.", len(elements)) + for element in elements: + count += 1 + # Only index DugElements as concepts will be + # indexed differently in next step + if not isinstance(element, DugConcept): + # override data-type with mapping values + if element.type.lower() in self.element_mapping: + element.type = self.element_mapping[element.type.lower()] + if not element.id: + # no id no indexing + continue + # Use the Dug Index object to submit the element to ES + self.index_obj.index_element( + element, index=self.variables_index) + percent_complete = (count / total) * 100 + if percent_complete % 10 == 0: + log.info("%d %%", percent_complete) + log.info("Done indexing %s.", elements_file) + + def validate_indexed_element_file(self, elements_file): 
+ "After submitting elements for indexing, verify that they're available" + elements = [x for x in jsonpickle.decode(storage.read_object(elements_file)) + if not isinstance(x, DugConcept)] + # Pick ~ 10 % + sample_size = int(len(elements) * 0.1) + + # random.choices(elements, k=sample_size) + test_elements = elements[:sample_size] + log.info("Picked %d from %s for validation.", len(test_elements), + elements_file) + for element in test_elements: + # Pick a concept + concepts = [element.concepts[curie] for curie in element.concepts + if element.concepts[curie].name] + + if len(concepts): + # Pick the first concept + concept = concepts[0] + curie = concept.id + search_term = re.sub(r'[^a-zA-Z0-9_\ ]+', '', concept.name) + log.debug("Searching for Concept: %s and Search term: %s", + str(curie), search_term) + all_elements_ids = self._search_elements(curie, search_term) + present = element.id in all_elements_ids + if not present: + log.error("Did not find expected variable %s in search " + "result.", str(element.id)) + log.error("Concept id : %s, Search term: %s", + str(concept.id), search_term) + raise PipelineException( + f"Validation exception - did not find variable " + f"{element.id} from {str(elements_file)}" + f"when searching variable index with Concept ID : " + f"{concept.id} using Search Term : {search_term} ") + else: + log.info( + "%s has no concepts annotated. Skipping validation for it.", + str(element.id)) + + def _search_elements(self, curie, search_term): + "Asynchronously call a search on the curie and search term" + response = self.event_loop.run_until_complete(self.search_obj.search_vars_unscored( + concept=curie, + query=search_term + )) + ids_dict = [] + if 'total_items' in response: + if response['total_items'] == 0: + log.error(f"No search elements returned for variable search: {self.variables_index}.") + log.error(f"Concept id : {curie}, Search term: {search_term}") + raise Exception(f"Validation error - Did not find {curie} for" + f"Search term: {search_term}") + else: + del response['total_items'] + for element_type in response: + all_elements_ids = [e['id'] for e in + reduce(lambda x, y: x + y['elements'], response[element_type], [])] + ids_dict += all_elements_ids + return ids_dict + + def crawl_concepts(self, concepts, data_set_name, output_path=None): + """Adds tranql KG to Concepts + + Terms grabbed from KG are also added as search terms + :param concepts: + :param data_set_name: + :return: + """ + # TODO crawl dir seems to be storaing crawling info to avoid re-crawling, but is that consting us much? , it was when tranql was slow, but + # might right to consider getting rid of it. 
+ crawl_dir = storage.dug_crawl_path('crawl_output') + output_file_name = os.path.join(data_set_name, + 'expanded_concepts.txt') + extracted_dug_elements_file_name = os.path.join(data_set_name, + 'extracted_graph_elements.txt') + if not output_path: + output_file = storage.dug_expanded_concepts_path(output_file_name) + extracted_output_file = storage.dug_expanded_concepts_path( + extracted_dug_elements_file_name + ) + else: + output_file = os.path.join(output_path, output_file_name) + extracted_output_file = os.path.join( output_path, extracted_dug_elements_file_name) + + Path(crawl_dir).mkdir(parents=True, exist_ok=True) + extracted_dug_elements = [] + log.debug("Creating Dug Crawler object") + crawler = Crawler( + crawl_file="", + parser=None, + annotator=None, + tranqlizer=self.tranqlizer, + tranql_queries=self.tranql_queries, + http_session=self.cached_session, + ) + crawler.crawlspace = crawl_dir + counter = 0 + total = len(concepts) + for concept in concepts.values(): + counter += 1 + try: + crawler.expand_concept(concept) + concept.set_search_terms() + concept.set_optional_terms() + except Exception as e: + log.error(concept) + raise e + for query in self.node_to_element_queries: + log.info(query) + casting_config = query['casting_config'] + tranql_source = query['tranql_source'] + dug_element_type = query['output_dug_type'] + extracted_dug_elements += crawler.expand_to_dug_element( + concept=concept, + casting_config=casting_config, + dug_element_type=dug_element_type, + tranql_source=tranql_source + ) + concept.clean() + percent_complete = int((counter / total) * 100) + if percent_complete % 10 == 0: + log.info("%d%%", percent_complete) + log.info("Crawling %s done", data_set_name) + storage.write_object(obj=jsonpickle.encode(concepts, indent=2), path=output_file) + log.info ("Concepts serialized to %s", output_file) + storage.write_object(obj=jsonpickle.encode(extracted_dug_elements, indent=2), + path=extracted_output_file) + log.info("Extracted elements serialized to %s", extracted_output_file) + + def _index_concepts(self, concepts): + "Submit concepts to ElasticSearch for indexing" + log.info("Indexing Concepts") + total = len(concepts) + count = 0 + for concept_id, concept in concepts.items(): + count += 1 + self.index_obj.index_concept(concept, index=self.concepts_index) + # Index knowledge graph answers for each concept + for kg_answer_id, kg_answer in concept.kg_answers.items(): + self.index_obj.index_kg_answer( + concept_id=concept_id, + kg_answer=kg_answer, + index=self.kg_index, + id_suffix=kg_answer_id + ) + percent_complete = int((count / total) * 100) + if percent_complete % 10 == 0: + log.info("%s %%", percent_complete) + log.info("Done Indexing concepts") + + def _validate_indexed_concepts(self, elements, concepts): + """ + Validates linked concepts are searchable + :param elements: Annotated dug elements + :param concepts: Crawled (expanded) concepts + :return: + """ + # 1 . Find concepts with KG <= 10% of all concepts, + # <= because we might have no results for some concepts from tranql + sample_concepts = {key: value for key, value + in concepts.items() if value.kg_answers} + if len(concepts) == 0: + log.info("No Concepts found.") + return + log.info("Found only %d Concepts with Knowledge graph out of %d. %d%%", + len(sample_concepts), len(concepts), + (len(sample_concepts) / len(concepts)) * 100) + # 2. 
pick elements that have concepts in the sample concepts set + sample_elements = {} + for element in elements: + if isinstance(element, DugConcept): + continue + for concept in element.concepts: + # add elements that have kg + if concept in sample_concepts: + sample_elements[concept] = sample_elements.get( + concept, set()) + sample_elements[concept].add(element.id) + + # Time for some validation + for curie in concepts: + concept = concepts[curie] + if not concept.kg_answers: + continue + search_terms = [] + for key in concept.kg_answers: + kg_object = concept.kg_answers[key] + search_terms += kg_object.get_node_names() + search_terms += kg_object.get_node_synonyms() + # reduce(lambda x,y: x + y, [[node.get("name")] + # + node.get("synonyms", []) + # for node in concept.kg_answers[ + # "knowledge_graph"]["nodes"]], []) + # validation here is that for any of these nodes we should get back + # the variable. + # make unique + search_terms_cap = 10 + search_terms = list(set(search_terms))[:search_terms_cap] + log.debug("Using %d Search terms for concept %s", len(search_terms), + str(curie)) + for search_term in search_terms: + # avoids elastic failure due to some reserved characters + # 'search_phase_execution_exception', + # 'token_mgr_error: Lexical error ... + search_term = re.sub(r'[^a-zA-Z0-9_\ ]+', '', search_term) + + searched_element_ids = self._search_elements(curie, search_term) + + if curie not in sample_elements: + log.error("Did not find Curie id %s in Elements.", + str(curie)) + log.error("Concept id : %s, Search term: %s", + str(concept.id), search_term) + raise PipelineException( + f"Validation error - Did not find {curie} for " + f"Concept id : {concept.id}, " + f"Search term: {search_term}") + + present = bool([x for x in sample_elements[curie] + if x in searched_element_ids]) + if not present: + log.error("Did not find expected variable %s " + "in search result.", + str(curie)) + log.error("Concept id : %s, Search term: %s", + str(concept.id), search_term) + raise PipelineException( + f"Validation error - Did not find {curie} for" + f" Concept id : {concept.id}, " + f"Search term: {search_term}") + + def clear_index(self, index_id): + "Delete the index specified by index_id from ES" + exists = self.event_loop.run_until_complete(self.search_obj.es.indices.exists(index=index_id)) + if exists: + log.info("Deleting index %s", str(index_id)) + response = self.event_loop.run_until_complete( + self.search_obj.es.indices.delete(index=index_id)) + log.info("Cleared Elastic : %s", str(response)) + log.info("Re-initializing the indicies") + self.index_obj.init_indices() + + def clear_variables_index(self): + "Delete the variables index from ES" + self.clear_index(self.variables_index) + + def clear_kg_index(self): + "Delete the KG index from ES" + self.clear_index(self.kg_index) + + def clear_concepts_index(self): + "Delete the concepts index from ES" + self.clear_index(self.concepts_index) + + #### + # Methods above this are directly from what used to be + # dug_helpers.dug_utils.Dug. Methods below are consolidated from what used + # to be dug_helpers.dug_utils.DugUtil. These are intented to be the "top + # level" interface to Roger, which Airflow DAGs or other orchestrators can + # call directly. 
+ + def _fetch_s3_file(self, filename, output_dir): + "Fetch a file from s3 to output_dir" + log.info("Fetching %s", filename) + output_name = filename.split('/')[-1] + output_path = output_dir / output_name + self.s3_utils.get( + str(filename), + str(output_path), + ) + if self.unzip_source: + log.info("Unzipping %s", str(output_path)) + with tarfile.open(str(output_path)) as tar: + tar.extractall(path=output_dir) + return output_path + + def _fetch_remote_file(self, filename, output_dir, current_version): + "Fetch a file from a location using FileFetcher" + log.info("Fetching %s", filename) + # fetch from stars + remote_host = self.config.annotation_base_data_uri + fetch = FileFetcher( + remote_host=remote_host, + remote_dir=current_version, + local_dir=output_dir) + output_path = fetch(filename) + if self.unzip_source: + log.info("Unzipping %s", str(output_path)) + with tarfile.open(str(output_path)) as tar: + tar.extractall(path=output_dir) + return output_path + + def get_versioned_files(self): + """ Fetches a dug input data files to input file directory + """ + meta_data = storage.read_relative_object("../../metadata.yaml") + output_dir: Path = storage.dug_input_files_path( + self.get_files_dir()) + data_store = self.config.dug_inputs.data_source + + # clear dir + storage.clear_dir(output_dir) + data_sets = self.config.dug_inputs.data_sets + log.info("dataset: %s", data_sets) + pulled_files = [] + for data_set in data_sets: + data_set_name, current_version = data_set.split(':') + for item in meta_data["dug_inputs"]["versions"]: + if (item["version"] == current_version and + item["name"] == data_set_name and + item["format"] == self.get_data_format()): + if data_store == "s3": + for filename in item["files"]["s3"]: + pulled_files.append( + self._fetch_s3_file(filename, output_dir)) + else: + for filename in item["files"]["stars"]: + pulled_files.append( + self.fetch_remote_file(filename, output_dir, + current_version)) + return [str(filename) for filename in pulled_files] + + def get_objects(self, input_data_path=None): + """Retrieve initial source objects for parsing + + This is a default method that will be overridden by subclasses + frequently, it is expected. + """ + if not input_data_path: + input_data_path = storage.dug_input_files_path( + self.get_files_dir()) + files = storage.get_files_recursive( + lambda file_name: file_name.endswith('.xml'), + input_data_path) + return sorted([str(f) for f in files]) + + def annotate(self, to_string=False, files=None, input_data_path=None, + output_data_path=None): + "Annotate files with the appropriate parsers and crawlers" + if files is None: + files = self.get_objects(input_data_path=input_data_path) + self.annotate_files(parsable_files=files, + output_data_path=output_data_path) + output_log = self.log_stream.getvalue() if to_string else '' + return output_log + + def index_variables(self, to_string=False, element_object_files=None, + input_data_path=None, output_data_path=None): + """Index variables from element object files for pipeline + + if element_object_files is specified, only those files are + indexed. Otherwise, if the input_data_path is supplied, elements files + under that path are indexed. If neither is supplied, the default + directory is searched for index files and those are indexed. 
+ """ + # self.clear_variables_index() + if element_object_files is None: + element_object_files = storage.dug_elements_objects(input_data_path,format='txt') + for file_ in element_object_files: + self.index_elements(file_) + output_log = self.log_stream.getvalue() if to_string else '' + return output_log + + def validate_indexed_variables(self, to_string=None, + element_object_files=None, + input_data_path=None, + output_data_path=None): + "Validate output from index variables task for pipeline" + if not element_object_files: + element_object_files = storage.dug_elements_objects(input_data_path, format='txt') + for file_ in element_object_files: + log.info("Validating %s", str(file_)) + self.validate_indexed_element_file(file_) + output_log = self.log_stream.getvalue() if to_string else '' + return output_log + + def validate_indexed_concepts(self, config=None, to_string=None, input_data_path=None, output_data_path=None): + """ + Entry for validate concepts + """ + get_data_set_name = lambda file: os.path.split(os.path.dirname(file))[-1] + expanded_concepts_files_dict = { + get_data_set_name(file): file for file in storage.dug_expanded_concept_objects(data_path=input_data_path, format='txt') + } + annotated_elements_files_dict = { + get_data_set_name(file): file for file in storage.dug_elements_objects(data_path=input_data_path, format='txt') + } + try: + assert len(expanded_concepts_files_dict) == len(annotated_elements_files_dict) + except: + log.error("Files Annotated Elements files and Expanded concepts files, should be pairs") + if len(expanded_concepts_files_dict) > len(annotated_elements_files_dict): + log.error("Some Annotated Elements files (from load_and_annotate task) are missing") + else: + log.error("Some Expanded Concepts files (from crawl task) are missing") + log.error(f"Annotated Datasets : {list(annotated_elements_files_dict.keys())}") + log.error(f"Expanded Concepts Datasets: {list(expanded_concepts_files_dict.keys())}") + exit(-1) + for data_set_name in annotated_elements_files_dict: + log.debug(f"Reading concepts and elements for dataset {data_set_name}") + elements_file_path = annotated_elements_files_dict[data_set_name] + concepts_file_path = expanded_concepts_files_dict[data_set_name] + dug_elements = jsonpickle.decode(storage.read_object(elements_file_path)) + dug_concepts = jsonpickle.decode(storage.read_object(concepts_file_path)) + log.debug(f"Read {len(dug_elements)} elements, and {len(dug_concepts)} Concepts") + log.info(f"Validating {data_set_name}") + self._validate_indexed_concepts(elements=dug_elements, concepts=dug_concepts) + output_log = self.log_stream.getvalue() if to_string else '' + return output_log + + def make_kg_tagged(self, to_string=False, elements_files=None, + input_data_path=None, output_data_path=None): + "Create tagged knowledge graphs from elements" + if not output_data_path: + output_data_path = storage.dug_kgx_path("") + storage.clear_dir(output_data_path) + log.info("Starting building KGX files") + + if not elements_files: + elements_files = storage.dug_elements_objects(input_data_path, format='txt') + log.info(f"found {len(elements_files)} files : {elements_files}") + for file_ in elements_files: + elements = jsonpickle.decode(storage.read_object(file_)) + if "topmed_" in file_: + kg = self.make_tagged_kg(elements) + else: + kg = self.convert_to_kgx_json(elements) + dug_base_file_name = file_.split(os.path.sep)[-2] + output_file_path = os.path.join(output_data_path, + dug_base_file_name + '_kgx.json') + storage.write_object(kg, 
output_file_path) + log.info("Wrote %d and %d edges, to %s", len(kg['nodes']), + len(kg['edges']), output_file_path) + output_log = self.log_stream.getvalue() if to_string else '' + return output_log + + def crawl_tranql(self, to_string=False, concept_files=None, + input_data_path=None, output_data_path=None): + "Perform the tranql crawl" + if not concept_files: + concept_files = storage.dug_concepts_objects(input_data_path, format='txt') + + if output_data_path: + crawl_dir = os.path.join(output_data_path, 'crawl_output') + expanded_concepts_dir = os.path.join(output_data_path, + 'expanded_concepts') + else: + crawl_dir = storage.dug_crawl_path('crawl_output') + expanded_concepts_dir = storage.dug_expanded_concepts_path("") + log.info("Clearing crawl output dir %s", crawl_dir) + storage.clear_dir(crawl_dir) + + log.info("Clearing expanded concepts dir: %s", expanded_concepts_dir) + storage.clear_dir(expanded_concepts_dir) + + log.info("Crawling Dug Concepts, found %d file(s).", + len(concept_files)) + for file_ in concept_files: + objects = storage.read_object(file_) + objects = objects or {} + if not objects: + log.info(f'no concepts in {file_}') + data_set = jsonpickle.decode(objects) + original_variables_dataset_name = os.path.split( + os.path.dirname(file_))[-1] + self.crawl_concepts(concepts=data_set, + data_set_name=original_variables_dataset_name, output_path= output_data_path) + output_log = self.log_stream.getvalue() if to_string else '' + return output_log + + def index_concepts(self, to_string=False, + input_data_path=None, output_data_path=None): + "Index concepts from expanded concept files" + # These are concepts that have knowledge graphs from tranql + # clear out concepts and kg indicies from previous runs + # self.clear_concepts_index() + # self.clear_kg_index() + expanded_concepts_files = storage.dug_expanded_concept_objects( + input_data_path, format="txt") + for file_ in expanded_concepts_files: + concepts = jsonpickle.decode(storage.read_object(file_)) + self._index_concepts(concepts=concepts) + + if self.config.indexing.node_to_element_queries: + log.info("*******************") + + extracted_elements_files = storage.dug_extracted_elements_objects(data_path=input_data_path) + log.info(f"{extracted_elements_files}") + for file_ in extracted_elements_files: + log.info(f"reading file {file_}") + self.index_elements(file_) + output_log = self.log_stream.getvalue() if to_string else '' + return output_log diff --git a/dags/roger/pipelines/bdc.py b/dags/roger/pipelines/bdc.py new file mode 100644 index 00000000..bc30cf44 --- /dev/null +++ b/dags/roger/pipelines/bdc.py @@ -0,0 +1,19 @@ +"Pipeline for BDC-dbGap data" + +from roger.pipelines import DugPipeline +from roger.core import storage + +class bdcPipeline(DugPipeline): + "Pipeline for BDC-dbGap data set" + pipeline_name = "bdc" + parser_name = "dbgap" + + def get_objects(self, input_data_path=None): + if not input_data_path: + input_data_path = storage.dug_dd_xml_path() + files = storage.get_files_recursive( + lambda file_name: ( + not file_name.startswith('._') + and file_name.endswith('.xml')), + input_data_path) + return sorted([str(f) for f in files]) diff --git a/dags/roger/pipelines/bdc_pipelines.py b/dags/roger/pipelines/bdc_pipelines.py new file mode 100644 index 00000000..d4c6436d --- /dev/null +++ b/dags/roger/pipelines/bdc_pipelines.py @@ -0,0 +1,58 @@ +"Dug pipeline for dbGaP data set" + +from roger.pipelines import DugPipeline + +class BIOLINCCdbGaPPipeline(DugPipeline): + "Pipeline for the dbGaP data set" 
+ pipeline_name = 'bdc-biolincc' + parser_name = 'biolincc' + + +class covid19dbGaPPipeline(DugPipeline): + "Pipeline for the dbGaP data set" + pipeline_name = 'bdc-covid19' + parser_name = 'covid19' + +class dirDbGaPPipeline(DugPipeline): + pipeline_name = "bdc-dir" + parser_name = "dir" + +class LungMapDbGaPPipeline(DugPipeline): + pipeline_name = "bdc-lungmap" + parser_name = "lungmap" + +class nsrrDbGaPPipeline(DugPipeline): + pipeline_name = "bdc-nsrr" + parser_name = "nsrr" + +class ParentDbGaPPipeline(DugPipeline): + pipeline_name = "bdc-parent" + parser_name = "parent" + +class PCGCDbGaPPipeline(DugPipeline): + pipeline_name = "pcgc-dbgap" + parser_name = "pcgc" + +class RecoverDbGaPPipeline(DugPipeline): + pipeline_name = "bdc-recover" + parser_name = "recover" + +class TopmedDBGaPPipeline(DugPipeline): + pipeline_name = "bdc-topmed" + parser_name = "topmeddbgap" + +class CureSCPipeline(DugPipeline): + pipeline_name = "bdc-curesc" + parser_name = "curesc" + +class HeartFailurePipeline(DugPipeline): + pipeline_name = "bdc-heartfailure" + parser_name = "heartfailure" + +class ImagingPipeline(DugPipeline): + pipeline_name = "bdc-imaging" + parser_name = "imaging" + +class RedsPipeline(DugPipeline): + pipeline_name = "bdc-reds" + parser_name = "reds" \ No newline at end of file diff --git a/dags/roger/pipelines/crdc.py b/dags/roger/pipelines/crdc.py new file mode 100644 index 00000000..2143cf7b --- /dev/null +++ b/dags/roger/pipelines/crdc.py @@ -0,0 +1,19 @@ +"Pipeline for Cancer Commons data" + +from roger.pipelines import DugPipeline +from roger.core import storage + +class CRDCPipeline(DugPipeline): + "Pipeline for Cancer Commons data set" + pipeline_name = "crdc" + parser_name = "crdc" + + def get_objects(self, input_data_path=None): + if not input_data_path: + input_data_path = storage.dug_crdc_path() + files = storage.get_files_recursive( + lambda file_name: ( + not file_name.startswith('GapExchange_') + and file_name.endswith('.xml')), + input_data_path) + return sorted([str(f) for f in files]) diff --git a/dags/roger/pipelines/ctn.py b/dags/roger/pipelines/ctn.py new file mode 100644 index 00000000..25918062 --- /dev/null +++ b/dags/roger/pipelines/ctn.py @@ -0,0 +1,10 @@ +"Pipeline for Clinical trials network data" + +from roger.pipelines import DugPipeline + +class CTNPipeline(DugPipeline): + "Pipeline for Clinical trials nework data set" + pipeline_name = "ctn" + parser_name = "ctn" + + diff --git a/dags/roger/pipelines/db_gap.py b/dags/roger/pipelines/db_gap.py new file mode 100644 index 00000000..7c1db504 --- /dev/null +++ b/dags/roger/pipelines/db_gap.py @@ -0,0 +1,10 @@ +"Dug pipeline for dbGaP data set" + +from roger.pipelines import DugPipeline + +class dbGaPPipeline(DugPipeline): + "Pipeline for the dbGaP data set" + + pipeline_name = 'dbGaP' + parser_name = 'DbGaP' + files_dir = 'db_gap' diff --git a/dags/roger/pipelines/heal_research_programs.py b/dags/roger/pipelines/heal_research_programs.py new file mode 100644 index 00000000..bfec3f83 --- /dev/null +++ b/dags/roger/pipelines/heal_research_programs.py @@ -0,0 +1,16 @@ +"Pipeline for Heal-studies data" + +from roger.pipelines import DugPipeline +from roger.core import storage + +class HealResearchProgramPipeline(DugPipeline): + "Pipeline for Heal-research-programs data set" + pipeline_name = "heal-mds-research-networks" + parser_name = "heal-research" + + def get_objects(self, input_data_path=None): + if not input_data_path: + input_data_path = storage.dug_heal_research_program_path() + files = 
storage.get_files_recursive(lambda file_name: file_name.endswith('.xml'), + input_data_path) + return sorted([str(f) for f in files]) \ No newline at end of file diff --git a/dags/roger/pipelines/heal_studies.py b/dags/roger/pipelines/heal_studies.py new file mode 100644 index 00000000..a08e8115 --- /dev/null +++ b/dags/roger/pipelines/heal_studies.py @@ -0,0 +1,16 @@ +"Pipeline for Heal-studies data" + +from roger.pipelines import DugPipeline +from roger.core import storage + +class HealStudiesPipeline(DugPipeline): + "Pipeline for Heal-studies data set" + pipeline_name = "heal-mds-studies" + parser_name = "heal-studies" + + def get_objects(self, input_data_path=None): + if not input_data_path: + input_data_path = storage.dug_heal_study_path() + files = storage.get_files_recursive(lambda file_name: file_name.endswith('.xml'), + input_data_path) + return sorted([str(f) for f in files]) diff --git a/dags/roger/pipelines/kfdrc.py b/dags/roger/pipelines/kfdrc.py new file mode 100644 index 00000000..bcb0b7ac --- /dev/null +++ b/dags/roger/pipelines/kfdrc.py @@ -0,0 +1,19 @@ +"Pipeline for KDFRC data" + +from roger.pipelines import DugPipeline +from roger.core import storage + +class kfdrcPipeline(DugPipeline): + "Pipeline for KDFRC data set" + pipeline_name = "kfdrc" + parser_name = "kfdrc" + + def get_objects(self, input_data_path=None): + if not input_data_path: + input_data_path = storage.dug_kfdrc_path() + files = storage.get_files_recursive( + lambda file_name: ( + not file_name.startswith('GapExchange_') + and file_name.endswith('.xml')), + input_data_path) + return sorted([str(f) for f in files]) diff --git a/dags/roger/pipelines/nida.py b/dags/roger/pipelines/nida.py new file mode 100644 index 00000000..b2e841bd --- /dev/null +++ b/dags/roger/pipelines/nida.py @@ -0,0 +1,18 @@ +"NIDA data set pipeline definition" + +from roger.pipelines import DugPipeline +from roger.core import storage + +class NIDAPipeline(DugPipeline): + "NIDA data pipeline" + + pipeline_name = 'nida' + parser_name = 'NIDA' + + def get_objects(self, input_data_path=None): + "Return list of NIDA source files" + if not input_data_path: + input_data_path = storage.dug_input_files_path( + self.get_files_dir()) + files = sorted(storage.get_files_recursive(lambda x: 'NIDA-' in x , input_data_path)) + return files diff --git a/dags/roger/pipelines/picsure_test.py b/dags/roger/pipelines/picsure_test.py new file mode 100644 index 00000000..bea4469f --- /dev/null +++ b/dags/roger/pipelines/picsure_test.py @@ -0,0 +1,26 @@ +from roger.pipelines import DugPipeline +from roger.core import storage +from roger.logger import logger + + +class PicSure(DugPipeline): + "Pipeline for BACPAC data set" + pipeline_name = "bdc-test6" #lakefs + parser_name = "dbgap" + + def get_objects(self, input_data_path=None): + """Retrieve anvil objects + + This code is imported from roger.core.storage.dug_anvil_objects + """ + if not input_data_path: + input_data_path = storage.dug_input_files_path( + self.files_dir) + files = storage.get_files_recursive( + lambda file_name: ( + not file_name.startswith('GapExchange_') + and file_name.endswith('.xml')), + input_data_path) + logger.info("**********") + logger.info(files) + return sorted([str(f) for f in files]) \ No newline at end of file diff --git a/dags/roger/pipelines/radx.py b/dags/roger/pipelines/radx.py new file mode 100644 index 00000000..7ffae159 --- /dev/null +++ b/dags/roger/pipelines/radx.py @@ -0,0 +1,18 @@ +"Pipeline for BACPAC data" + +from roger.pipelines import DugPipeline +from 
roger.core import storage + + +class RadxPipeline(DugPipeline): + "Pipeline for Radx data set" + pipeline_name = "radx" + parser_name = "radx" + + def get_objects(self, input_data_path=None): + if not input_data_path: + input_data_path = storage.dug_kfdrc_path() + files = storage.get_files_recursive( + lambda file_name: file_name.endswith('.json'), + input_data_path) + return sorted([str(f) for f in files]) diff --git a/dags/roger/pipelines/sparc.py b/dags/roger/pipelines/sparc.py new file mode 100644 index 00000000..d1c9c950 --- /dev/null +++ b/dags/roger/pipelines/sparc.py @@ -0,0 +1,17 @@ +"Pipeline for Sparc data" + +from roger.pipelines import DugPipeline +from roger.core import storage + +class SparcPipeline(DugPipeline): + "Pipeline for Sparc data set" + pipeline_name = "sparc" + parser_name = "SciCrunch" + + def get_objects(self, input_data_path=None): + if not input_data_path: + input_data_path = storage.dug_heal_study_path() + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) + return sorted([str(f) for f in files]) diff --git a/dags/roger/pipelines/topmed.py b/dags/roger/pipelines/topmed.py new file mode 100644 index 00000000..90b3e515 --- /dev/null +++ b/dags/roger/pipelines/topmed.py @@ -0,0 +1,41 @@ +"Pipeline for Topmed data" + +from roger.pipelines import DugPipeline +from roger.pipelines.base import log, os +import jsonpickle +from roger.core import storage +from roger.logger import logger +class TopmedPipeline(DugPipeline): + "Pipeline for Topmed data set" + pipeline_name = "topmed" + parser_name = "TOPMedTag" + + def get_objects(self, input_data_path=None): + if not input_data_path: + input_data_path = str(storage.dug_input_files_path('topmed')) + files =storage.get_files_recursive( + lambda file_name: file_name.endswith('.csv'), + input_data_path) + return sorted([str(x) for x in files]) + + def make_kg_tagged(self, to_string=False, elements_files=None, + input_data_path=None, output_data_path=None): + "Create tagged knowledge graphs from elements" + log.info("Override base.make_kg_tagged called") + if not output_data_path: + output_data_path = storage.dug_kgx_path("") + storage.clear_dir(output_data_path) + if not elements_files: + elements_files = storage.dug_elements_objects(input_data_path, format='txt') + for file_ in elements_files: + elements = jsonpickle.decode(storage.read_object(file_)) + kg = self.make_tagged_kg(elements) + dug_base_file_name = file_.split(os.path.sep)[-2] + output_file_path = os.path.join(output_data_path, + dug_base_file_name + '_kgx.json') + storage.write_object(kg, output_file_path) + log.info("Wrote %d and %d edges, to %s", len(kg['nodes']), + len(kg['edges']), output_file_path) + output_log = self.log_stream.getvalue() if to_string else '' + return output_log + diff --git a/dags/roger/pvc.yaml b/dags/roger/pvc.yaml new file mode 100644 index 00000000..691fed1b --- /dev/null +++ b/dags/roger/pvc.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: search-data +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 10Mi diff --git a/dags/roger/tasks.py b/dags/roger/tasks.py new file mode 100755 index 00000000..5fe5ff90 --- /dev/null +++ b/dags/roger/tasks.py @@ -0,0 +1,449 @@ +"Tasks and methods related to Airflow implementations of Roger" + +import os + +from airflow.operators.python import PythonOperator +from airflow.operators.empty import EmptyOperator +from airflow.utils.task_group import TaskGroup +from airflow.utils.dates import days_ago +from 
airflow.models import DAG +from airflow.models.dag import DagContext +from airflow.models.taskinstance import TaskInstance +from airflow.operators.bash import BashOperator +from typing import Union +from pathlib import Path +import glob +import shutil + + +from roger.config import config, RogerConfig +from roger.logger import get_logger +from roger.pipelines.base import DugPipeline +from avalon.mainoperations import put_files, LakeFsWrapper, get_files +from lakefs_sdk.configuration import Configuration +from lakefs_sdk.models.merge import Merge +from functools import partial + +logger = get_logger() + +default_args = { + 'owner': 'RENCI', + 'start_date': days_ago(1) +} + + +def task_wrapper(python_callable, **kwargs): + """ + Overrides configuration with config from airflow. + :param python_callable: + :param kwargs: + :return: + """ + # get dag config provided + dag_run = kwargs.get('dag_run') + pass_conf = kwargs.get('pass_conf', True) + if config.lakefs_config.enabled: + # get input path + input_data_path = generate_dir_name_from_task_instance(kwargs['ti'], + roger_config=config, + suffix='input') + # get output path from task id run id dag id combo + output_data_path = generate_dir_name_from_task_instance(kwargs['ti'], + roger_config=config, + suffix='output') + else: + input_data_path, output_data_path = None, None + # cast it to a path object + func_args = { + 'input_data_path': input_data_path, + 'output_data_path': output_data_path, + 'to_string': kwargs.get('to_string') + } + logger.info(f"Task function args: {func_args}") + # overrides values + config.dag_run = dag_run + if pass_conf: + return python_callable(config=config, **func_args) + return python_callable(**func_args) + +def get_executor_config(data_path='/opt/airflow/share/data'): + """ Get an executor configuration. + :param annotations: Annotations to attach to the executor. + :returns: Returns a KubernetesExecutor if K8s configured, None otherwise. + """ + env_var_prefix = config.OS_VAR_PREFIX + # based on environment set on scheduler pod, make secrets for worker pod + # this ensures passwords don't leak as pod templates. 
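The `task_wrapper` helper above always invokes the wrapped callable with the same keyword arguments it assembles in `func_args` (`input_data_path`, `output_data_path`, `to_string`, plus `config` when `pass_conf` is set). A minimal sketch of a compatible callable, for illustration only; the function name and body are hypothetical:

```python
# Hypothetical callable matching the keyword arguments task_wrapper builds
# in func_args; a real task would call DugPipeline methods here instead.
def example_task(config=None, input_data_path=None,
                 output_data_path=None, to_string=False):
    lines = [f"reading from {input_data_path}",
             f"writing to {output_data_path}"]
    # ... do the actual work ...
    return "\n".join(lines) if to_string else ""
```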
+ secrets_map = [{ + "secret_name_ref": "ELASTIC_SEARCH_PASSWORD_SECRET", + "secret_key_ref": "ELASTIC_SEARCH_PASSWORD_SECRET_KEY", + "env_var_name": f"{env_var_prefix}ELASTIC__SEARCH_PASSWORD" + },{ + "secret_name_ref": "REDIS_PASSWORD_SECRET", + "secret_key_ref": "REDIS_PASSWORD_SECRET_KEY", + "env_var_name": f"{env_var_prefix}REDISGRAPH_PASSWORD" + }] + secrets = [] + for secret in secrets_map: + secret_name = os.environ.get(secret["secret_name_ref"], False) + secret_key_name = os.environ.get(secret["secret_key_ref"], False) + if secret_name and secret_key_name: + secrets.append({ + "name": secret["env_var_name"], + "valueFrom": { + "secretKeyRef": { + "name": secret_name, + "key": secret_key_name + } + }}) + + k8s_executor_config = { + "KubernetesExecutor": { + "envs": secrets, + } + } + return k8s_executor_config + +def init_lakefs_client(config: RogerConfig) -> LakeFsWrapper: + configuration = Configuration() + configuration.username = config.lakefs_config.access_key_id + configuration.password = config.lakefs_config.secret_access_key + configuration.host = config.lakefs_config.host + the_lake = LakeFsWrapper(configuration=configuration) + return the_lake + + +def pagination_helper(page_fetcher, **kwargs): + """Helper function to iterate over paginated results""" + while True: + resp = page_fetcher(**kwargs) + yield from resp.results + if not resp.pagination.has_more: + break + kwargs['after'] = resp.pagination.next_offset + + +def avalon_commit_callback(context: DagContext, **kwargs): + client: LakeFsWrapper = init_lakefs_client(config=config) + # now files have been processed, + # this part should + # get the out path of the task + local_path = str(generate_dir_name_from_task_instance(context['ti'], + roger_config=config, + suffix='output')).rstrip('/') + '/' + task_id = context['ti'].task_id + dag_id = context['ti'].dag_id + run_id = context['ti'].run_id + # run id looks like 2023-10-18T17:35:14.890186+00:00 + # normalized to 2023_10_18T17_35_14_890186_00_00 + # since lakefs branch id must consist of letters, digits, underscores and dashes, + # and cannot start with a dash + run_id_normalized = run_id.replace('-','_').replace(':','_').replace('+','_').replace('.','_') + dag_id_normalized = dag_id.replace('-','_').replace(':','_').replace('+','_').replace('.','_') + task_id_normalized = task_id.replace('-','_').replace(':','_').replace('+','_').replace('.','_') + temp_branch_name = f'{dag_id_normalized}_{task_id_normalized}_{run_id_normalized}' + # remote path to upload the files to. + remote_path = f'{dag_id}/{task_id}/' + + # merge destination branch + branch = config.lakefs_config.branch + repo = config.lakefs_config.repo + # This part pushes to a temp branch on the repo + + # now we have the output path lets do some pushing but where ? + # right now lets stick to using one repo , + + # issue Vladmir pointed out if uploads to a single lakefs branch have not + # been finalized with commit, + # this would cause dirty commits if parallel tasks target the same branch. + + # solution: Lakefs team suggested we commit to a different temp branch per + # task, and merge that branch. + # this callback function will do that for now. + + # 1. put files into a temp branch. + # 2. make sure a commit happens. + # 3. merge that branch to master branch. 
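The `pagination_helper` defined above flattens paginated lakeFS API responses; the loop over `diff_refs` below relies on it. A self-contained sketch of the contract it assumes (the fake fetcher is purely illustrative):

```python
# Sketch of the response shape pagination_helper expects: an object exposing
# .results plus .pagination.has_more and .pagination.next_offset.
from types import SimpleNamespace

def fake_fetcher(after=0, **_):
    page = list(range(after, min(after + 2, 5)))
    return SimpleNamespace(
        results=page,
        pagination=SimpleNamespace(has_more=(after + 2) < 5,
                                   next_offset=after + 2))

# list(pagination_helper(fake_fetcher, after=0)) == [0, 1, 2, 3, 4]
```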
+ logger.info("Pushing local path %s to %s@%s in %s dir", + local_path, repo, temp_branch_name, remote_path) + put_files( + local_path=local_path, + remote_path=remote_path, + task_name=task_id, + task_args=[""], + pipeline_id=dag_id, + task_docker_image="docker-image", + s3storage=False, + lake_fs_client=client, + branch=temp_branch_name, + repo=repo, + # @TODO figure out how to pass real commit id here + commit_id=branch, + source_branch_name=branch + ) + + # see what changes are going to be pushed from this branch to main branch + for diff in pagination_helper(client._client.refs_api.diff_refs, + repository=repo, left_ref=branch, + right_ref=temp_branch_name): + logger.info("Diff: " + str(diff)) + + try: + # merging temp branch to working branch + # the current working branch wins incase of conflicts + merge = Merge(**{"strategy": "source-wins"}) + client._client.refs_api.merge_into_branch(repository=repo, + source_ref=temp_branch_name, + destination_branch=branch, + merge=merge + ) + + logger.info(f"merged branch {temp_branch_name} into {branch}") + except Exception as e: + # remove temp + logger.error(e) + # delete temp branch + finally: + client._client.branches_api.delete_branch( + repository=repo, + branch=temp_branch_name + ) + + logger.info(f"deleted temp branch {temp_branch_name}") + logger.info(f"deleting local dir {local_path}") + files_to_clean = glob.glob(local_path + '**', recursive=True) + [local_path] + + clean_up(context, **kwargs) + +def clean_up(context: DagContext, **kwargs): + input_dir = str(generate_dir_name_from_task_instance(context['ti'], + roger_config=config, + suffix='output')).rstrip('/') + '/' + output_dir = str(generate_dir_name_from_task_instance(context['ti'], + roger_config=config, + suffix='input')).rstrip('/') + '/' + files_to_clean = glob.glob(input_dir + '**', recursive=True) + [input_dir] + files_to_clean += glob.glob(output_dir + '**', recursive=True) + [output_dir] + for f in files_to_clean: + if os.path.exists(f): + shutil.rmtree(f) + +def generate_dir_name_from_task_instance(task_instance: TaskInstance, + roger_config: RogerConfig, suffix:str): + # if lakefs is not enabled just return none so methods default to using + # local dir structure. + if not roger_config.lakefs_config.enabled: + return None + root_data_dir = os.getenv("ROGER_DATA_DIR").rstrip('/') + task_id = task_instance.task_id + dag_id = task_instance.dag_id + run_id = task_instance.run_id + try_number = task_instance._try_number + return Path( + f"{root_data_dir}/{dag_id}_{task_id}_{run_id}_{try_number}_{suffix}") + +def setup_input_data(context, exec_conf): + logger.info(""" + - Figures out the task name and id, + - find its data dependencies + - clean up and create in and out dir + - put dependency data in input dir + - if for some reason data was not found raise an exception + """) + # Serves as a location where files the task will work on are placed. + # computed as ROGER_DATA_DIR + /current task instance name_input_dir + + input_dir = str(generate_dir_name_from_task_instance( + context['ti'], roger_config=config, suffix="input")) + # Clear up files from previous run etc... + + # create input dir + os.makedirs(input_dir, exist_ok=True) + + # Download files from lakefs and store them in this new input_path + client = init_lakefs_client(config=config) + repos = exec_conf['repos'] + # if no external repo is provided we assume to get the upstream task dataset. 
+ if not repos or len(repos) == 0: + # merge destination branch + branch = config.lakefs_config.branch + repo = config.lakefs_config.repo + task_instance: TaskInstance = context['ti'] + # get upstream ids + upstream_ids = task_instance.task.upstream_task_ids + dag_id = task_instance.dag_id + # calculate remote dirs using dag_id + upstreams + repos = [{ + 'repo': repo, + 'branch': branch, + 'path': f'{dag_id}/{upstream_id}' + } for upstream_id in upstream_ids] + + # input_repo = exec_conf['input_repo'] + # input_branch = exec_conf['input_branch'] + # If input repo is provided use that as source of files + for repo in repos: + if not repo.get('path'): + # get all if path is not specified + repo['path'] = '*' + logger.info(f"repos : {repos}") + for r in repos: + logger.info("downloading %s from %s@%s to %s", + r['path'], r['repo'], r['branch'], input_dir) + # create path to download to ... + if not os.path.exists(input_dir + f'/{r["repo"]}'): + os.mkdir(input_dir + f'/{r["repo"]}') + get_files( + local_path=input_dir + f'/{r["repo"]}', + remote_path=r['path'], + branch=r['branch'], + repo=r['repo'], + changes_only=False, + lake_fs_client=client + ) + + +def create_python_task(dag, name, a_callable, func_kwargs=None, external_repos = {}, pass_conf=True, no_output_files=False): + """ Create a python task. + :param func_kwargs: additional arguments for callable. + :param dag: dag to add task to. + :param name: The name of the task. + :param a_callable: The code to run in this task. + """ + + # these are actual arguments passed down to the task function + op_kwargs = { + "python_callable": a_callable, + "to_string": True, + "pass_conf": pass_conf + } + # update / override some of the args passed to the task function by default + if func_kwargs is None: + func_kwargs = {} + op_kwargs.update(func_kwargs) + + + # Python operator arguments , by default for non-lakefs config this is all we need. + python_operator_args = { + "task_id": name, + "python_callable":task_wrapper, + # "executor_config" : get_executor_config(), + "dag": dag, + "provide_context" : True + } + + # if we have lakefs... + if config.lakefs_config.enabled: + + # repo and branch for pre-execution , to download input objects + pre_exec_conf = { + 'repos': [] + } + if external_repos: + # if the task is a root task , beginning of the dag... + # and we want to pull data from a different repo. + pre_exec_conf = { + 'repos': [{ + 'repo': r['name'], + 'branch': r['branch'], + 'path': r.get('path', '*') + } for r in external_repos] + } + + pre_exec = partial(setup_input_data, exec_conf=pre_exec_conf) + # add pre_exec partial function as an argument to python executor conf + python_operator_args['pre_execute'] = pre_exec + python_operator_args['on_failure_callback'] = partial(clean_up, kwargs=op_kwargs) + # if the task has output files, we will add a commit callback + if not no_output_files: + python_operator_args['on_success_callback'] = partial(avalon_commit_callback, kwargs=op_kwargs) + + # add kwargs + python_operator_args["op_kwargs"] = op_kwargs + + return PythonOperator(**python_operator_args) + +def create_pipeline_taskgroup( + dag, + pipeline_class: type, + configparam: RogerConfig, + **kwargs): + """Emit an Airflow dag pipeline for the specified pipeline_class + + Extra kwargs are passed to the pipeline class init call. 
+ """ + name = pipeline_class.pipeline_name + input_dataset_version = pipeline_class.input_version + + with TaskGroup(group_id=f"{name}_dataset_pipeline_task_group") as tg: + with pipeline_class(config=configparam, **kwargs) as pipeline: + pipeline: DugPipeline + annotate_task = create_python_task( + dag, + f"annotate_{name}_files", + pipeline.annotate, + external_repos=[{ + 'name': getattr(pipeline_class, 'pipeline_name'), + 'branch': input_dataset_version + }], + pass_conf=False) + + index_variables_task = create_python_task( + dag, + f"index_{name}_variables", + pipeline.index_variables, + pass_conf=False, + # declare that this task will not generate files. + no_output_files=True) + index_variables_task.set_upstream(annotate_task) + + validate_index_variables_task = create_python_task( + dag, + f"validate_{name}_index_variables", + pipeline.validate_indexed_variables, + pass_conf=False, + # declare that this task will not generate files. + no_output_files=True + ) + validate_index_variables_task.set_upstream([annotate_task, index_variables_task]) + + make_kgx_task = create_python_task( + dag, + f"make_kgx_{name}", + pipeline.make_kg_tagged, + pass_conf=False) + make_kgx_task.set_upstream(annotate_task) + + crawl_task = create_python_task( + dag, + f"crawl_{name}", + pipeline.crawl_tranql, + pass_conf=False) + crawl_task.set_upstream(annotate_task) + + index_concepts_task = create_python_task( + dag, + f"index_{name}_concepts", + pipeline.index_concepts, + pass_conf=False, + # declare that this task will not generate files. + no_output_files=True) + index_concepts_task.set_upstream(crawl_task) + + validate_index_concepts_task = create_python_task( + dag, + f"validate_{name}_index_concepts", + pipeline.validate_indexed_concepts, + pass_conf=False, + # declare that this task will not generate files. 
+ no_output_files=True + ) + validate_index_concepts_task.set_upstream([crawl_task, index_concepts_task, annotate_task]) + + + complete_task = EmptyOperator(task_id=f"complete_{name}") + complete_task.set_upstream( + (make_kgx_task, + validate_index_variables_task, validate_index_concepts_task)) + + return tg diff --git a/dags/test_metadata.yaml b/dags/test_metadata.yaml new file mode 100644 index 00000000..54d508c4 --- /dev/null +++ b/dags/test_metadata.yaml @@ -0,0 +1,124 @@ +# This is a file that lists the data to be used for testing purposes +# It contains a reduced set of the metadata.yaml file +kgx: + versions: + - files: + - biolink-v1.0.json + - ctd-v1.0.json + - gtopdb-v1.0.json + - hetio-v1.0.json + - hgnc-v1.0.json + - hmdb-v1.0.json + - kegg-v1.0.json + - mychem-v1.0.json + - ontological-hierarchy-v1.0.json + - panther-v1.0.json + - foodb-v1.0.json + - pharos-v1.0.json + - intact-v1.0.json + - human-goa-v1.0.json + - uberongraph-v1.0.json + - viral-proteome-v1.0.json + version: v1.0 + name: baseline-graph + format: json + - files: + - biolink-v2.0.json + - ctd-v2.0.json + - gtopdb-v2.0.json + - hetio-v2.0.json + - hgnc-v2.0.json + - hmdb-v2.0.json + - kegg-v2.0.json + - mychem-v2.0.json + - ontological-hierarchy-v2.0.json + - panther-v2.0.json + - foodb-v2.0.json + - pharos-v2.0.json + - intact-v2.0.json + - human-goa-v2.0.json + - uberongraph-v2.0.json + - viral-proteome-v2.0.json + version: v2.0 + name: baseline-graph + format: json + - files: + - heal/sparc/curation-export-processed.json + version: v2.0 + name: sparc-kgx + format: json + - files: + - Biolink_edges_v3.0.jsonl + - Biolink_nodes_v3.0.jsonl + - CTD_edges_v3.0.jsonl + - CTD_nodes_v3.0.jsonl + - DrugCentral_edges_v3.0.jsonl + - DrugCentral_nodes_v3.0.jsonl + - GtoPdb_edges_v3.0.jsonl + - GtoPdb_nodes_v3.0.jsonl + - Hetio_edges_v3.0.jsonl + - Hetio_nodes_v3.0.jsonl + - HGNC_edges_v3.0.jsonl + - HGNC_nodes_v3.0.jsonl + - HMDB_edges_v3.0.jsonl + - HMDB_nodes_v3.0.jsonl + - HumanGOA_edges_v3.0.jsonl + - HumanGOA_nodes_v3.0.jsonl + - IntAct_edges_v3.0.jsonl + - IntAct_nodes_v3.0.jsonl + - OntologicalHierarchy_edges_v3.0.jsonl + - OntologicalHierarchy_nodes_v3.0.jsonl + - PANTHER_edges_v3.0.jsonl + - PANTHER_nodes_v3.0.jsonl + - PHAROS_edges_v3.0.jsonl + - PHAROS_nodes_v3.0.jsonl + - UberGraph_edges_v3.0.jsonl + - UberGraph_nodes_v3.0.jsonl + version: v3.0 + name: baseline-graph + format: jsonl + - version: test + files: + - hgnc_nodes.jsonl + - hgnc_edges.jsonl + name: test + - version: v3.0 + name: cde-graph + format: jsonl + files: + - cde/annotated_edges_v3.0.jsonl + - cde/annotated_nodes_v3.0.jsonl +dug_inputs: + versions: + - name: bdc + version: v1.0 + files: + s3: + - "bdc/v1.0/bdc_dbgap_data_dicts.tar.gz" + stars: + - "bdc_dbgap_data_dicts.tar.gz" + format: dbGaP + - name: nida + version: v1.0 + files: + s3: + - "nida/v1.0/nida-12studies.tar.gz" + stars: + - "nida-12studies.tar.gz" + format: nida + - name: sparc + version: v1.0 + files: + s3: + - "sparc/v1.0/sparc-dbgap-xml-formatted.tar.gz" + stars: + - "sparc-dbgap-xml-formatted.tar.gz" + format: sparc + - name: anvil + version: v1.0 + files: + s3: + - "bdc/v1.0/anvil_dbgap_data_dicts.tar.gz" + stars: + - "anvil_dbgap_data_dicts.tar.gz" + format: anvil \ No newline at end of file diff --git a/dags/utils/__init__.py b/dags/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/dags/utils/s3_utils.py b/dags/utils/s3_utils.py new file mode 100644 index 00000000..f0f7277b --- /dev/null +++ b/dags/utils/s3_utils.py @@ -0,0 +1,45 @@ +from 
contextlib import contextmanager + +import boto3 + +from roger.config import S3Config + + +class S3Utils: + + def __init__( + self, + s3_config: S3Config + ): + self.config = s3_config + + @contextmanager + def connect( + self, + ): + session = boto3.session.Session( + aws_access_key_id=self.config.access_key, + aws_secret_access_key=self.config.secret_key, + ) + + s3 = session.resource( + 's3', + endpoint_url=self.config.host, + ) + bucket = s3.Bucket(self.config.bucket) + yield bucket + + def get(self, remote_file_name: str, local_file_name: str): + with self.connect() as bucket: + bucket.download_file(remote_file_name, local_file_name) + + def put(self, local_file_name: str, remote_file_name: str): + with self.connect() as bucket: + bucket.upload_file(local_file_name, remote_file_name) + + def ls(self): + with self.connect() as bucket: + return [ + obj + for obj in bucket.objects.all() + ] diff --git a/docker-compose.yaml b/docker-compose.yaml new file mode 100644 index 00000000..7c698ed6 --- /dev/null +++ b/docker-compose.yaml @@ -0,0 +1,207 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +# Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. +# +# WARNING: This configuration is for local development. Do not use it in a production deployment. +# +# This configuration supports basic configuration using environment variables or an .env file +# The following variables are supported: +# +# AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. +# Default: apache/airflow:master-python3.8 +# AIRFLOW_UID - User ID in Airflow containers +# Default: 50000 +# AIRFLOW_GID - Group ID in Airflow containers +# Default: 50000 +# _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account. +# Default: airflow +# _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account. +# Default: airflow +# +# Feel free to modify this file to suit your needs. +--- +version: '3' +x-airflow-common: + &airflow-common + build: + dockerfile: Dockerfile + context: . 
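A brief usage sketch for the `S3Utils` wrapper in `dags/utils/s3_utils.py` above. It assumes `S3Config` can be constructed with the same field names the wrapper reads (`access_key`, `secret_key`, `host`, `bucket`) and that `dags/` is on `PYTHONPATH`; the endpoint, bucket, and object keys are placeholders:

```python
# Hypothetical usage of S3Utils; endpoint, bucket, and keys are placeholders.
from roger.config import S3Config
from utils.s3_utils import S3Utils

s3 = S3Utils(S3Config(access_key="ACCESS", secret_key="SECRET",
                      host="https://s3.example.org", bucket="roger-data"))
s3.get("bdc/v1.0/bdc_dbgap_data_dicts.tar.gz",
       "/tmp/bdc_dbgap_data_dicts.tar.gz")   # download one object
print([obj.key for obj in s3.ls()])          # list everything in the bucket
```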
+ environment: + &airflow-common-env + AIRFLOW__CORE__EXECUTOR: CeleryExecutor + AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow + AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow + AIRFLOW__CELERY__BROKER_URL: redis://:$REDIS_PASSWORD@redis:$REDIS_PORT/0 + AIRFLOW__CORE__FERNET_KEY: '' + AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' + AIRFLOW__CORE__LOAD_EXAMPLES: 'false' + + ROGER_DUG__INPUTS_DATA__SETS: "$ROGER_DUG__INPUTS_DATA__SETS" + ROGER_ELASTICSEARCH_HOST: "$ELASTIC_API_HOST" + ROGER_ELASTICSEARCH_PASSWORD: "$ELASTIC_PASSWORD" + ROGER_ELASTICSEARCH_NBOOST__HOST: "$NBOOST_API_HOST" + ROGER_REDISGRAPH_HOST: "$REDIS_HOST" + ROGER_REDISGRAPH_PASSWORD: "$REDIS_PASSWORD" + ROGER_KGX_DATASET__VERSION: "v3.0" + ROGER_DATA_DIR: "/opt/airflow/share/data" + volumes: + - ./dags:/opt/airflow/dags + - ./logs:/opt/airflow/logs + - ./plugins:/opt/airflow/plugins + - ./data:/opt/airflow/share/data + user: root + depends_on: + redis: + condition: service_healthy + postgres: + condition: service_healthy + +services: + postgres: + image: postgres:13 + environment: + POSTGRES_USER: airflow + POSTGRES_PASSWORD: airflow + POSTGRES_DB: airflow + volumes: + - postgres-db-volume:/var/lib/postgresql/data + - ${DATA_DIR}/elastic:/elastic + - ${DATA_DIR}/redis:/redis + healthcheck: + test: ["CMD", "pg_isready", "-U", "airflow"] + interval: 5s + retries: 5 + restart: always + + airflow-webserver: + <<: *airflow-common + command: webserver + ports: + - 8080:8080 + healthcheck: + test: ["CMD", "curl", "--fail", "http://localhost:8080/health"] + interval: 10s + timeout: 10s + retries: 5 + restart: always + + airflow-scheduler: + <<: *airflow-common + command: scheduler + restart: always + + airflow-worker: + <<: *airflow-common + command: celery worker + restart: always + + airflow-init: + <<: *airflow-common + command: version + environment: + <<: *airflow-common-env + _AIRFLOW_DB_UPGRADE: 'true' + _AIRFLOW_WWW_USER_CREATE: 'true' + _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} + _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} + + flower: + <<: *airflow-common + command: celery flower + ports: + - 5555:5555 + healthcheck: + test: ["CMD", "curl", "--fail", "http://localhost:5555/"] + interval: 10s + timeout: 10s + retries: 5 + restart: always + + redis: + # image: redislabs/redisgraph:2.10.9 #Alternative Image + user: root + image: 'redis/redis-stack:6.2.4-v2' + command: "redis-server --requirepass $REDIS_PASSWORD --loadmodule /opt/redis-stack/lib/redisgraph.so" + environment: + - REDIS_ARGS=--requirepass $REDIS_PASSWORD + volumes: + - $DATA_DIR/redis:/data # FIX RDB Error on local + ports: + - $REDIS_PORT:$REDIS_PORT + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 5s + timeout: 30s + retries: 50 + restart: always + + dug: + image: containers.renci.org/helxplatform/dug:latest + depends_on: + - elasticsearch + - redis + restart: always + environment: + ELASTIC_API_HOST: "$ELASTIC_API_HOST" + ELASTIC_PASSWORD: "$ELASTIC_PASSWORD" + REDIS_HOST: "$REDIS_HOST" + REDIS_PASSWORD: "$REDIS_PASSWORD" + FLASK_ENV: "development" + PYTHONUNBUFFERED: "TRUE" + entrypoint: [ "gunicorn", + "--workers=$API_WORKERS", "--name=dug", + "--bind=0.0.0.0:$API_PORT", "--timeout=$API_TIMEOUT", + "--log-level=DEBUG", "-k", "uvicorn.workers.UvicornWorker", "--reload", "dug.server:APP"] + ports: + - $API_PORT:$API_PORT + + elasticsearch: + user: root + image: docker.elastic.co/elasticsearch/elasticsearch:8.5.2 
+ environment: + - ELASTIC_PASSWORD=$ELASTIC_PASSWORD + - discovery.type=single-node + - xpack.security.enabled=true + - ingest.geoip.downloader.enabled=false + volumes: + - $DATA_DIR/elastic:/usr/share/elasticsearch/data + ports: + - '9200:9200' + - '9300:9300' + + tranql: + image: containers.renci.org/helxplatform/tranql:rti-merge + ports: + - '8001:8001' + entrypoint: [ + "gunicorn", + "--workers=4", + "--bind=0.0.0.0:8001", + "--timeout=300", + "--access-logfile=$TRANQL_ACCESS_LOG", + "--error-logfile=$TRANQL_ERROR_LOG", + "--log-level=debug", + "tranql.api:app", + ] + environment: + - REDIS_PASSWORD=$REDIS_PASSWORD + volumes: + - ./tranql-schema.yaml:/tranql/tranql/conf/schema.yaml +volumes: + postgres-db-volume: \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 0fe342a6..8b6ca0b2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,189 +1,14 @@ -alabaster==0.7.12 -alembic==1.4.2 -antlr4-python3-runtime==4.8 -apache-airflow==1.10.12 -apispec==1.3.3 -appnope==0.1.0 -argcomplete==1.12.0 -argon2-cffi==20.1.0 -async-generator==1.10 -attrs==19.3.0 -Babel==2.8.0 -backcall==0.2.0 -biolink-model==1.2.5 -biolinkml==1.5.8 -bleach==3.2.1 -bmt==0.1.1 -cached-property==1.5.1 -cachetools==4.1.1 -cattrs==1.0.0 -certifi==2020.6.20 -cffi==1.14.3 -CFGraph==0.2.1 -chardet==3.0.4 -click==7.1.2 -colorama==0.4.3 -colorlog==4.0.2 -configparser==3.5.3 -croniter==0.3.34 -decorator==4.4.2 -defusedxml==0.6.0 -dill==0.3.2 -dnspython==1.16.0 -docker==4.3.1 -docutils==0.16 -email-validator==1.1.1 -entrypoints==0.3 -env==0.1.0 -Flask==1.1.2 -Flask-Admin==1.5.4 -Flask-AppBuilder==2.3.4 -Flask-Babel==1.0.0 -Flask-Caching==1.3.3 -Flask-JWT-Extended==3.24.1 -Flask-Login==0.4.1 -Flask-OpenID==1.2.5 -Flask-SQLAlchemy==2.4.4 -flask-swagger==0.2.14 -Flask-WTF==0.14.3 -funcsigs==1.0.2 -future==0.18.2 -graphviz==0.14.1 -gunicorn==20.0.4 -idna==2.10 -imagesize==1.2.0 -importlib-metadata==1.7.0 -iniconfig==1.1.1 -ipykernel==5.3.4 -ipython==7.18.1 -ipython-genutils==0.2.0 -ipywidgets==7.5.1 -iso8601==0.1.12 -isodate==0.6.0 -itsdangerous==1.1.0 -jedi==0.17.2 -Jinja2==2.11.2 -json-merge-patch==0.2 -jsonasobj==1.2.1 -jsonlines==1.2.0 -jsonschema==3.2.0 -jupyter==1.0.0 -jupyter-client==6.1.7 -jupyter-console==6.2.0 -jupyter-core==4.6.3 -jupyterlab-pygments==0.1.2 -kgx==0.1.0 -lazy-object-proxy==1.5.1 -lockfile==0.12.2 -Mako==1.1.3 -Markdown==2.6.11 -MarkupSafe==1.1.1 -marshmallow==2.21.0 -marshmallow-enum==1.5.1 -marshmallow-sqlalchemy==0.23.1 -mistune==0.8.4 -mypy==0.790 -mypy-extensions==0.4.3 -natsort==7.0.1 -nbclient==0.5.1 -nbconvert==6.0.7 -nbformat==5.0.8 -neo4jrestclient==2.1.1 -nest-asyncio==1.4.1 -networkx==2.5 -notebook==6.1.4 -numpy==1.19.1 -ordered-set==4.0.2 -packaging==20.4 -pandas==1.1.0 -pandocfilters==1.4.2 -parso==0.7.1 -pathlib==1.0.1 -pathtools==0.1.2 -pbr==5.5.0 -pendulum==1.4.4 -pexpect==4.8.0 -pickleshare==0.7.5 -pluggy==0.13.1 -prefixcommons==0.1.9 -prison==0.1.3 -prologterms==0.0.6 -prometheus-client==0.8.0 -prompt-toolkit==3.0.8 -psutil==5.7.2 -PTable==0.9.2 -ptyprocess==0.6.0 -py==1.9.0 -pycparser==2.20 -Pygments==2.6.1 -PyJSG==0.10.0 -PyJWT==1.7.1 -pyparsing==2.4.7 -pyrsistent==0.16.0 -PyShEx==0.7.14 -PyShExC==0.8.2 -pystache==0.5.4 -pytest==6.1.1 -python-daemon==2.2.4 -python-dateutil==2.8.1 -python-editor==1.0.4 -python-nvd3==0.15.0 -python-slugify==4.0.1 -python3-openid==3.2.0 -pytz==2020.1 -pytzdata==2020.1 -PyYAML==5.3.1 -pyzmq==19.0.2 -qtconsole==4.7.7 -QtPy==1.9.0 -rdflib==5.0.0 -rdflib-jsonld==0.5.0 -redis==3.5.3 -redisgraph==2.1.5 
-redisgraph-bulk-loader==0.9.3 -requests==2.24.0 -Send2Trash==1.5.0 -setproctitle==1.1.10 -ShExJSG==0.7.0 -six==1.15.0 -snowballstemmer==2.0.0 -sparql-slurper==0.3.4 -SPARQLWrapper==1.8.5 -Sphinx==3.2.1 -sphinx-click==2.5.0 -sphinx-rtd-theme==0.5.0 -sphinxcontrib-applehelp==1.0.2 -sphinxcontrib-devhelp==1.0.2 -sphinxcontrib-htmlhelp==1.0.3 -sphinxcontrib-jsmath==1.0.1 -sphinxcontrib-qthelp==1.0.3 -sphinxcontrib-serializinghtml==1.1.4 -SQLAlchemy==1.3.18 -SQLAlchemy-JSONField==0.9.0 -SQLAlchemy-Utils==0.36.8 -stringcase==1.2.0 -tabulate==0.8.7 -tenacity==4.12.0 -terminado==0.9.1 -terminaltables==3.1.0 -testpath==0.4.4 -text-unidecode==1.3 -thrift==0.13.0 -toml==0.10.1 -tornado==6.0.4 -traitlets==5.0.5 -typed-ast==1.4.1 -typing-extensions==3.7.4.2 -tzlocal==1.5.1 -unicodecsv==0.14.1 -urllib3==1.25.10 -validators==0.18.1 -watchdog==0.10.3 -wcwidth==0.2.5 -webencodings==0.5.1 -websocket-client==0.57.0 -Werkzeug==0.16.1 -widgetsnbextension==3.5.1 -WTForms==2.3.3 -zipp==3.1.0 -zope.deprecation==4.4.0 +elasticsearch==8.5.2 +flatten-dict +jsonpickle +git+https://github.com/falkordb/falkordb-bulk-loader.git@v1.0.6 +setuptools>=66 +pytest +PyYAML +git+https://github.com/helxplatform/dug@2.13.11 +orjson==3.9.15 +git+https://github.com/helxplatform/kg_utils.git@v0.0.10 +git+https://github.com/helxplatform/python-stringcase@1.2.1 +bmt==1.4.4 +git+https://github.com/helxplatform/avalon.git@v1.1.0 +h11>=0.16.0 diff --git a/roger-cli-steps.md b/roger-cli-steps.md new file mode 100644 index 00000000..8e132746 --- /dev/null +++ b/roger-cli-steps.md @@ -0,0 +1,27 @@ +# Deployment with Roger CLI + +## QUICK Local Set Up + +This is list steps to produce a local deployment of Roger. This set up does NOT use airflow and instead only uses the Roger CLI via **Makefile** commands. + +### Prerequsite Steps + +- Set up Roger dependencies by ensuring that the `.env` has all the correct information. 
+- Run the following docker compose commands + - `docker compose up tranql -d`: starts up tranql which is the API handlerfor redis graph in the `graph` stage + - `docker compose up redis -d`: starts up redis which will be used via redis graph for the `graph` stage + - `docker compose up dug -d`: starts up dug API to work as the API handler for elastic search in the `index` stage + - `docker compose up elasticsearch -d`: starts up elastic search for the `index` stage + +### Roger CLI Steps + +1) `python3 -m venv ~/.environments/roger` +2) `source ~/.environments/roger/bin/activate` +3) `pip install -r requirements.txt` +4) `export PYTHONPATH=$PWD/dags` +5) Change the elasticsearch and redisgraph `host` values to localhost in `dags/roger/config/config.yaml` +6) Get the S3 Bucket credentials (access_key, bucket, host, secret_key) and export them as environment variables with ROGER_S3_ in the front of the value like: `ROGER_S3_ACCESS__KEY=XXXXKEYXXXX` +7) `cd bin/` and here either run `make all` OR separate the commands into three steps: + 1) `make annotate`: executes the CLI related commands found in `bin/dug_annotate/Makefile` + 2) `make graph`: executes the CLI related commands found in `bin/roger_graph_build/Makefile` + 3) `make index`: executes the CLI related commands found in `bin/dug_index/Makefile` diff --git a/roger/config.yaml b/roger/config.yaml deleted file mode 100644 index f1377a3c..00000000 --- a/roger/config.yaml +++ /dev/null @@ -1,56 +0,0 @@ -redisgraph: - username: "" - password: "" - host: localhost - graph: test - ports: - http: 6379 - -logging: - level: DEBUG - format: '[%(name)s][%(filename)s][%(funcName)20s] %(levelname)s: %(message)s' - -data_root: roger/data -base_data_uri: https://stars.renci.org/var/kgx_data - -#https://github.com/RedisGraph/redisgraph-bulk-loader/blob/master/redisgraph_bulk_loader/bulk_insert.py#L43 -bulk_loader: - separator: "|" - enforce_schema: False - skip_invalid_nodes: False - skip_invalid_edges: False - quote: 0 - max_token_count: 1024 - max_buffer_size: 2048 - max_token_size: 500 - index: [] - full_text_index: [] - -validation: - queries: - count_nodes: - name: "Count Nodes" - query: "MATCH (a) RETURN COUNT(a)" - count_edges: - name: "Count Edges" - query: "MATCH (a)-[e]-(b) RETURN COUNT(e)" - connectivity: - name: TOPMED Connectivity - query: "MATCH (a { id : '$var' })--(b) RETURN a.category, b.id" - args: - - var: TOPMED.TAG:8 - - var: TOPMED.VAR:phv00000484.v1.p10 - - var: TOPMED.VAR:phv00000487.v1.p10 - - var: TOPMED.VAR:phv00000496.v1.p10 - - var: TOPMED.VAR:phv00000517.v1.p10 - - var: TOPMED.VAR:phv00000518.v1.p10 - - var: TOPMED.VAR:phv00000528.v1.p10 - - var: TOPMED.VAR:phv00000529.v1.p10 - - var: TOPMED.VAR:phv00000530.v1.p10 - - var: TOPMED.VAR:phv00000531.v1.p10 - count_connected_nodes: - name: Count Connected Nodes - query: "MATCH (a)-[e]-(b) RETURN count(a), count(b)" - query_by_type: - name: Query by Type - query: "MATCH (a:gene)-[e]-(b) WHERE 'chemical_substance' IN b.category RETURN count(distinct(a)), count(distinct(b))" diff --git a/roger/core.py b/roger/core.py deleted file mode 100644 index 3d354071..00000000 --- a/roger/core.py +++ /dev/null @@ -1,688 +0,0 @@ -import argparse -import glob -import json -import os -import redis -import requests -import shutil -import time -import yaml -import sys -import traceback -from biolink import model -from collections import defaultdict -from enum import Enum -from io import StringIO -from kgx.cli import redisgraph_upload -from roger.roger_util import get_logger, get_config -from 
redisgraph_bulk_loader.bulk_insert import bulk_insert -from roger.roger_db import RedisGraph -from string import Template - -log = get_logger () -config = get_config () -data_root = config['data_root'] - -class SchemaType(Enum): - """ High level semantic metatdata concepts. - Categories are classes in an ontological model like Biolink. - Predicates are links between nodes. """ - CATEGORY = "category" - PREDICATE = "predicate" - -class FileFormat(Enum): - """ File formats this module knows about. """ - JSON = "json" - YAML = "yaml" - -class Util: - - @staticmethod - def current_time_in_millis(): - """ - Get current time in milliseconds. - - Returns - ------- - int - Time in milliseconds - - """ - return int(round(time.time() * 1000)) - - """ A just do it approach to getting data. """ - @staticmethod - def read_file(path): - """ Read a file. - :param path: Path to a file. - """ - text = None - with open(path, "r") as stream: - text = stream.read () - return text - - @staticmethod - def read_url(url): - """ Read data from a URL. - :param url: The URL to read. """ - return requests.get (url).text - - @staticmethod - def read_data(path): - """ Read data from a URL or File. HTTP(S) is the only supported protocol. - :param path: A URL or file path. """ - text = None - if Util.is_web(path): - text = Util.read_url (path) - else: - text = Util.read_file (path) - return text - - @staticmethod - def read_object(path, key=None): - """ Read on object from a path. - :param path: A URL or file path. Supports YAML and JSON depending on extension. - :param key: A configuration key. This is prepended to the path if present. - :raises ValueError: If the key is not in the configuration. """ - if key is not None: - prefix = config[key] - path = f"{prefix}/{path}" if Util.is_web(prefix) \ - else os.path.join (prefix, path) - obj = None - if path.endswith (".yaml") or path.endswith (".yml"): - obj = yaml.safe_load (Util.read_data (path)) - elif path.endswith (".json"): - obj = json.loads (Util.read_data (path)) - return obj - - @staticmethod - def is_web (uri): - """ The URI is a web URI (starts with http or https). - :param uri: A URI """ - return uri.startswith("http://") or uri.startswith ("https://") - - @staticmethod - def write_object (obj, path, key=None): - """ Write an object to a path. YAML and JSON supported based on extension. - :param obj: The object to write. - :param path: The path to write to. - :param key: The configuration key to prepend to the path. - """ - """ Prepend a prefix from the configuration file if a key is given. """ - if key is not None: - prefix = config[key] - path = f"{prefix}/{path}" if Util.is_web(prefix) \ - else os.path.join (prefix, path) - """ Ensure the directory to be written to exists. """ - dirname = os.path.dirname (path) - if not os.path.exists (dirname): - os.makedirs (dirname, exist_ok=True) - """ Write the file in the specified format. """ - if path.endswith (".yaml") or path.endswith (".yml"): - with open(path, 'w') as outfile: - yaml.dump (obj, stream) - elif path.endswith (".json"): - with open (path, "w") as stream: - json.dump (obj, stream, indent=2) - else: - """ Raise an exception if invalid. """ - raise ValueError (f"Unrecognized extension: {path}") - - @staticmethod - def kgx_path (name): - """ Form a KGX object path. - :path name: Name of the KGX object. """ - return os.path.join (data_root, "kgx", name) - - @staticmethod - def kgx_objects (): - """ A list of KGX objects. 
""" - kgx_pattern = Util.kgx_path("**.json") - return sorted(glob.glob (kgx_pattern)) - - @staticmethod - def merge_path (name): - """ Form a merged KGX object path. - :path name: Name of the merged KGX object. """ - return os.path.join (data_root, "merge", name) - - @staticmethod - def merged_objects (): - """ A list of merged KGX objects. """ - merged_pattern = Util.merge_path("**.json") - return sorted(glob.glob (merged_pattern)) - - @staticmethod - def schema_path (name): - """ Path to a schema object. - :param name: Name of the object to get a path for. """ - return os.path.join (data_root, "schema", name) - - @staticmethod - def bulk_path (name): - """ Path to a bulk load object. - :param name: Name of the object. """ - return os.path.join (data_root, "bulk", name) - - @staticmethod - def read_schema (schema_type: SchemaType): - """ Read a schema object. - :param schema_type: Schema type of the object to read. """ - path = Util.schema_path (f"{schema_type.value}-schema.json") - return Util.read_object (path) - - @staticmethod - def get_uri (path, key): - """ Build a URI. - :param path: The path of an object. - :param key: The key of a configuration value to prepend to the object. """ - return f"{config[key]}/{path}" - - @staticmethod - def get_relative_path (path): - return os.path.join (os.path.dirname (__file__), path) - - @staticmethod - def read_relative_object (path): - return Util.read_object (Util.get_relative_path(path)) - - @staticmethod - def trunc(text, limit): - return ('..' + text[-limit-2:]) if len(text) > limit else text - - @staticmethod - def is_up_to_date (source, targets): - target_time_list = [ os.stat (f).st_mtime for f in targets if os.path.exists(f) ] - if len(target_time_list) == 0: - log.debug (f"no targets found") - return False - source = [ os.stat (f).st_mtime for f in source if os.path.exists (f) ] - if len(source) == 0: - log.debug ("no source found. up to date") - return True - return max(source) < min(target_time_list) - -class KGXModel: - """ Abstractions for transforming Knowledge Graph Exchange formatted data. """ - def __init__(self, biolink): - self.biolink = biolink - - def get (self, dataset_version = "v0.1"): - """ Read metadata for edge and node files, then join them into whole KGX objects - containing both nodes and edges. - :param dataset_version: Data version to operate on. - """ - metadata = Util.read_relative_object ("metadata.yaml") - for item in metadata['versions']: - if item['version'] == dataset_version: - for edge_url in item['edgeFiles']: - start = Util.current_time_in_millis () - edge_url = Util.get_uri (edge_url, "base_data_uri") - node_url = edge_url.replace ("-edge-", "-node-") - subgraph_basename = os.path.basename (edge_url.replace ("-edge", "")) - subgraph_path = Util.kgx_path (subgraph_basename) - if os.path.exists (subgraph_path): - log.info (f"cached kgx: {subgraph_path}") - continue - subgraph = { - "edges" : Util.read_object (edge_url), - "nodes" : Util.read_object (node_url) - } - Util.write_object (subgraph, subgraph_path) - total_time = Util.current_time_in_millis () - start - - edges = len(subgraph['edges']) - nodes = len(subgraph['nodes']) - log.debug ("wrote {:>45}: edges:{:>7} nodes: {:>7} time:{:>8}".format ( - Util.trunc(subgraph_path, 45), edges, nodes, total_time)) - - def create_schema (self): - """ - Determine the schema of each type of object. We have to do this to make it possible - to write tabular data. Need to know all possible columns in advance and correct missing - fields. 
- """ - if self.schema_up_to_date(): - log.info (f"schema is up to date.") - return - - predicate_schemas = defaultdict(lambda:None) - category_schemas = defaultdict(lambda:None) - for subgraph in Util.kgx_objects (): - """ Read a kgx data file. """ - log.debug (f"analyzing schema of {subgraph}.") - basename = os.path.basename (subgraph).replace (".json", "") - graph = Util.read_object (subgraph) - """ Infer predicate schemas. """ - for edge in graph['edges']: - predicate = edge['edge_label'] - if not predicate in predicate_schemas: - predicate_schemas[predicate] = edge - for k in edge.keys (): - edge[k] = '' - else: - for k in edge.keys (): - if not k in predicate_schemas[predicate]: - predicate_schemas[predicate][k] = '' - """ Infer node schemas. """ - for node in graph['nodes']: - node_type = self.biolink.get_leaf_class (node['category']) - if not node_type in category_schemas: - category_schemas[node_type] = node - for k in node.keys (): - node[k] = '' - else: - for k in node.keys (): - if not k in category_schemas[node_type]: - category_schemas[node_type][k] = '' - """ Write node and predicate schemas. """ - self.write_schema (predicate_schemas, SchemaType.PREDICATE) - self.write_schema (category_schemas, SchemaType.CATEGORY) - - def schema_up_to_date (self): - return Util.is_up_to_date ( - source=Util.kgx_objects (), - targets=[ - Util.schema_path (f"{SchemaType.PREDICATE.value}-schema.json"), - Util.schema_path (f"{SchemaType.PREDICATE.value}-schema.json") - ]) - - def write_schema (self, schema, schema_type: SchemaType): - """ Output the schema file. - :param schema: Schema to get keys from. - :param schema_type: Type of schema to write. """ - file_name = Util.schema_path (f"{schema_type.value}-schema.json") - log.info (f"writing schema: {file_name}") - dictionary = { k : self.format_keys(v.keys(), schema_type) for k, v in schema.items () } - Util.write_object (dictionary, file_name) - - def merge_nodes (self, L, R): - for k in L.keys (): - R_v = R.get (k, None) - if R_v == '' or R_v == None: - L[k] = R_v - - def diff_lists (self, L, R): - return list(list(set(L)-set(R)) + list(set(R)-set(L))) - - def merge (self): - """ Merge nodes. Would be good to have something less computationally intensive. 
""" - for path in Util.kgx_objects (): - new_path = path.replace ('/kgx/', '/merge/') - - source_stats = os.stat (path) - if os.path.exists (new_path): - dest_stats = os.stat (new_path) - if dest_stats.st_mtime > source_stats.st_mtime: - log.info (f"merge {new_path} is up to date.") - continue - - log.info (f"merging {path}") - graph = Util.read_object (path) - graph_nodes = graph.get ('nodes', []) - graph_map = { n['id'] : n for n in graph_nodes } - graph_keys = graph_map.keys () - total_merge_time = 0 - for path_2 in Util.kgx_objects (): - if path_2 == path: - continue - start = Util.current_time_in_millis () - other_graph = Util.read_object (path_2) - load_time = Util.current_time_in_millis () - start - - start = Util.current_time_in_millis () - other_nodes = other_graph.get('nodes', []) - other_map = { n['id'] : n for n in other_nodes } - other_keys = set(other_map.keys()) - intersection = [ v for v in graph_keys if v in other_keys ] - difference = list(set(other_keys) - set(graph_keys)) - scope_time = Util.current_time_in_millis () - start - - start = Util.current_time_in_millis () - for i in intersection: - self.merge_nodes (graph_map[i], other_map[i]) - other_graph['nodes'] = [ other_map[i] for i in difference ] - merge_time = Util.current_time_in_millis () - start - - start = Util.current_time_in_millis () - Util.write_object (other_graph, path_2.replace ('kgx', 'merge')) - write_time = Util.current_time_in_millis () - start - log.debug ("merged {:>45} load:{:>5} scope:{:>7} merge:{:>3}".format( - Util.trunc(path_2, 45), load_time, scope_time, merge_time)) - total_merge_time += load_time + scope_time + merge_time + write_time - - start = Util.current_time_in_millis () - Util.write_object (graph, new_path) - rewrite_time = Util.current_time_in_millis () - start - log.info (f"{path} rewrite: {rewrite_time}. total merge time: {total_merge_time}") - - def format_keys (self, keys, schema_type : SchemaType): - """ Format schema keys. Make source and destination first in edges. Make - id first in nodes. Remove keys for fields we can't yet represent. - :param keys: List of keys. - :param schema_type: Type of schema to conform to. - """ - """ Sort keys. """ - k_list = sorted(keys) - if schema_type == SchemaType.PREDICATE: - """ Rename subject and object to src and dest """ - k_list.remove ('subject') - k_list.remove ('object') - k_list.insert (0, 'src') - k_list.insert (1, 'dest') - elif schema_type == SchemaType.CATEGORY: - """ Make id the first field. Remove smiles. It causes ast parse errors. - TODO: update bulk loader to ignore AST on selected fields. - """ - k_list.remove ('id') - if 'simple_smiles' in k_list: - k_list.remove ('simple_smiles') - k_list.insert (0, 'id') - return k_list - - def load (self): - """ Use KGX to load a data set into Redisgraph """ - input_format = "json" - uri = f"redis://{config['redisgraph']['host']}:{config['redisgraph']['ports']['http']}/" - username = config['redisgraph']['username'] - password = config['redisgraph']['password'] - log.info (f"connecting to redisgraph: {uri}") - for subgraph in glob.glob (f"{kgx_repo}/**.json"): - redisgraph_upload(inputs=[ subgraph ], - input_format=input_format, - input_compression=None, - uri=uri, - username=username, - password=password, - node_filters=[], - edge_filters=[]) - -class BiolinkModel: - """ Programmatic model of Biolink. """ - def to_camel_case(self, snake_str): - """ Convert a snake case string to camel case. 
""" - components = snake_str.split('_') - return ''.join(x.title() for x in components) - - def get_class(self, name): - """ Get a Python class from a string name. """ - return getattr(sys.modules["biolink.model"], name) - - def is_derived (self, a_class_name, classes): - """ Return true if the class derives from any of the provided classes. """ - for c in classes: - if isinstance (self.get_class(self.to_camel_case(a_class_name)), c): - return True - return False - - def get_leaf_class (self, names): - """ Return the leaf classes in the provided list of names. """ - classes = [ self.get_class(self.to_camel_case(n)) for n in names ] - leaves = [ n for n in names if not self.is_derived (n, classes) ] - return leaves [0] - -class BulkLoad: - """ Tools for creating a Redisgraph bulk load dataset. """ - def __init__(self, biolink): - self.biolink = biolink - - def tables_up_to_date (self): - return Util.is_up_to_date ( - source=[ - Util.schema_path (f"{SchemaType.PREDICATE.value}-schema.json"), - Util.schema_path (f"{SchemaType.PREDICATE.value}-schema.json") - ] + Util.merged_objects (), - targets=glob.glob (Util.bulk_path ("nodes/**.csv")) + \ - glob.glob (Util.bulk_path ("edges/**.csv"))) - - def create (self): - """ Check source times. """ - if self.tables_up_to_date (): - log.info ("up to date.") - return - - """ Format the data for bulk load. """ - predicates_schema = Util.read_schema (SchemaType.PREDICATE) - categories_schema = Util.read_schema (SchemaType.CATEGORY) - bulk_path = Util.bulk_path("") - if os.path.exists(bulk_path): - shutil.rmtree(bulk_path) - - state = defaultdict(lambda:None) - for subgraph in Util.merged_objects (): - log.info (f"processing {subgraph}") - graph = Util.read_object (subgraph) - - """ Write node data for bulk load. """ - categories = defaultdict(lambda: []) - for node in graph['nodes']: - index = self.biolink.get_leaf_class (node['category']) - categories[index].append (node) - self.write_bulk (Util.bulk_path("nodes"), categories, categories_schema, - state=state, f=subgraph) - - """ Write predicate data for bulk load. """ - predicates = defaultdict(lambda: []) - for edge in graph['edges']: - predicates[edge['edge_label']].append (edge) - edge['src'] = edge.pop ('subject') - edge['dest'] = edge.pop ('object') - self.write_bulk (Util.bulk_path("edges"), predicates, predicates_schema) - - def cleanup (self, v): - """ Filter problematic text. - :param v: A value to filter and clean. - """ - if isinstance(v, list): - v = [ self.cleanup(val) for val in v ] - elif isinstance (v, str): - """ Some values contain the CSV separator character. 'fix' that. """ - if len(v) > 1 and v[0] == '[' and v[-1] == ']': - v = v.replace ("[", "@").replace ("]", "@") #f" {v}" - v = v.replace ("|","^") - return v - - def write_bulk (self, bulk_path, obj_map, schema, state={}, f=None): - """ Write a bulk load group of objects. - :param bulk_path: Path to the bulk loader object to write. - :param obj_map: A map of biolink type to list of objects. - :param schema: The schema (nodes or predicates) containing identifiers. - :param state: Track state of already written objects to avoid duplicates. 
- """ - os.makedirs (bulk_path, exist_ok=True) - for key, objects in obj_map.items (): - out_file = f"{bulk_path}/{key}.csv" - if len(objects) == 0: - continue - new_file = not os.path.exists (out_file) - all_keys = schema[key] - with open (out_file, "a") as stream: - if new_file: - log.info (f" --creating {out_file}") - stream.write ("|".join (all_keys)) - stream.write ("\n") - """ Make all objects conform to the schema. """ - for obj in objects: - for akey in all_keys: - if not akey in obj: - obj[akey] = "" - """ Write fields, skipping duplicate objects. """ - for obj in objects: - oid = str(obj['id']) - if oid in state: - continue - state[oid] = oid - values = [ self.cleanup(obj[k]) for k in all_keys if not 'smiles' in k ] - clean = list(map(str, values)) - s = "|".join (clean) - stream.write (s) - stream.write ("\n") - - def insert (self): - redisgraph = config.get('redisgraph', {}) - bulk_loader = config.get('bulk_loader', {}) - nodes = sorted(glob.glob (Util.bulk_path ("nodes/**.csv"))) - edges = sorted(glob.glob (Util.bulk_path ("edges/**.csv"))) - graph = redisgraph['graph'] - log.info (f"bulk loading \n nodes: {nodes} \n edges: {edges}") - print (f"bulk loading \n nodes: {nodes} \n edges: {edges}") - - try: - log.info (f"deleting graph {graph} in preparation for bulk load.") - db = self.get_redisgraph (redisgraph) - db.redis_graph.delete () - except redis.exceptions.ResponseError: - log.info ("no graph to delete") - - log.info (f"bulk loading graph: {graph}") - args = [] - if len(nodes) > 0: - args.extend (("-n " + " -n ".join (nodes)).split ()) - if len(edges) > 0: - args.extend (("-r " + " -r ".join (edges)).split ()) - args.extend ([ "--separator=|" ]) - args.extend ([ redisgraph['graph'] ]) - """ standalone_mode=False tells click not to sys.exit() """ - bulk_insert (args, standalone_mode=False) - - def get_redisgraph (self, redisgraph): - return RedisGraph (host=redisgraph['host'], - port=redisgraph['ports']['http'], - graph=redisgraph['graph']) - - def validate (self): - redisgraph = config.get('redisgraph', {}) - print (f"config:{json.dumps(redisgraph, indent=2)}") - db = self.get_redisgraph (redisgraph) - validation_queries = config.get('validation', {}).get('queries', []) - for key, query in validation_queries.items (): - text = query['query'] - name = query['name'] - args = query.get('args', [{}]) - for arg in args: - start = Util.current_time_in_millis () - instance = Template (text).safe_substitute (arg) - db.query (instance) - duration = Util.current_time_in_millis () - start - log.info (f"Query {key}:{name} ran in {duration}ms: {instance}") - -class Roger: - """ Consolidate Roger functionality for a cleaner interface. """ - - def __init__(self, to_string=False): - """ Initialize. - :param to_string: Log messages to a string, available as self.log_stream.getvalue() - after execution completes. - """ - import logging - if to_string: - """ Add a stream handler to enable to_string. """ - self.log_stream = StringIO() - self.string_handler = logging.StreamHandler (self.log_stream) - log.addHandler (self.string_handler) - self.biolink = BiolinkModel () - self.kgx = KGXModel (self.biolink) - self.bulk = BulkLoad (self.biolink) - - def __enter__(self): - """ Implement Python's Context Manager interface. """ - return self - - def __exit__(self, exception_type, exception_value, traceback): - """ Implement Python's Context Manager interface. We use this finalizer - to detach the stream handler appended in the constructor. 
- :param exception_type: Type of exception, if one occurred. - :param exception_value: The exception, if one occurred. - :param traceback: The stack trace explaining the exception. - """ - if exception_type or exception_value or traceback: - log.error ("{} {} {}".format (exception_type, exception_value, traceback)) - log.removeHandler (self.string_handler) - -class RogerUtil: - """ An interface abstracting Roger's inner workings to make it easier to - incorporate into external tools like workflow engines. """ - @staticmethod - def get_kgx (to_string=False): - output = None - with Roger (to_string) as roger: - roger.kgx.get () - output = roger.log_stream.getvalue () if to_string else None - return output - - @staticmethod - def create_schema (to_string=False): - output = None - with Roger (to_string) as roger: - roger.kgx.create_schema () - output = roger.log_stream.getvalue () if to_string else None - return output - - @staticmethod - def merge_nodes (to_string=False): - output = None - with Roger (to_string) as roger: - roger.kgx.merge () - output = roger.log_stream.getvalue () if to_string else None - return output - - @staticmethod - def create_bulk_load (to_string=False): - output = None - with Roger (to_string) as roger: - roger.bulk.create () - output = roger.log_stream.getvalue () if to_string else None - return output - - @staticmethod - def bulk_load (to_string=False): - output = None - with Roger (to_string) as roger: - roger.bulk.insert () - output = roger.log_stream.getvalue () if to_string else None - return output - - @staticmethod - def validate (to_string=False): - output = None - with Roger (to_string) as roger: - roger.bulk.validate () - output = roger.log_stream.getvalue () if to_string else None - return output - -if __name__ == "__main__": - """ Roger CLI. 
""" - parser = argparse.ArgumentParser(description='Roger') - parser.add_argument('-v', '--dataset-version', help="Dataset version.", default="v0.1") - parser.add_argument('-d', '--data-root', help="Root of data hierarchy", default=None) - parser.add_argument('-g', '--get-kgx', help="Get KGX objects", action='store_true') - parser.add_argument('-l', '--load-kgx', help="Load via KGX", action='store_true') - parser.add_argument('-s', '--create-schema', help="Infer schema", action='store_true') - parser.add_argument('-m', '--merge-kgx', help="Merge KGX nodes", action='store_true') - parser.add_argument('-b', '--create-bulk', help="Create bulk load", action='store_true') - parser.add_argument('-i', '--insert', help="Do the bulk insert", action='store_true') - parser.add_argument('-a', '--validate', help="Validate the insert", action='store_true') - args = parser.parse_args () - - biolink = BiolinkModel () - kgx = KGXModel (biolink) - bulk = BulkLoad (biolink) - if args.data_root is not None: - data_root = get_config()['data_root'] = args.data_root - log.info (f"data root:{data_root}") - if args.get_kgx: - kgx.get (dataset_version=args.dataset_version) - if args.load_kgx: - kgx.load () - if args.merge_kgx: - kgx.merge () - if args.create_schema: - kgx.create_schema () - if args.create_bulk: - bulk.create () - if args.insert: - bulk.insert () - if args.validate: - bulk.validate () - - sys.exit (0) diff --git a/roger/metadata.yaml b/roger/metadata.yaml deleted file mode 100644 index 1b56206e..00000000 --- a/roger/metadata.yaml +++ /dev/null @@ -1,37 +0,0 @@ -versions: -- edgeFiles: - # - biolink_kgx-edge-v0.1.json - - chembio_kgx-edge-v0.1.json - - chemical_normalization-edge-v0.1.json - - cord19-phenotypes-edge-v0.1.json -# - cord19-scibite-edge-v0.1.json -# - cord19-scigraph-edge-v0.1.json - - ctd-edge-v0.1.json - - foodb-edge-v0.1.json -# - kegg-edge-v0.1.json - - mychem-edge-v0.1.json -# - panther-edge-v0.1.json - - pharos-edge-v0.1.json - - topmed-edge-v0.1.json - nodeFiles: - - biolink_kgx-node-v0.1.json - - chembio_kgx-node-v0.1.json - - chemical_normalization-node-v0.1.json - - cord19-phenotypes-node-v0.1.json - - cord19-scibite-node-v0.1.json - - cord19-scigraph-node-v0.1.json - - ctd-node-v0.1.json - - foodb-node-v0.1.json - # - kegg-node-v0.1.json - - mychem-node-v0.1.json - - panther-node-v0.1.json - - pharos-node-v0.1.json - - topmed-node-v0.1.json - version: v0.1 -- version: test - edgeFiles: - - cord19-phenotypes-edge-v0.1.json - - chembio_kgx-edge-v0.1.json - nodeFiles: - - cord19-phenotypes-node-v0.1.json - - chembio_kgx-node-v0.1.json diff --git a/roger/roger_util.py b/roger/roger_util.py deleted file mode 100644 index 35c5f3f7..00000000 --- a/roger/roger_util.py +++ /dev/null @@ -1,58 +0,0 @@ -import logging -import requests -import sys -import yaml -from os import path -from typing import Dict, Any, Optional - -config: Optional[Dict[str, Any]] = None -logger: Optional[logging.Logger] = None - -CONFIG_FILENAME = path.join(path.dirname(path.abspath(__file__)), 'config.yaml') - -def get_config(filename: str = CONFIG_FILENAME) -> dict: - """ - Get config as a dictionary - - Parameters - ---------- - filename: str - The filename with all the configuration - - Returns - ------- - dict - A dictionary containing all the entries from the config YAML - - """ - global config - if config is None: - config = yaml.load(open(filename), Loader=yaml.FullLoader) - return config - -def get_logger(name: str = 'roger') -> logging.Logger: - """ - Get an instance of logger. 
- - Parameters - ---------- - name: str - The name of logger - - Returns - ------- - logging.Logger - An instance of logging.Logger - - """ - global logger - if logger is None: - config = get_config() - logger = logging.getLogger(name) - handler = logging.StreamHandler(sys.stdout) - formatter = logging.Formatter(config['logging']['format']) - handler.setFormatter(formatter) - logger.addHandler(handler) - logger.setLevel(config['logging']['level']) - logger.propagate = False - return logger diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py new file mode 100644 index 00000000..32e25657 --- /dev/null +++ b/tests/integration/__init__.py @@ -0,0 +1,3 @@ +from pathlib import Path + +TEST_DATA_DIR = (Path(__file__).parent / 'data').resolve() \ No newline at end of file diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py new file mode 100644 index 00000000..d0a68d1c --- /dev/null +++ b/tests/integration/conftest.py @@ -0,0 +1,76 @@ +import os + +from roger.core.enums import SchemaType +import json + +class BiolinkMock: + def __init__(self): + self.leafs = [ + 'chemical_substance', + 'molecular_activity', + 'gene', + 'biological_process', + 'disease', + 'phenotypic_feature' + ] + + def get_leaf_class(self, class_names): + for y in self.leafs: + if y in class_names: + return y + return class_names[0] + + def find_biolink_leaves(self, biolink_concepts): + return set([concept for concept in biolink_concepts + if concept in ['named_thing', 'some_other_type']]) + +category = None +predicates = None +file_content_assertions = {} +kgx_files = [] +merged_files = [] +merge_file_test_dir = '' +schema = { + SchemaType.PREDICATE: {}, + SchemaType.CATEGORY: {} +} + +def kgx_objects(): + return [os.path.join(*os.path.split(__file__)[:-1], 'data', file) + for file in kgx_files] + +def merged_objects(): + return [os.path.join(*os.path.split(__file__)[:-1], 'data', file) + for file in merged_files] + +def bulk_path(*args, **kwargs): + return os.path.join(*os.path.split(__file__)[:-1], 'data', 'bulk') + +def is_up_to_date(*args, **kwargs): + return False + +def schema_path(name, *args, **kwargs): + return name + +def read_schema(schema_type: SchemaType, *args, **kwargs): + return conftest.schema[schema_type] + +def read_object(path, *args, **kwargs): + import json + with open(path) as f: + return json.load(f) + +def write_object(dictionary, file_name): + print(dictionary, file_name) + print(file_content_assertions) + assert file_content_assertions[file_name] == dictionary + +def merge_path(file_name): + return os.path.join(*os.path.split(__file__)[:-1], 'data', 'merge', + merge_file_test_dir, file_name) + +def json_line_iter(jsonl_file_path): + f = open(file=jsonl_file_path, mode='r') + for line in f: + yield json.loads(line) + f.close() diff --git a/tests/integration/data/merge/conflicting_prop_types__edges__schema__kgx/edges.jsonl b/tests/integration/data/merge/conflicting_prop_types__edges__schema__kgx/edges.jsonl new file mode 100644 index 00000000..e0477a06 --- /dev/null +++ b/tests/integration/data/merge/conflicting_prop_types__edges__schema__kgx/edges.jsonl @@ -0,0 +1,2 @@ +{"id": "edge_1", "edge_label": "edge_type_1", "list_vs_str": [], "list_vs_int": [], "list_vs_bool": [], "list_vs_float": [], "str_vs_float": "", "str_vs_bool": "", "str_vs_int": "", "int_vs_bool": 0, 
"int_vs_float": 0, "float_vs_bool": 0, "predicate": "related_to"} +{"id": "edge_2", "edge_label": "edge_type_1", "list_vs_str": "", "list_vs_int": 0, "list_vs_bool": true, "list_vs_float": 0.0, "str_vs_float": 0.0, "str_vs_bool": false, "str_vs_int": 0, "int_vs_bool": true, "int_vs_float": 0.0, "float_vs_bool": true, "predicate": "related_to" } \ No newline at end of file diff --git a/tests/integration/data/merge/conflicting_prop_types__edges__schema__kgx/expected.json b/tests/integration/data/merge/conflicting_prop_types__edges__schema__kgx/expected.json new file mode 100644 index 00000000..49c8e2b3 --- /dev/null +++ b/tests/integration/data/merge/conflicting_prop_types__edges__schema__kgx/expected.json @@ -0,0 +1,20 @@ +{ + "predicate-schema.json": { + "related_to": { + "id": "str", + "edge_label": "str", + "list_vs_str": "list", + "list_vs_int": "list", + "list_vs_bool": "list", + "list_vs_float": "list", + "str_vs_float": "str", + "str_vs_bool": "str", + "str_vs_int": "str", + "int_vs_bool": "str", + "int_vs_float": "str", + "float_vs_bool": "str", + "predicate": "str" + } + }, + "category-schema.json": {} +} \ No newline at end of file diff --git a/tests/integration/data/merge/conflicting_prop_types__edges__schema__kgx/nodes.jsonl b/tests/integration/data/merge/conflicting_prop_types__edges__schema__kgx/nodes.jsonl new file mode 100644 index 00000000..e69de29b diff --git a/tests/integration/data/merge/conflicting_prop_types__nodes__schema__kgx/edges.jsonl b/tests/integration/data/merge/conflicting_prop_types__nodes__schema__kgx/edges.jsonl new file mode 100644 index 00000000..e69de29b diff --git a/tests/integration/data/merge/conflicting_prop_types__nodes__schema__kgx/expected.json b/tests/integration/data/merge/conflicting_prop_types__nodes__schema__kgx/expected.json new file mode 100644 index 00000000..7d7979b5 --- /dev/null +++ b/tests/integration/data/merge/conflicting_prop_types__nodes__schema__kgx/expected.json @@ -0,0 +1,19 @@ +{ + "category-schema.json": { + "named_thing": { + "id": "str", + "category": "list", + "list_vs_str": "list", + "list_vs_int": "list", + "list_vs_bool": "list", + "list_vs_float": "list", + "str_vs_float": "str", + "str_vs_bool": "str", + "str_vs_int": "str", + "int_vs_bool": "str", + "int_vs_float": "str", + "float_vs_bool": "str" + } + }, + "predicate-schema.json": {} +} \ No newline at end of file diff --git a/tests/integration/data/merge/conflicting_prop_types__nodes__schema__kgx/nodes.jsonl b/tests/integration/data/merge/conflicting_prop_types__nodes__schema__kgx/nodes.jsonl new file mode 100644 index 00000000..ed5aae5d --- /dev/null +++ b/tests/integration/data/merge/conflicting_prop_types__nodes__schema__kgx/nodes.jsonl @@ -0,0 +1,2 @@ +{"id": "node_1", "category": ["named_thing"], "list_vs_str": [], "list_vs_int": [], "list_vs_bool": [], "list_vs_float": [], "str_vs_float": "", "str_vs_bool": "", "str_vs_int": "", "int_vs_bool": 0, "int_vs_float": 0, "float_vs_bool": 0} +{"id": "node_1", "category": ["named_thing"], "list_vs_str": "", "list_vs_int": 0, "list_vs_bool": true, "list_vs_float": 0.0, "str_vs_float": 0.0, "str_vs_bool": false, "str_vs_int": 0, "int_vs_bool": true, "int_vs_float": 0.0, "float_vs_bool": true} \ No newline at end of file diff --git a/tests/integration/data/merge/non_conflicting_prop_types__schema__kgx/edges.jsonl b/tests/integration/data/merge/non_conflicting_prop_types__schema__kgx/edges.jsonl new file mode 100644 index 00000000..63ab7769 --- /dev/null +++ 
b/tests/integration/data/merge/non_conflicting_prop_types__schema__kgx/edges.jsonl @@ -0,0 +1,4 @@ +{"edge_label": "edge_label_1", "id": "x", "bool_attr": false, "int_attr": 0, "float_attr": 0.0 , "predicate": "edge_label_1"} +{"edge_label": "edge_label_1", "id": "x3", "str_attr": "str", "list_attr": [], "predicate": "edge_label_1"} +{"edge_label": "edge_label_2", "id": "x4", "str_attr": "str", "predicate": "edge_label_2"} +{"edge_label": "edge_label_2", "id": "x3", "bool_attr": true, "float_attr": 2.33, "int_attr": 3092, "str_att": "name", "predicate": "edge_label_2"} \ No newline at end of file diff --git a/tests/integration/data/merge/non_conflicting_prop_types__schema__kgx/expected.json b/tests/integration/data/merge/non_conflicting_prop_types__schema__kgx/expected.json new file mode 100644 index 00000000..6fcf9cd5 --- /dev/null +++ b/tests/integration/data/merge/non_conflicting_prop_types__schema__kgx/expected.json @@ -0,0 +1,43 @@ +{ + "category-schema.json": { + "named_thing": { + "str_attr": "str", + "list_attr": "list", + "bool_attr": "bool", + "int_attr": "int", + "float_attr": "float", + "id": "str", + "category": "list" + }, + "some_other_type": { + "id": "str", + "category": "list", + "attr_1": "str", + "attr_2": "list", + "attr_3": "bool", + "attr_4": "int" + } + }, + "predicate-schema.json": { + "edge_label_1": { + "id": "str", + "edge_label": "str", + "str_attr": "str", + "list_attr": "list", + "bool_attr": "bool", + "int_attr": "int", + "float_attr": "float", + "predicate": "str" + }, + "edge_label_2": { + "id": "str", + "str_attr": "str", + "edge_label": "str", + "bool_attr": "bool", + "float_attr": "float", + "int_attr": "int", + "str_att": "str", + "predicate": "str" + } + } +} \ No newline at end of file diff --git a/tests/integration/data/merge/non_conflicting_prop_types__schema__kgx/nodes.jsonl b/tests/integration/data/merge/non_conflicting_prop_types__schema__kgx/nodes.jsonl new file mode 100644 index 00000000..1670a2be --- /dev/null +++ b/tests/integration/data/merge/non_conflicting_prop_types__schema__kgx/nodes.jsonl @@ -0,0 +1,3 @@ +{"id": "ID1", "category": ["named_thing"], "list_attr": [], "bool_attr": false, "int_attr": 0} +{"id": "ID2", "category": ["named_thing"], "str_attr": "", "float_attr": 0.0} +{"id": "Id3", "category": ["some_other_type"], "attr_1": "", "attr_2": [], "attr_3": true, "attr_4": 1} \ No newline at end of file diff --git a/tests/integration/test_KGX_Model.py b/tests/integration/test_KGX_Model.py new file mode 100644 index 00000000..4fc5716f --- /dev/null +++ b/tests/integration/test_KGX_Model.py @@ -0,0 +1,42 @@ +import json +import pytest +from unittest.mock import patch + +from roger.models.kgx import KGXModel +from . 
import conftest + + +@pytest.fixture +def kgx_model(): + biolink = conftest.BiolinkMock() + kgx_model = KGXModel(biolink=biolink, config={}) + return kgx_model + +def setup_mock_and_run_create_schema(test_files_dir, kgx_model: KGXModel): + + with patch('roger.models.kgx.storage', conftest): + conftest.merge_file_test_dir = test_files_dir + with open(conftest.merge_path("expected.json")) as f: + expected = json.load(f) + conftest.file_content_assertions = expected + kgx_model.create_schema() + +def test_create_schema_plain(kgx_model: KGXModel): + file_name = 'non_conflicting_prop_types__schema__kgx' + setup_mock_and_run_create_schema(file_name, kgx_model=kgx_model) + +def test_create_schema_conflicting_nodes(kgx_model: KGXModel): + file_name = 'conflicting_prop_types__nodes__schema__kgx' + setup_mock_and_run_create_schema(file_name, kgx_model=kgx_model) + +def test_create_schema_conflicting_edges(kgx_model: KGXModel): + file_name = 'conflicting_prop_types__edges__schema__kgx' + setup_mock_and_run_create_schema(file_name, kgx_model=kgx_model) + +def test_merge(kgx_model: KGXModel): + with patch('roger.models.kgx.storage', conftest): + conftest.kgx_files = [ + 'data_1.merge.kgx.json', + 'data_2.merge.kgx.json' + ] + #TODO add tests for merge nodes diff --git a/tests/integration/test_bulk_loader.py b/tests/integration/test_bulk_loader.py new file mode 100644 index 00000000..ae99573f --- /dev/null +++ b/tests/integration/test_bulk_loader.py @@ -0,0 +1,111 @@ +import pytest +from unittest.mock import patch + +from roger.core import BulkLoad +from . import conftest + + +@pytest.fixture +def bulk_loader(): + biolink = conftest.BiolinkMock() + return BulkLoad(biolink=biolink, config={'separator': 30}) + + +def test_create_redis_schema(): + test_schema = { + 'concept': { + 'attribute0': 'list', + 'attribute1': 'str', + 'attribute2': 'int', + 'attribute3': 'bool' + } + } + redis_schema = BulkLoad.create_redis_schema_header(test_schema['concept'], is_relation=False) + assert 'attribute0:ARRAY' in redis_schema + assert 'attribute1:STRING' in redis_schema + assert 'attribute2:INT' in redis_schema + assert 'attribute3:BOOL' in redis_schema + + redis_schema = BulkLoad.create_redis_schema_header(test_schema['concept'], is_relation=True) + assert 'attribute0:ARRAY' in redis_schema + assert 'attribute1:STRING' in redis_schema + assert 'attribute2:INT' in redis_schema + assert 'attribute3:BOOL' in redis_schema + + # should add these columns to relationships + assert 'internal_start_id:START_ID' in redis_schema + assert 'internal_end_id:END_ID' in redis_schema + + +def test_group_by_set_attr(): + items = [ + { # we need to make sure that empty values are the only ones ignored + # not values that evaluate to false. 
+ 'id': 0, + 'attr_1': '', + 'attr_2': 2, + 'attr_3': [], + 'attr_4': False, + 'attr_5': None + }, + { + 'id': 1, + 'attr_1': 'a', + 'attr_2': 'b', + 'attr_3': 'c', + 'attr_4': '' + } + ] + # first group is attr_2, attr_4, 'id' + group_1 = frozenset(['attr_2', 'attr_4', 'id']) + # second group is attr_1, attr_2, attr_3 , 'id' + group_2 = frozenset(['attr_1', 'attr_2', 'attr_3', 'id']) + grouping, invalid_keys = BulkLoad.group_items_by_attributes_set(objects=items, + processed_object_ids=set()) + assert group_1 in grouping + assert group_2 in grouping + + assert items[0] in grouping[group_1] + assert items[1] in grouping[group_2] + + +def test_write_bulk_nodes(bulk_loader: BulkLoad): + nodes_schema = { + "named_thing": { + "id": "str", + "str": "str", + "list_attr": "list", + "bool_attr": "bool", + "float_attr": "float", + "int_attr": "int" + } + } + node_objects = { + "named_thing": [ + { + "id": "ID:1", + "str": "name", + "list_attr": ["x"], + "bool_attr": False, + "float_attr": 0.1, + "int_attr": 0 + } + ] + } + with patch('roger.core.bulkload.storage', conftest): + bulk_path = conftest.bulk_path() + state = {} + bulk_loader.write_bulk(bulk_path=bulk_path, + obj_map=node_objects, + schema=nodes_schema, + state=state, + is_relation=False) + assert len(state['file_paths']) > 0 + # @TODO add assertions. + # with open(os.path.join(bulk_path,'named_thing_csv-0-1')) + + + + + + diff --git a/tests/integration/test_dug_utils.py b/tests/integration/test_dug_utils.py new file mode 100644 index 00000000..4e31f820 --- /dev/null +++ b/tests/integration/test_dug_utils.py @@ -0,0 +1,62 @@ +import tempfile + +from pathlib import Path + +import pytest + +from dug_helpers.dug_utils import FileFetcher, get_topmed_files, get_dbgap_files +from roger.config import config + + +def test_fetch_network_file(): + filename = "README.md" + with tempfile.TemporaryDirectory() as tmp_dir: + fetch1 = FileFetcher( + "https://github.com", + "/helxplatform/roger/blob/main/", + tmp_dir, + ) + expected_path = Path(tmp_dir) / filename + assert not expected_path.exists() + fetch1(filename) + assert expected_path.exists() + + with tempfile.TemporaryDirectory() as tmp_dir: + fetch2 = FileFetcher( + "https://github.com", + Path("/helxplatform/roger/blob/main/"), + Path(tmp_dir), + ) + + expected_path = Path(tmp_dir) / filename + assert not expected_path.exists() + fetch2(filename) + assert expected_path.exists() + + +def test_fetcher_errors(): + + filename = "DOES NOT EXIST.md" + + with tempfile.TemporaryDirectory() as tmp_dir: + fetch = FileFetcher( + "https://github.com", + Path("/helxplatform/roger/blob/main/"), + Path(tmp_dir), + ) + with pytest.raises(RuntimeError): + fetch(filename) + + +@pytest.mark.skip() +def test_get_topmed_files(): + file_names = get_topmed_files(config=config) + for file_name in file_names: + assert Path(file_name).exists() + + +@pytest.mark.skip() +def test_get_dbgap_files(): + file_names = get_dbgap_files(config=config) + for file_name in file_names: + assert Path(file_name).exists() \ No newline at end of file diff --git a/tests/integration/test_type_conversion_util.py b/tests/integration/test_type_conversion_util.py new file mode 100644 index 00000000..ab4e122f --- /dev/null +++ b/tests/integration/test_type_conversion_util.py @@ -0,0 +1,49 @@ +from roger.components.data_conversion_utils import TypeConversionUtil + + +def test_type_comparision(): + datatype_1 = list.__name__ + datatype_2 = str.__name__ + datatype_3 = bool.__name__ + datatype_4 = float.__name__ + datatype_5 = int.__name__ + # list 
should always come first + assert datatype_1 == TypeConversionUtil.compare_types(datatype_1, datatype_2) + assert datatype_1 == TypeConversionUtil.compare_types(datatype_1, datatype_3) + assert datatype_1 == TypeConversionUtil.compare_types(datatype_1, datatype_4) + assert datatype_1 == TypeConversionUtil.compare_types(datatype_1, datatype_5) + + # then string + assert datatype_2 == TypeConversionUtil.compare_types(datatype_2, datatype_3) + assert datatype_2 == TypeConversionUtil.compare_types(datatype_2, datatype_4) + assert datatype_2 == TypeConversionUtil.compare_types(datatype_2, datatype_5) + + # the rest should always be casted up to string + assert datatype_2 == TypeConversionUtil.compare_types(datatype_3, datatype_4) + assert datatype_2 == TypeConversionUtil.compare_types(datatype_4, datatype_5) + assert datatype_2 == TypeConversionUtil.compare_types(datatype_5, datatype_3) + + # should raise error when sent 'Unknown' data types + bogus_dt = "bogus" + try: + TypeConversionUtil.compare_types(bogus_dt, datatype_1) + except AssertionError as error: + exception_raised = True + assert exception_raised + try: + TypeConversionUtil.compare_types(datatype_1, bogus_dt) + except AssertionError as error: + exception_raised = True + assert exception_raised + + +def test_casting_values(): + castable = [ + ["True", bool.__name__, True], + [1 , bool.__name__, True], + [1.0, bool.__name__, True], + [[], bool.__name__, False] + ] + for items in castable: + assert items[-1] == TypeConversionUtil.cast(*items[:-1]) # cast (value, type) + diff --git a/tests/test_redis_query.cypher b/tests/test_redis_query.cypher new file mode 100644 index 00000000..509df1fa --- /dev/null +++ b/tests/test_redis_query.cypher @@ -0,0 +1,5 @@ +MATCH (c{id:'HP:0032316'}) return c + +MATCH (disease:`Disease` {`id`: 'MONDO:0004979'}) WITH disease MATCH (disease)-[e1_disease_phenotypic_feature]-(phenotypic_feature:`PhenotypicFeature` {}) +WITH disease AS disease, phenotypic_feature AS phenotypic_feature, collect(e1_disease_phenotypic_feature) AS e1_disease_phenotypic_feature +RETURN disease,phenotypic_feature,e1_disease_phenotypic_feature,labels(disease) AS type__disease,labels(phenotypic_feature) AS type__phenotypic_feature,[edge in e1_disease_phenotypic_feature | type(edge)] AS type__e1_disease_phenotypic_feature,[edge in e1_disease_phenotypic_feature | [startNode(edge).id, endNode(edge).id]] AS id_pairs__e1_disease_phenotypic_feature \ No newline at end of file diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py new file mode 100644 index 00000000..f32cb512 --- /dev/null +++ b/tests/unit/test_config.py @@ -0,0 +1,75 @@ +import os + +from roger.config import RogerConfig, RedisConfig + + +def test_merge(): + dict_a = { + 'redis': { + 'host': 'redis', + 'port': 6379, + 'user': 'admin', + 'password': 'pass1' + } + } + dict_b = { + 'redis': { + 'port': 6389, + 'password': 'pass2' + }, + 'elasticsearch': { + 'host': 'elastic', + 'port': 9200 + } + } + + assert RogerConfig.merge_dicts(dict_a, dict_b) == { + 'redis': { + 'host': 'redis', + 'port': 6389, + 'user': 'admin', + 'password': 'pass2' + }, + 'elasticsearch': { + 'host': 'elastic', + 'port': 9200 + } + } + + +def test_get_overrides(): + prefix = "TEST_VALUES_" + assert RogerConfig.get_override_data(prefix) == {} + + os.environ[f"{prefix}REDIS_HOST"] = 'http://redis.svc' + os.environ[f"{prefix}REDIS_PORT"] = '6379' + os.environ[f"{prefix}REDIS_USER"] = 
'redis-admin' + os.environ[f"{prefix}REDIS_PASSWORD"] = 'admin-pass' + os.environ[f"{prefix}ELASTIC__SEARCH_HOST"] = 'http://elastic.svc' + + actual = RogerConfig.get_override_data(prefix) + expected = { + 'redis': { + 'host': 'http://redis.svc', + 'port': '6379', + 'user': 'redis-admin', + 'password': 'admin-pass', + }, + 'elastic_search': { + 'host': 'http://elastic.svc', + } + } + assert actual == expected + + +def test_redis_conf(): + redis_conf = RedisConfig(**{}) + assert redis_conf.username == "" + assert redis_conf.password == "" + assert redis_conf.host == "redis" + assert redis_conf.graph == "test" + assert redis_conf.port == 6379 + + redis_conf = RedisConfig(**{"port": "6379"}) + assert redis_conf.port == 6379 + diff --git a/tranql-schema.yaml b/tranql-schema.yaml new file mode 100644 index 00000000..79d9d575 --- /dev/null +++ b/tranql-schema.yaml @@ -0,0 +1,12 @@ +schema: + redis: + doc: | + Roger is a knowledge graph built by aggregeting several kgx formatted knowledge graphs from several sources. + url: "redis:" + redis: true + redis_connection_params: + # Host here is the service name in the docker composed container. + host: redis + port: 6379 + # SET USERNAME and PASSWORD + # via REDIS_USERNAME , REDIS_PASSWORD Env vars (i.e capitialize service name) diff --git a/tranql_translate.py b/tranql_translate.py deleted file mode 100644 index e9fe67e6..00000000 --- a/tranql_translate.py +++ /dev/null @@ -1,71 +0,0 @@ -# -*- coding: utf-8 -*- -# - -""" -An Airflow workflow for the Roger Translator KGX data pipeline. -""" - -import os -import subprocess -from airflow.operators.bash_operator import BashOperator -from airflow.contrib.example_dags.libs.helper import print_stuff -from airflow.models import DAG -from airflow.operators.python_operator import PythonOperator -from airflow.utils.dates import days_ago -from roger.core import RogerUtil - -default_args = { - 'owner': 'RENCI', - 'start_date': days_ago(1) -} - -""" Build the workflow's tasks and DAG. """ -with DAG( - dag_id='tranql_translate', - default_args=default_args, - schedule_interval=None -) as dag: - - """ Configure use of KubernetesExecutor. """ - at_k8s=False - - def get_executor_config (annotations=None): - """ Get an executor configuration. - :param annotations: Annotations to attach to the executor. - :returns: Returns a KubernetesExecutor if K8s is configured and None otherwise. - """ - k8s_executor_config = { - "KubernetesExecutor": { - "annotations": annotations - } - } - return k8s_executor_config if at_k8s else None - - def create_python_task (name, a_callable): - """ Create a python task. - :param name: The name of the task. - :param a_callable: The code to run in this task. - """ - return PythonOperator( - task_id=name, - python_callable=a_callable, - op_kwargs={ 'to_string' : True }, - executor_config=get_executor_config (annotations={ - "task_name" : name - }) - ) - - """ Build the workflow tasks. """ - intro = BashOperator(task_id='Intro', bash_command='echo running tranql translator') - get_kgx = create_python_task ("GetSource", RogerUtil.get_kgx) - create_schema = create_python_task ("CreateSchema", RogerUtil.create_schema) - merge_nodes = create_python_task ("MergeNodes", RogerUtil.merge_nodes) - create_bulk_load = create_python_task ("CreateBulkLoad", RogerUtil.create_bulk_load) - bulk_load = create_python_task ("BulkLoad", RogerUtil.bulk_load) - validate = create_python_task ("Validate", RogerUtil.validate) - finish = BashOperator (task_id='Finish', bash_command='echo finish') - - """ Build the DAG. 
""" - intro >> get_kgx >> [ create_schema, merge_nodes ] >> create_bulk_load >> \ - bulk_load >> validate >> finish -