diff --git a/.env b/.env new file mode 100644 index 00000000..2b42e8d7 --- /dev/null +++ b/.env @@ -0,0 +1,23 @@ +AIRFLOW_UID=502 +AIRFLOW_GID=0 + +API_WORKERS=4 +API_PORT=5551 +API_TIMEOUT=10 + +DATA_DIR=./local_storage + +DUG_LOG_LEVEL=INFO + +ELASTICSEARCH_PASSWORD=12345 +ELASTICSEARCH_HOST=elasticsearch +ELASTICSEARCH_USERNAME=elastic + +NBOOST_API_HOST=nboost + +REDIS_PASSWORD=weak +REDIS_HOST=merge-redis-master +REDIS_PORT=6379 +TRANQL_ACCESS_LOG=access.log +TRANQL_ERROR_LOG=error.log +ROGER_DUG__INPUTS_DATA__SETS=topmed:v1.0 \ No newline at end of file diff --git a/.github/workflows/build-push-dev-image.yml b/.github/workflows/build-push-dev-image.yml new file mode 100644 index 00000000..13f8cfb7 --- /dev/null +++ b/.github/workflows/build-push-dev-image.yml @@ -0,0 +1,86 @@ +# Workflow responsible for the +# development release processes. +# +name: Build-Push-Dev-Image +on: + push: + branches: + - develop + paths-ignore: + - README.md + - .old_cicd/* + - .github/* + - .github/workflows/* + - LICENSE + - .gitignore + - .dockerignore + - .githooks + # Do not build another image on a pull request. + # Any push to develop will trigger a new build however. + pull_request: + branches-ignore: + - '*' + +jobs: + build-push-dev-image: + runs-on: ubuntu-latest + steps: + + - name: Checkout Code + uses: actions/checkout@v3 + with: + ref: ${{ github.head_ref }} + # fetch-depth: 0 means, get all branches and commits + fetch-depth: 0 + + - name: Set short git commit SHA + id: vars + run: | + echo "short_sha=$(git rev-parse --short ${{ github.sha }})" >> $GITHUB_OUTPUT + # https://github.blog/changelog/2022-10-11-github-actions-deprecating-save-state-and-set-output-commands/ + + - name: Confirm git commit SHA output + run: echo ${{ steps.vars.outputs.short_sha }} + + # Docker Buildx is important to caching in the Build And Push Container + # step + # https://github.com/marketplace/actions/build-and-push-docker-images + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + driver-opts: | + network=host + + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + logout: true + + - name: Login to Container Registry + uses: docker/login-action@v3 + with: + registry: containers.renci.org + username: ${{ secrets.CONTAINERHUB_USERNAME }} + password: ${{ secrets.CONTAINERHUB_TOKEN }} + logout: true + + + # Notes on Cache: + # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache + - name: Build Push Container + uses: docker/build-push-action@v5 + with: + context: . + push: true + # Push to renci-registry and dockerhub here. + # cache comes from dockerhub. + tags: | + ${{ github.repository }}:develop + ${{ github.repository }}:${{ steps.vars.outputs.short_sha }} + containers.renci.org/${{ github.repository }}:develop + containers.renci.org/${{ github.repository }}:${{ steps.vars.outputs.short_sha }} + cache-from: type=registry,ref=${{ github.repository }}:buildcache-dev + cache-to: type=registry,ref=${{ github.repository }}:buildcache-dev,mode=max \ No newline at end of file diff --git a/.github/workflows/build-push-release.yml b/.github/workflows/build-push-release.yml new file mode 100644 index 00000000..07b22d21 --- /dev/null +++ b/.github/workflows/build-push-release.yml @@ -0,0 +1,131 @@ +# Workflow responsible for the +# major release processes. 
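+#
+# On pushes to master/main this builds the image, pushes v<semver>, latest and
+# short-SHA tags to DockerHub and containers.renci.org, then tags the commit
+# and creates a GitHub release with generated notes.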
+# + +name: Build-Push-Release +on: + push: + branches: + - master + - main + paths-ignore: + - README.md + - .old_cicd/* + - .github/* + - .github/workflows/* + - LICENSE + - .gitignore + - .dockerignore + - .githooks + tags-ignore: + - '*' +jobs: + build-push-release: + runs-on: ubuntu-latest + steps: + - name: Checkout Code + uses: actions/checkout@v3 + with: + ref: ${{ github.head_ref }} + fetch-depth: 0 + + - name: Set short git commit SHA + id: vars + run: | + echo "short_sha=$(git rev-parse --short ${{ github.sha }})" >> $GITHUB_OUTPUT + # https://github.blog/changelog/2022-10-11-github-actions-deprecating-save-state-and-set-output-commands/ + + - name: Confirm git commit SHA output + run: echo ${{ steps.vars.outputs.short_sha }} + + # https://github.com/marketplace/actions/git-semantic-version + - name: Semver Check + uses: paulhatch/semantic-version@v5.0.3 + id: version + with: + # The prefix to use to identify tags + tag_prefix: "v" + # A string which, if present in a git commit, indicates that a change represents a + # major (breaking) change, supports regular expressions wrapped with '/' + major_pattern: "/breaking:|major:/" + # A string which indicates the flags used by the `major_pattern` regular expression. Supported flags: idgs + major_regexp_flags: "ig" + # Same as above except indicating a minor change, supports regular expressions wrapped with '/' + minor_pattern: "/feat:|feature:|minor:/" + # A string which indicates the flags used by the `minor_pattern` regular expression. Supported flags: idgs + minor_regexp_flags: "ig" + # A string to determine the format of the version output + # version_format: "${major}.${minor}.${patch}-prerelease${increment}" + version_format: "${major}.${minor}.${patch}" + search_commit_body: false + + # Docker Buildx is important to caching in the Build And Push Container + # step + # https://github.com/marketplace/actions/build-and-push-docker-images + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + driver-opts: | + network=host + + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + logout: true + + - name: Login to Container Registry + uses: docker/login-action@v3 + with: + registry: containers.renci.org + username: ${{ secrets.CONTAINERHUB_USERNAME }} + password: ${{ secrets.CONTAINERHUB_TOKEN }} + logout: true + + # Notes on Cache: + # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache + - name: Build Push Container + uses: docker/build-push-action@v5 + with: + push: true + # Push to renci-registry and dockerhub here. + # cache comes from dockerhub. + tags: | + containers.renci.org/${{ github.repository }}:v${{ steps.version.outputs.version }} + containers.renci.org/${{ github.repository }}:latest + containers.renci.org/${{ github.repository }}:${{ steps.vars.outputs.short_sha }} + ${{ github.repository }}:v${{ steps.version.outputs.version }} + ${{ github.repository }}:latest + ${{ github.repository }}:${{ steps.vars.outputs.short_sha }} + cache-from: type=registry,ref=${{ github.repository }}:buildcache-release + cache-to: type=registry,ref=${{ github.repository }}:buildcache-release,mode=max + +#==========================TAG & RELEASE W/ NOTES ========================= + + # Note: GITHUB_TOKEN is autogenerated feature of github app + # which is auto-enabled when using github actions. 
+ # https://docs.github.com/en/actions/security-guides/automatic-token-authentication + # https://docs.github.com/en/rest/git/tags?apiVersion=2022-11-28#create-a-tag-object + # https://docs.github.com/en/rest/git/refs?apiVersion=2022-11-28#create-a-reference + # This creates a "lightweight" ref tag. + - name: Create Tag for Release + run: | + curl \ + -s --fail -X POST \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/${{ github.repository }}/git/refs \ + -d '{"ref":"refs/tags/v${{ steps.version.outputs.version }}","sha":"${{ github.sha }}"}' + +# https://cli.github.com/manual/gh_release_create + - name: Create Release + env: + RELEASE_VERSION: ${{ steps.version.outputs.version }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh release create ${{ env.RELEASE_VERSION }} \ + -t "${{ env.RELEASE_VERSION }}" \ + --generate-notes \ + --latest \ No newline at end of file diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml new file mode 100644 index 00000000..b7f3e6a5 --- /dev/null +++ b/.github/workflows/code-checks.yml @@ -0,0 +1,129 @@ +# Workflow responsible for core acceptance testing. +# Tests Currently Run: +# - flake8-linter +# - PYTest +# - Bandit +# For PR Vulnerability Scanning a separate workflow will run. +# The build-push-dev-image and build-push-release workflows +# handle the develop and release image storage respectively. +# +# + +name: Code-Checks +on: + push: + branches-ignore: + - master + - main + - develop + pull_request: + branches: + - develop + - master + - main + types: [opened, synchronize] + paths-ignore: + - README.md + - .old_cicd/* + - .github/* + - .github/workflows/* + - LICENSE + - .gitignore + - .dockerignore + - .githooks + +jobs: + ############################## flake8-linter ############################## + flake8-linter: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.12" + + # Currently actions/setup-python supports caching + # but the cache is not as robust as cache action. + # Here we cache the entire python env which speeds subsequent builds up alot. (alot being scientific term) + # Ref: https://blog.allenai.org/python-caching-in-github-actions-e9452698e98d + - uses: actions/cache@v3 + name: Cache Python + with: + path: ${{ env.pythonLocation }} + key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('pyproject.toml') }} + + - name: Install Requirements + run: | + pip install -r requirements.txt + + - name: Lint with flake8 + run: | + pip install flake8 + flake8 --ignore=E,W dags + # We continue on error here until the code is clean + # flake8 --ignore=E,W --exit-zero . 
+ continue-on-error: true + + ################################### PYTEST ################################### + # pytest: + # runs-on: ubuntu-latest + # steps: + # - uses: actions/checkout@v3 + # - name: Set up Python + # uses: actions/setup-python@v4 + # with: + # python-version: '3.12' + + # - name: Install Requirements + # run: | + # pip install -r requirements.txt + # pip install coverage + # pip install ./tests + + # - name: Test with pytest + # run: | + # make test + ############################## test-image-build ############################## + test-image-build: + runs-on: ubuntu-latest + # if: ${{ github.actor == 'dependabot[bot]' }} + steps: + - uses: actions/checkout@v3 + + - name: Set short git commit SHA + id: vars + run: | + echo "short_sha=$(git rev-parse --short ${{ github.sha }})" >> $GITHUB_OUTPUT + # https://github.blog/changelog/2022-10-11-github-actions-deprecating-save-state-and-set-output-commands/ + - name: Confirm git commit SHA output + run: echo ${{ steps.vars.outputs.short_sha }} + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + logout: true + + - name: Parse Github Reference Name + id: branch + run: | + REF=${{ github.ref_name }} + echo "GHR=${REF%/*}" >> $GITHUB_OUTPUT + + # Notes on Cache: + # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache + - name: Build Container + uses: docker/build-push-action@v5 + with: + context: . + push: true + tags: | + ${{ github.repository }}:test_${{ steps.branch.outputs.GHR }} + cache-from: type=registry,ref=${{ github.repository }}:buildcache + cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max diff --git a/.github/workflows/trivy-pr-scan.yml b/.github/workflows/trivy-pr-scan.yml new file mode 100644 index 00000000..1e7bc060 --- /dev/null +++ b/.github/workflows/trivy-pr-scan.yml @@ -0,0 +1,67 @@ +name: trivy-pr-scan +on: + pull_request: + branches: + - develop + - master + - main + types: [ opened, synchronize ] + paths-ignore: + - README.md + - .old_cicd/* + - .github/* + - .github/workflows/* + - LICENSE + - .gitignore + - .dockerignore + - .githooks + +jobs: + trivy-pr-scan: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + driver-opts: | + network=host + + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + logout: true + + # Notes on Cache: + # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache + - name: Build Container + uses: docker/build-push-action@v5 + with: + context: . + push: false + load: true + tags: ${{ github.repository }}:vuln-test + cache-from: type=registry,ref=${{ github.repository }}:buildcache + cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max + + # We will not be concerned with Medium and Low vulnerabilities + - name: Run Trivy vulnerability scanner + uses: aquasecurity/trivy-action@master + with: + image-ref: '${{ github.repository }}:vuln-test' + format: 'sarif' + severity: 'CRITICAL,HIGH' + ignore-unfixed: true + output: 'trivy-results.sarif' + exit-code: '1' + # Scan results should be viewable in GitHub Security Dashboard + # We still fail the job if results are found, so below will always run + # unless manually canceled. 
+ - name: Upload Trivy scan results to GitHub Security tab + uses: github/codeql-action/upload-sarif@v2 + if: '!cancelled()' + with: + sarif_file: 'trivy-results.sarif' \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..6c46fe7f --- /dev/null +++ b/.gitignore @@ -0,0 +1,154 @@ +# Git ignore bioler plate from https://github.com/github/gitignore/blob/master/Python.gitignore +.secret-env +.vscode/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.secrets-env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# PyCharm +.idea + +# Rope project settings +.ropeproject + +# Mac +.DS_Store + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# Local output directories +dags/roger/data +local_storage +logs +tests/integration/data/bulk/ diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 00000000..e4e7438b --- /dev/null +++ b/.pylintrc @@ -0,0 +1,4 @@ +[MAIN] +disable=invalid-name, + no-member, + no-value-for-parameter diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..47d2c13f --- /dev/null +++ b/Dockerfile @@ -0,0 +1,33 @@ +FROM bitnami/airflow:2.10.5-debian-12-r7 + +USER root +RUN apt-get update && apt-get install -y git nano vim gcc rustc cargo +#RUN useradd -u 1001 -ms /bin/bash airflow && chown -R airflow /home/airflow +COPY requirements.txt requirements.txt +RUN source /opt/bitnami/airflow/venv/bin/activate && CARGO_HOME=/tmp/.cargo && \ + pip install setuptools wheel && \ + pip install -r requirements.txt + +RUN rm -f requirements.txt + +## Vul patches +## Python lib patches on airflow python env +RUN source /opt/bitnami/airflow/venv/bin/activate pip install --upgrade \ + flask-appbuilder==4.5.3 \ + cryptography==44.0.1 \ + werkzeug==3.0.6 \ + urllib3==2.2.2 +RUN source /opt/bitnami/airflow/venv/bin/activate pip uninstall -y \ + apache-airflow-providers-mysql==6.2.0 + +# Uninstall these from non airflow python env +RUN pip install --upgrade \ + flask-appbuilder==4.5.3 \ + cryptography==44.0.1 \ + werkzeug==3.0.6 \ + urllib3==2.2.2 +RUN apt-get autoremove -y vim +RUN apt-get autoremove -y binutils +RUN apt-get autoremove -y linux-libc-dev + +USER airflow diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..ef227aa4 --- /dev/null +++ b/Makefile @@ -0,0 +1,76 @@ +PYTHON = $(shell which python3) +PYTHONPATH = dags +VERSION_FILE = ./dags/_version.py +VERSION = $(shell cut -d " " -f 3 ${VERSION_FILE}) +DOCKER_REPO = docker.io +DOCKER_OWNER = helxplatform +DOCKER_APP = roger +DOCKER_TAG = ${VERSION} +DOCKER_IMAGE = ${DOCKER_OWNER}/${DOCKER_APP}:$(DOCKER_TAG) + +.DEFAULT_GOAL = help + +.PHONY: help clean install test build image publish + +help: + @grep -E '^#[a-zA-Z\.\-]+:.*$$' $(MAKEFILE_LIST) | tr -d '#' | awk 'BEGIN {FS = ": "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}' + +mk_dirs: + mkdir -p {logs,plugins} + mkdir -p local_storage/elastic + mkdir -p local_storage/redis + +rm_dirs: + rm -rf logs/* + rm -rf local_storage/elastic/* + rm -rf local_storage/redis/* + rm -rf ./dags/roger/data/* + +#install: Install application along with required packages to local environment +install: + ${PYTHON} -m pip install --upgrade pip + ${PYTHON} -m pip install -r requirements.txt + +#test.lint: Run flake8 on the source code +test.lint: + ${PYTHON} -m flake8 dags + +#test.doc: Run doctests in the source code +test.doc: + echo "Running doc tests..." + ${PYTHON} -m pytest --doctest-modules dags/roger + +#test.unit: Run unit tests +test.unit: + ${PYTHON} --version + ${PYTHON} -m pytest tests/unit + +#test.integration: Run unit tests +test.integration: + echo "Running integration tests..." 
+ ${PYTHON} -m pytest tests/integration + +#test: Run all tests +test: test.unit test.integration + +#build: Build the Docker image +build: + echo "Building docker image: ${DOCKER_IMAGE}" + docker build --no-cache -t ${DOCKER_IMAGE} -f Dockerfile . + echo "Successfully built: ${DOCKER_IMAGE}" + +#publish: Push the Docker image +publish: + docker tag ${DOCKER_IMAGE} ${DOCKER_REPO}/${DOCKER_IMAGE} + docker push ${DOCKER_REPO}/${DOCKER_IMAGE} + +#clean: Remove old data +clean: rm_dirs mk_dirs + +#stack.init: Initialize the airflow DB +stack.init: mk_dirs + docker-compose up airflow-init + +#stack: Bring up Airflow and all backend services +stack: stack.init + docker-compose up diff --git a/README.md b/README.md index 0e7038aa..92ed1652 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,24 @@ cd bin make clean install validate ``` +## Quickstart + +You can quickly set up the required dependencies and spin up all the necessary services with: + +```shell +make install +make stack +``` + +Without using make, you can run the necessary commands directly on the shell: + +```shell +mkdir -p {logs,plugins} +mkdir -p local_storage/elastic +docker-compose up airflow-init +docker-compose up +``` + ## Design Roger's is designed to transform data through well defined and transparent phases. @@ -59,8 +77,31 @@ Fetches KGX files according to a data version selecting the set of files to use. Merges nodes duplicated across files aggregating properties from all nodes ### Schema Identify and record the schema (properties) of every edge and node type. +Schema records the type resolved for each property of a node/edge. The **Schema** step generates category +schema file for node schema and predicate schema for edges. In these files properties are collected and +scoped based on type of the edges and nodes found. For instances where properties do not have consistent data +type across a given scope, the following rule is used to resolve to final data type: + +* If the property has fluctuating type among a boolean, a float or an Integer in the same scope, +it's final data type would be a string. +* If conflicting property is ever a string but never a list in the scope, it's final data type will be string. +* If conflicting property is ever a list , it's final data type will be a list. + +Using this approach attributes will be casted based on the resolution set here when loading to the graph database +in subsequent steps. ### Bulk Create Create bulk load CSV files conforming to the Redisgraph Bulk Loader's requirements. +**Bulk create** uses the Schema generated in **Schema** step to generate csv headers +([redis csv headers](https://github.com/RedisGraph/redisgraph-bulk-loader#input-schemas)) with +the assumed types . Currently redis bulk loader requires every column to have a value. +To address this issue, this step groups the entities being processed (edges/nodes) +based on attributes that have values. Then these groups are written into separate csv files. Nodes +are written as csv(s) under `/bulk/nodes` and edges under `/bulk/edges`. +Each csv with these folders has the following naming convention +`.csv--`. +When populating the CSV with values, the appropriate casting is done on the properties to normalize +them to the data types defined in the **Schema** step. + ### Bulk Load Use the bulk loader to load Redisgraph logging statistics on each type of loaded object. ### Validate @@ -492,14 +533,122 @@ Open localhost:8080 in a browser. 
Then run: ``` -python tranql_translator.py +python tranql_translate.py ``` The Airflow interface shows the workflow: ![image](https://user-images.githubusercontent.com/306971/97787955-b968f680-1b8b-11eb-86cc-4d93842eafd3.png) -Use the Trigger icon to run the workflow immediatley. +Use the Trigger icon to run the workflow immediately. + + +### Running Roger in Kubernetes + +Roger supports installing on kubernetes via [Helm](helm.sh). + +### Prerequisites + +#### 1. Setup persistence volume + + Create a pvc(roger-data-pvc) for storing roger Data with the following definition. + +```yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: roger-data-pvc +spec: + storageClassName: + accessModes: + - ReadWriteMany + resources: + requests: + storage: +``` + +Then run : + +```shell script +kubectl -n create -f pvc.yaml +``` + +#### 2. Create git ssh secrets: + +There are two secrets for airflow required for Git syncronization. + +This is used by `airflow.airflow.config.AIRFLOW__KUBERNETES__GIT_SSH_KEY_SECRET_NAME` + ```yaml + kind: Secret + apiVersion: v1 + metadata: + name: airflow-secrets + data: + gitSshKey: >- + + type: Opaque + ``` + +This used by `airflow.dags.git.secret` + +```yaml +kind: Secret +apiVersion: v1 +metadata: + name: airflow-git-keys +data: + id_rsa: + id_rsa.pub: + known_hosts: +type: Opaque +``` + +### Installing + +#### 1. Init helm dependencies + +Navigate to `roger/bin` dir, and run `roger init`. This will initialize helm dependencies for [airflow helm repo](https://airflow-helm.github.io/charts)) +and [redis helm repo](https://github.com/bitnami/charts/tree/master/bitnami/redis#redis). +```shell script +cd bin/ +export NAMESPACE= +export RELEASE_NAME= +export CLUSTER_DOMAIN=cluster.local +./roger init +``` + + +#### 2. Installing + +Run and flow the notes to access the servers. +```shell script +./roger start +``` + +#### 3. Run Roger workflow +In the Notes a port forward command should be printed. Use that to +access airflow UI and run the following steps to run Roger workflow. +The Airflow interface shows the workflow: +![image](https://user-images.githubusercontent.com/45075777/104513185-403f4400-55bd-11eb-9142-cbfd7879504b.png) +Press Trigger to get to the following page: +![image](https://user-images.githubusercontent.com/45075777/104513451-b04dca00-55bd-11eb-837c-65d20d697fff.png) + +Enter the configuration parameters to get to Redis cluster installed in step 2: +```json +{"redisgraph": {"host": "", "port": 6379 , "graph" : "graph-name" }} +``` +And run work flow. +#### 4. Other Commands: + +To shutdown and remove the setup from k8s: +```shell script +./roger stop +``` + +To restart the setup: +```shell script +./roger restart +``` diff --git a/bin/Makefile b/bin/Makefile index 21ba47bb..a9833163 100644 --- a/bin/Makefile +++ b/bin/Makefile @@ -1,61 +1,24 @@ -########################################################## -## -## -## Make the Roger database in phases. -## -## Opertions -## -## get: Fetch versioned knowledge graph exchange -## (KGX) formatted data files. -## -## merge: Merge nodes, consolidating duplicates -## and preserving fields. -## -## schema: Identify the all properties in each -## predicate and node type. -## -## tables: Write tabular formatted data for all -## edges and nodes. -## -## install: Bulk load a Redisgraph instance. -## -## validate: Validate database contents. -## -## clean: Delete all data artifacts. -## -## -########################################################## - -# Root of Roger -ROGER_HOME=$(PWD)/.. 
- -# Path to Roger executable -ROGER=${ROGER_HOME}/bin/roger - -# Location of data -DATA_ROOT=${ROGER_HOME}/roger/data +ROGER_MAKE_DIR=./roger_graph_build +ANNOTATE_MAKE_DIR=./dug_annotate +INDEXING_MAKE_DIR=./dug_indexing + RM=/bin/rm -TIME=/usr/bin/time -clean: - $(RM) -rf $(DATA_ROOT) +DATA_ROOT=${ROGERENV_DATA__ROOT} -get: - $(TIME) $(ROGER) kgx get --data-root $(DATA_ROOT) -merge: get - $(TIME) $(ROGER) kgx merge --data-root $(DATA_ROOT) +clean: + $(RM) -rf $(DATA_ROOT) -schema: merge - $(TIME) $(ROGER) kgx schema --data-root $(DATA_ROOT) -tables: schema - $(TIME) $(ROGER) bulk create --data-root $(DATA_ROOT) +annotate: + make -C ${ANNOTATE_MAKE_DIR} all -install: tables - $(TIME) $(ROGER) bulk load --data-root $(DATA_ROOT) +graph: + make -C ${ROGER_MAKE_DIR} all -validate: - $(TIME) $(ROGER) bulk validate --data-root $(DATA_ROOT) +index: + make -C ${INDEXING_MAKE_DIR} all +all: annotate graph index \ No newline at end of file diff --git a/bin/Readme.md b/bin/Readme.md new file mode 100644 index 00000000..bcade36a --- /dev/null +++ b/bin/Readme.md @@ -0,0 +1,136 @@ +### Running Roger + +This document outlines some of the ways that Roger can be run. + +### Roger Configuration + +Configuration is mainly managed through `roger/roger/config.yaml`. +Each values in this config file can be overridden by shell environment +variables. For instance to override the following : + +``` + kgx: + biolink_model_version: 1.5.0 + dataset_version: v1.0 +``` + +Overridding variables can be exported as: + +```shell script +export ROGERENV_KGX_BIOLINK__MODEL__VERSION=1.6 +export ROGERENV_KGX_DATASET__VERSION=v1.1 +``` +Some things to note are: +* Environment variables should be prefixed by `ROGERENV_` +* Single Underscore `_` character denotes sub-key in the yaml +* Double Underscores `__` are treated as regular underscore +* Keys in yaml are in lower and environment variables that override them should be in upper case. + +### Deploy Script + +`roger/bin/deploy` script can be used to deploy Roger's dependencies in either docker or kubernetes. +For full capabilities use: +```shell script +cd roger/bin +./deploy help +``` + +##### Docker + +For local development we can use docker containers to run backend services that roger depends on. +These are Redis store, Elastic search and Tranql web service. + +Eg: +```shell script +cd roger/bin +./deploy docker config # to display the configuration (port address and passwords) +./deploy docker start # to start +./deploy help # for help on commands +``` + +##### Kubernetes + +For running on k8s we can configure git branch and docker images by exporting: +```shell script +export NAMESPACE=your-namespace +export RELEASE=roger +export CLUSTER_DOMAIN=cluster.local +export WORKING_GIT_BRANCH=develop +``` +deploy using : + +```shell script +cd roger/bin +./deploy k8s config # to display the configuration +./deploy k8s start # to start +./deploy k8s help # for help on commands +``` + +### Local Development + +##### Setup python virtual env + +```shell script +cd roger +python -m venv venv +source venv/bin/activate +pip install -r requirements.txt +``` + +##### Configuration + +Refer to configuration section to override server names and passwords to +passwords etc.. to the backend servers. + +For development there is a dev.env file in `roger/bin/` directory with some start +up variables. Modify as needed. The following command can be used to export them into +shell. 
+```shell script +export $(grep -v'^#' bin/dev.env | xargs 0) +``` + + +##### Run a task + +To run a single task : + +```shell script +python cli.py -l # runs annotatation task +python cli.py -h # see the full list of available arguments. +``` + +##### Using the Makefiles + +Another way to run roger is as a pipeline, where each task is +In `roger/roger/bin/` there is a root make file and in the `roger/roger/bin/dug_annotate`, +`roger/roger/bin/dug_indexing` and `roger/roger/bin/roger_graph_build`. + +Running all pipelines end to end: + +```shell script +cd roger/roger/bin/ +make all +``` + +Running annotation pipeline: + +```shell script +cd roger/roger/bin/ +make annotate +``` + +Running graph pipeline: + +```shell script +cd roger/roger/bin/ +make graph +``` + +Running index pipeline: + +```shell script +cd roger/roger/bin/ +make index +``` + + diff --git a/bin/airk8s b/bin/airk8s deleted file mode 100755 index c6b32aae..00000000 --- a/bin/airk8s +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash - -set -x -set -e - -namespace=${NAMESPACE:-scox} -version=v7.11.0 - -init () { - helm repo add stable https://kubernetes-charts.storage.googleapis.com - helm repo update -} -start () { - helm install "airflow" stable/airflow \ - --version "$version" \ - --namespace "$namespace" \ - --values ./custom-values.yaml -} -status () { - helm status "airflow" --namespace $namespace - echo Scheduler: - kubectl -n $namespace logs $(kubectl get pods | grep airflow-scheduler | awk '{ print $1 }') -c git-sync - echo Worker: - kubectl -n $namespace logs $(kubectl get pods | grep airflow-worker | awk '{ print $1 }') -c git-sync -} -stop () { - helm delete "airflow" --namespace $namespace -} -connect () { - kubectl exec -it \ - --namespace $namespace \ - --container airflow-web \ - Deployment/airflow-web \ - /bin/bash -} -web () { - export NODE_PORT=$(kubectl get --namespace $namespace -o jsonpath="{.spec.ports[0].nodePort}" services airflow-web) - export NODE_IP=$(kubectl get nodes --namespace $namespace -o jsonpath="{.items[0].status.addresses[0].address}") - echo http://$NODE_IP:$NODE_PORT/ - export AIRFLOW_UI=http://$NODE_IP:$NODE_PORT/ -} -gitsecret () { - kubectl create secret generic \ - airflow-git-keys \ - --from-file=id_rsa=$HOME/.ssh/id_rsa \ - --from-file=id_rsa.pub=$HOME/.ssh/id_rsa.pub \ - --from-file=known_hosts=$HOME/.ssh/known_hosts \ - --namespace $namespace -} - -$* - -exit 0 diff --git a/bin/custom-values.yaml b/bin/custom-values.yaml deleted file mode 100644 index e60f3297..00000000 --- a/bin/custom-values.yaml +++ /dev/null @@ -1,157 +0,0 @@ -# -# NOTE: -# - This is intended to be a `custom-values.yaml` starting point for non-production deployment (like minikube) - -# External Dependencies: -# - A PUBLIC git repo for DAGs: ssh://git@repo.example.com:my-airflow-dags.git -# - -################################### -# Airflow - Common Configs -################################### -airflow: - ## the airflow executor type to use - ## - executor: CeleryExecutor -# executor: KubernetesExecutor - - ## the fernet key used to encrypt the connections in the database - ## - fernetKey: "7T512UXSSmBOkpWimFHIVb8jK6lfmSAvx4mO6Arehnc=" - - ## environment variables for the web/scheduler/worker Pods (for airflow configs) - ## - config: - # Security - AIRFLOW__CORE__SECURE_MODE: "True" - AIRFLOW__API__AUTH_BACKEND: "airflow.api.auth.backend.deny_all" - AIRFLOW__WEBSERVER__EXPOSE_CONFIG: "False" - AIRFLOW__WEBSERVER__RBAC: "False" - - # DAGS - AIRFLOW__CORE__LOAD_EXAMPLES: "False" - - ## Disable noisy "Handling signal: 
ttou" Gunicorn log messages - GUNICORN_CMD_ARGS: "--log-level WARNING" - -################################### -# Airflow - Scheduler Configs -################################### -scheduler: - - ## custom airflow connections for the airflow scheduler - ## -# connections: -# - id: my_aws -# type: aws -# extra: | -# { -# "aws_access_key_id": "XXXXXXXXXXXXXXXXXXX", -# "aws_secret_access_key": "XXXXXXXXXXXXXXX", -# "region_name":"eu-central-1" -# } - - ## custom airflow variables for the airflow scheduler - ## - variables: | - { "environment": "dev" } - - ## custom airflow pools for the airflow scheduler - ## - pools: | - { - "example": { - "description": "This is an example pool with 2 slots.", - "slots": 2 - } - } - -################################### -# Airflow - WebUI Configs -################################### -web: - ## configs for the Service of the web Pods - ## - service: - type: NodePort - -################################### -# Airflow - Worker Configs -################################### -workers: - ## the number of workers Pods to run - ## - replicas: 1 - -################################### -# Airflow - DAGs Configs -################################### -dags: - ## configs for the DAG git repository & sync container - ## - git: - ## url of the git repository - ## - #url: "ssh://git@repo.example.com/my-airflow-dags.git" - #url: "ssh://git@github.com/stevencox/airflow.git" - url: "ssh://git@github.com/stevencox/roger.git" - - ## the branch/tag/sha1 which we clone - ## - ref: main - - ## the name of a pre-created secret containing files for ~/.ssh/ - ## - ## NOTE: - ## - this is ONLY RELEVANT for SSH git repos - ## - the secret commonly includes files: id_rsa, id_rsa.pub, known_hosts - ## - known_hosts is NOT NEEDED if `git.sshKeyscan` is true - ## - secret: airflow-git-keys - - ## the name of the private key file in your `git.secret` - ## - ## NOTE: - ## - this is ONLY RELEVANT for PRIVATE SSH git repos - ## - privateKeyName: id_rsa - - ## the host name of the git repo - ## - ## NOTE: - ## - this is ONLY REQUIRED for SSH git repos - ## - ## EXAMPLE: - ## repoHost: "github.com" - ## - repoHost: "github.com" - - ## the port of the git repo - ## - ## NOTE: - ## - this is ONLY REQUIRED for SSH git repos - ## - repoPort: 22 - - ## configs for the git-sync container - ## - gitSync: - ## enable the git-sync sidecar container - ## - enabled: true - - ## the git sync interval in seconds - ## - refreshTime: 60 - -################################### -# Database - PostgreSQL Chart -################################### -postgresql: - enabled: true - -################################### -# Database - Redis Chart -################################### -redis: - enabled: true diff --git a/bin/deploy b/bin/deploy new file mode 100644 index 00000000..47dd6cba --- /dev/null +++ b/bin/deploy @@ -0,0 +1,119 @@ +#!/usr/bin/env bash +# ---- Kubernetes ------ + +k8s () { + + namespace=${NAMESPACE:-} + release=${RELEASE:-roger} + cluster_domain=${CLUSTER_DOMAIN:-cluster.local} + branch=${WORKING_GIT_BRANCH:-develop} + + help () { + echo " + Usage : ./deploy k8s [sub-command] + + Deploys Roger pipeline on kubernetes along airflow. + + Available sub-commands: + - config : view configuration + - init : Initializes helm dependencies for install. + - start : Runs helm upgrade/install. + - stop : Stops running instance. + - restart : Restarts running instance. + - client : If redis is installed on the system, it will try to connect to + " + + } + config() { + echo " + Configuration for k8s instance. 
+ To modify this values export variables with new values. + eg: export NAMESPACE=my-namespace + + NAMESPACE: ${namespace} + RELEASE: ${release} + CLUSTER_DOMAIN: ${cluster_domain} + WORKING_GIT_BRANCH: ${branch} + " + } + init () { + helm dependency update ../helm + } + start () { + init + helm upgrade --install $release \ + --set redis.clusterDomain=$cluster_domain \ + --set airflow.airflow.config.AIRFLOW__KUBERNETES__GIT_BRANCH=$branch \ + --set airflow.dags.git.ref=$branch \ + --namespace=$namespace \ + ../helm + } + stop () { + helm delete $release \ + --namespace=$namespace + } + restart () { + stop + start + } + status () { + helm --namespace=$namespace status $release + } + client () { + redis-cli -h 127.0.0.1 -p 6379 -a $REDIS_PASSWORD + } + $* +} +#---------End Kubernetes------------------- + +#---------Docker-compose ------------------ + +docker() { + COMPOSE_FILE=./docker_backend/docker-compose.yaml + help () { + echo " + Usage: ./deploy docker [subcommand] + + Run docker based backends. + + Available sub-commands: + config: Print contents of ./.env file + init: Export ./.env file contents as shell variables. + start: Runs docker containers up using ${COMPOSE_FILE}. + stop: Stops running docker containers. + restart: Restarts containers. + " + } + config() { + grep -v "^#" dev.env + } + init() { + export $(config | xargs -0) + } + start() { + init + docker-compose -f ${COMPOSE_FILE} up -d + } + stop() { + init + docker-compose -f ${COMPOSE_FILE} down + } + $* +} + +help () { + echo " + Usage : ./deploy [env-type] [subcommand] + + Deploys roger dependencies in docker / k8s + + env-type: either k8s or docker + + Read below for the subcommands avaible or use + ./deploy [env-type] help . + + " + docker help + k8s help +} +$* \ No newline at end of file diff --git a/bin/dev.env b/bin/dev.env new file mode 100644 index 00000000..1653a62d --- /dev/null +++ b/bin/dev.env @@ -0,0 +1,4 @@ +ROGERENV_DATA__ROOT=~/roger-data +ROGERENV_KGX_DATASET__VERSION=test +ROGERENV_ELASTIC__SEARCH_PASSWORD=changeme +ROGERENV_REDISGRAPH_PASSWORD=changeme \ No newline at end of file diff --git a/bin/docker_backend/docker-compose.yaml b/bin/docker_backend/docker-compose.yaml new file mode 100644 index 00000000..d87c6ae6 --- /dev/null +++ b/bin/docker_backend/docker-compose.yaml @@ -0,0 +1,68 @@ +version: '3.0' + +################################################################################# +## +## A service stack for the Roger pipeline. +## +################################################################################# +services: + + ################################################################################# + ## + ## The OpenAPI endpoint for search. This is the only service to be + ## exposed beyond the internal network. + ## + ################################################################################# + tranql: + image: renciorg/tranql-app:0.35 + depends_on: + - redis + restart: always + networks: + - roger-network + environment: + - REDIS_PASSWORD=$ROGERENV_REDISGRAPH_PASSWORD + entrypoint: /usr/local/bin/gunicorn --workers=2 --bind=0.0.0.0:8001 --name=tranql --timeout=600 tranql.api:app + ports: + - 8001:8001 + volumes: + - ./tranql-schema.yaml:/tranql/tranql/conf/schema.yaml + ################################################################################# + ## + ## A search engine providing scalable indexing and full text search. 
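+  ##
+  ## Illustrative health check once the stack is up (assumes the dev.env
+  ## variables are exported in the calling shell):
+  ##   curl -u elastic:$ROGERENV_ELASTIC__SEARCH_PASSWORD localhost:9200/_cluster/health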
+ ## + ################################################################################# + elasticsearch: + image: docker.elastic.co/elasticsearch/elasticsearch:7.6.1 + networks: + - roger-network + environment: + - ELASTIC_PASSWORD=$ROGERENV_ELASTIC__SEARCH_PASSWORD + - discovery.type=single-node + - xpack.security.enabled=true + volumes: + - ./data/elastic:/bitnami/elasticsearch/data + ports: + - '9200:9200' + - '9300:9300' + + ################################################################################# + ## + ## A memory cache for results of high volume service requests. + ## + ################################################################################# + redis: + image: 'redislabs/redisgraph' + networks: + - roger-network + command: redis-server --requirepass ${ROGERENV_REDISGRAPH_PASSWORD} --loadmodule /usr/lib/redis/modules/redisgraph.so + environment: + - REDIS_DISABLE_COMMANDS=FLUSHDB,FLUSHALL + volumes: + - ./data/redis:/data + ports: + - '6379:6379' + +networks: + roger-network: + driver: bridge diff --git a/bin/docker_backend/tranql-schema.yaml b/bin/docker_backend/tranql-schema.yaml new file mode 100644 index 00000000..965d12c3 --- /dev/null +++ b/bin/docker_backend/tranql-schema.yaml @@ -0,0 +1,12 @@ +schema: + redis: + doc: | + Roger is a knowledge graph built by aggregeting several kgx formatted knowledge graphs from several sources. + url: "redis:" + redis: true + redis_connection_params: + # Host here is the service name in the docker composed container. + host: redis + port: 6379 + # SET USERNAME and PASSWORD + # via ROGER_USERNAME , ROGER_PASSWORD Env vars (i.e capitialize service name) diff --git a/bin/dug_annotate/Makefile b/bin/dug_annotate/Makefile new file mode 100644 index 00000000..34d350ce --- /dev/null +++ b/bin/dug_annotate/Makefile @@ -0,0 +1,44 @@ +########################################################## +## +## +## Annotate files using Dug. +## +## Operations +## +## annotate_and_normalize: Annotates Variable files using entity name resolution service with curies. +## +## create_kgx_files: Creates KGX formatted knowledge graphs from annotation result set. +## +## clean: Delete all data artifacts. +## +## +########################################################## + +# Root +THIS_MAKEFILE_PATH:=$(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)) +THIS_DIR:=$(shell cd $(dir $(THIS_MAKEFILE_PATH));pwd) + +ROGER_HOME=${THIS_DIR}/../.. +CLI_WRAPPER=${ROGER_HOME}/cli.py + +# Override Roger data dir ENV +ANNOTATE_DIR=${ROGERENV_DATA__ROOT}/dug/annotations +KGX_DIR=${ROGERENV_DATA__ROOT}/dug/kgx + +RM=/bin/rm +TIME=/usr/bin/time + +clean: + $(RM) -rf ${ANNOTATE_DIR} + $(RM) -rf ${KGX_DIR} + +get_input_files: + $(TIME) python ${CLI_WRAPPER} -gd + +annotate_and_normalize: + $(TIME) python ${CLI_WRAPPER} -l + +create_kgx_files: + $(TIME) python ${CLI_WRAPPER} -t + +all: get_input_files annotate_and_normalize create_kgx_files diff --git a/bin/dug_indexing/Makefile b/bin/dug_indexing/Makefile new file mode 100644 index 00000000..5d015089 --- /dev/null +++ b/bin/dug_indexing/Makefile @@ -0,0 +1,53 @@ +########################################################## +## +## +## Annotate files using Dug. +## +## Operations +## +## annotate_and_normalize: Annotates Variable files using entity name resolution service with curies. +## +## create_kgx_files: Creates KGX formatted knowledge graphs from annotation result set. +## +## clean: Delete all data artifacts. 
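+##
+## Note: the targets actually defined in this Makefile cover the indexing stage:
+##
+## crawl_concepts: Crawl TranQL to expand annotated concepts.
+##
+## index_variables / index_concepts: Index annotated variables and expanded
+## concepts into Elasticsearch; the validate_indexed_* targets spot-check them.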
+## +## +########################################################## + +# Root +THIS_MAKEFILE_PATH:=$(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)) +THIS_DIR:=$(shell cd $(dir $(THIS_MAKEFILE_PATH));pwd) + +ROGER_HOME=${THIS_DIR}/../.. +CLI_WRAPPER=${ROGER_HOME}/cli.py + +# Override Roger data dir ENV +INDEXING_DIR=${ROGERENV_DATA__ROOT}/dug/expanded_concepts +CRAWL_DIR=${ROGERENV_DATA__ROOT}/dug/crawl + + +RM=/bin/rm +TIME=/usr/bin/time + +clean: + $(RM) -rf ${INDEXING_DIR} + $(RM) -rf ${CRAWL_DIR} + +crawl_concepts: + $(TIME) python ${CLI_WRAPPER} -C + +index_concepts: crawl_concepts + $(TIME) python ${CLI_WRAPPER} -ic + +index_variables: + $(TIME) python ${CLI_WRAPPER} -iv + +validate_indexed_concepts: index_concepts + $(TIME) python ${CLI_WRAPPER} -vc + +validate_indexed_variables: index_variables + $(TIME) python ${CLI_WRAPPER} -vv + +all: validate_indexed_concepts validate_indexed_variables + + diff --git a/bin/roger b/bin/roger index 5d96cde8..4626df88 100755 --- a/bin/roger +++ b/bin/roger @@ -1,53 +1,7 @@ +#!/usr/bin/env bash #set -x set -e -namespace=${NAMESPACE:-scox} -release=redisgraph -image_repository=redislabs/redisgraph -image_tag=edge - -# https://github.com/bitnami/charts/tree/master/bitnami/redis -init () { - helm repo add bitnami https://charts.bitnami.com/bitnami -} -start () { - helm install $release \ - --set image.repository=$image_repository \ - --set image.tag=$image_tag \ - --set redis.command="redis-server" \ - --set redis.args="--loadmodule /usr/lib/redis/modules/redisgraph.so" \ - --set master.command="redis-server --loadmodule /usr/lib/redis/modules/redisgraph.so" \ - --set slave.command="redis-server --loadmodule /usr/lib/redis/modules/redisgraph.so" \ - --namespace=$namespace \ - bitnami/redis -} -start () { - helm install $release \ - --set image.repository=$image_repository \ - --set image.tag=$image_tag \ - --namespace=$namespace \ - bitnami/redis -} -stop () { - helm delete $release \ - --namespace=$namespace -} -restart () { - stop - start -} -status () { - kubectl --namespace=$namespace get pods | grep $release - export REDIS_PASSWORD=$(kubectl get secret --namespace $namespace redisgraph -o jsonpath="{.data.redis-password}" | base64 --decode) -} -client () { - #kubectl port-forward --namespace $namespace svc/redisgraph-master 6380:6379 & - redis-cli -h 127.0.0.1 -p 6380 -a $REDIS_PASSWORD -} -#---------------------------- - - - DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" export ROGER_HOME=$( dirname $DIR ) @@ -56,7 +10,7 @@ export PYTHONPATH=$ROGER_HOME:$ROGER_HOME/../kgx export DB_NAME=test roger () { - python $ROGER_HOME/roger/core.py $* + python $ROGER_HOME/dags/roger/core.py $* } kgx () { diff --git a/bin/roger_graph_build/Makefile b/bin/roger_graph_build/Makefile new file mode 100644 index 00000000..f43c9c8c --- /dev/null +++ b/bin/roger_graph_build/Makefile @@ -0,0 +1,66 @@ +########################################################## +## +## +## Make the Roger database in phases. +## +## Opertions +## +## get: Fetch versioned knowledge graph exchange +## (KGX) formatted data files. +## +## merge: Merge nodes, consolidating duplicates +## and preserving fields. +## +## schema: Identify the all properties in each +## predicate and node type. +## +## tables: Write tabular formatted data for all +## edges and nodes. +## +## install: Bulk load a Redisgraph instance. +## +## validate: Validate database contents. +## +## clean: Delete all data artifacts. 
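+##
+## all: Run get, merge, schema, tables and install in order, then validate.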
+## +## +########################################################## + +# Root of Roger +# Root +THIS_MAKEFILE_PATH:=$(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)) +THIS_DIR:=$(shell cd $(dir $(THIS_MAKEFILE_PATH));pwd) + +ROGER_HOME=${THIS_DIR}/../.. + +# Path to Roger executable +CLI_WRAPPER=${ROGER_HOME}/cli.py + +# Location of data +DATA_ROOT=${ROGER_HOME}/roger/data + +RM=/bin/rm +TIME=/usr/bin/time + +clean: + $(RM) -rf $(DATA_ROOT) + +get: + $(TIME) python ${CLI_WRAPPER} -g + +merge: get + $(TIME) python ${CLI_WRAPPER} -m + +schema: merge + $(TIME) python ${CLI_WRAPPER} -s + +tables: schema + $(TIME) python ${CLI_WRAPPER} -b + +install: tables + $(TIME) python ${CLI_WRAPPER} -i + +validate: + $(TIME) python ${CLI_WRAPPER} -a + +all: install validate \ No newline at end of file diff --git a/cli.py b/cli.py new file mode 100644 index 00000000..be77525a --- /dev/null +++ b/cli.py @@ -0,0 +1,112 @@ +import roger.core.base as RogerUtil +from roger.config import config +from roger.logger import get_logger +from dug_helpers.dug_utils import DugUtil, get_topmed_files, get_dbgap_files, get_sparc_files, get_anvil_files, get_nida_files +import sys +import argparse +import os +import time + + +log = get_logger() + +if __name__ == "__main__": + start = time.time() + log.info(f"Start TIME:{start}") + parser = argparse.ArgumentParser(description='Roger common cli tool.') + """ Common CLI. """ + parser.add_argument('-d', '--data-root', help="Root of data hierarchy", default=None) + + """ Roger CLI. """ + parser.add_argument('-v', '--dataset-version', help="Dataset version.", default="v1.0") + parser.add_argument('-g', '--get-kgx', help="Get KGX objects", action='store_true') + parser.add_argument('-s', '--create-schema', help="Infer schema", action='store_true') + parser.add_argument('-m', '--merge-kgx', help="Merge KGX nodes", action='store_true') + parser.add_argument('-b', '--create-bulk', help="Create bulk load", action='store_true') + parser.add_argument('-i', '--insert', help="Do the bulk insert", action='store_true') + parser.add_argument('-a', '--validate', help="Validate the insert", action='store_true') + + """ Dug Annotation CLI. """ + parser.add_argument('-gd', '--get_dug_input_files', help="Gets input files for annotation", + action="store_true") + parser.add_argument('-l', '--load-and-annotate',help="Annotates and normalizes datasets of varaibles.", + action="store_true") + parser.add_argument('-t', '--make-tagged-kg', help="Creates KGX files from annotated variable datesets.", + action="store_true") + + """ Dug indexing CLI . """ + parser.add_argument('-iv', '--index-variables', help="Index annotated variables to elastic search.", + action="store_true") + parser.add_argument('-C', '--crawl-concepts', help="Crawl tranql and index concepts", + action="store_true") + + parser.add_argument('-ic', '--index-concepts', help="Index expanded concepts to elastic search.", + action="store_true") + + parser.add_argument('-vc', '--validate-concepts', help="Validates indexing of concepts", + action="store_true") + + parser.add_argument('-vv', '--validate-variables', help="Validates indexing of variables", + action="store_true") + + args = parser.parse_args () + + if args.data_root is not None: + data_root = args.data_root + config.data_root = data_root + log.info (f"data root:{data_root}") + + # When all lights are on... 
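+    # Illustrative end-to-end run using the flags defined above, e.g.:
+    #   python cli.py -d ./local_storage -gd -l -t -g -m -s -b -i -a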
+ + # Annotation comes first + if args.get_dug_input_files: + get_topmed_files(config) + get_dbgap_files(config) + get_anvil_files(config) + # get_sparc_files(config) + # get_nida_files(config) + + if args.load_and_annotate: + DugUtil.clear_annotation_cached(config=config) + DugUtil.annotate_db_gap_files(config=config) + DugUtil.annotate_topmed_files(config=config) + DugUtil.annotate_anvil_files(config=config) + if args.make_tagged_kg: + DugUtil.make_kg_tagged(config=config) + + # Roger things + if args.get_kgx: + RogerUtil.get_kgx(config=config) + if args.merge_kgx: + RogerUtil.merge_nodes(config=config) + if args.create_schema: + RogerUtil.create_schema(config=config) + if args.create_bulk: + RogerUtil.create_bulk_load(config=config) + if args.insert: + RogerUtil.bulk_load(config=config) + if args.validate: + RogerUtil.validate(config=config) + RogerUtil.check_tranql(config=config) + + # Back to dug indexing + if args.index_variables: + DugUtil.index_variables(config=config) + + if args.validate_variables: + DugUtil.validate_indexed_variables(config=config) + + if args.crawl_concepts: + DugUtil.crawl_tranql(config=config) + + if args.index_concepts: + DugUtil.index_concepts(config=config) + + if args.validate_concepts: + DugUtil.validate_indexed_concepts(config=config) + + end = time.time() + time_elapsed = end - start + log.info(f"Completion TIME:{time_elapsed}") + + sys.exit (0) diff --git a/dags/__init__.py b/dags/__init__.py new file mode 100644 index 00000000..f0aee1ff --- /dev/null +++ b/dags/__init__.py @@ -0,0 +1 @@ +from ._version import version as __version__ diff --git a/dags/_version.py b/dags/_version.py new file mode 100644 index 00000000..adcf54c7 --- /dev/null +++ b/dags/_version.py @@ -0,0 +1,2 @@ +version = "0.10.4" + diff --git a/dags/annotate_and_index.py b/dags/annotate_and_index.py new file mode 100644 index 00000000..884cd149 --- /dev/null +++ b/dags/annotate_and_index.py @@ -0,0 +1,44 @@ +"""DAG which performs Dug annotate and index operations + +This DAG differes slightly from prior versions of the same functionality in +Roger not only in that the annotation and indexing happen in the same DAG, but +also those tasks are broken out into sub-DAGs organized by dataset. Each dataset +has a subdag for all tasks. +""" + +import os + +from airflow.models import DAG +from airflow.operators.empty import EmptyOperator +from roger.tasks import default_args, create_pipeline_taskgroup + +env_enabled_datasets = os.getenv( + "ROGER_DUG__INPUTS_DATA__SETS", "topmed,anvil").split(",") + +with DAG( + dag_id='annotate_and_index', + default_args=default_args, + schedule_interval=None +) as dag: + init = EmptyOperator(task_id="init", dag=dag) + finish = EmptyOperator(task_id="finish", dag=dag) + + from roger import pipelines + from roger.config import config + envspec = os.getenv("ROGER_DUG__INPUTS_DATA__SETS","topmed:v2.0") + data_sets = envspec.split(",") + pipeline_names = {x.split(':')[0]: x.split(':')[1] for x in data_sets} + for pipeline_class in pipelines.get_pipeline_classes(pipeline_names): + # Only use pipeline classes that are in the enabled datasets list and + # that have a properly defined pipeline_name attribute + + # TODO + # Overriding environment variable just to see if this is working. + # name = getattr(pipeline_class, 'pipeline_name', '*not defined*') + # if not name in env_enabled_datasets: + # continue + + # Do the thing to add the pipeline's subdag to the dag in the right way + # . . . 
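+        # For reference: a spec like "topmed:v2.0,anvil:v1.0" (versions illustrative)
+        # is parsed above into {"topmed": "v2.0", "anvil": "v1.0"}.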
+ + init >> create_pipeline_taskgroup(dag, pipeline_class, config) >> finish diff --git a/dags/dug_helpers/__init__.py b/dags/dug_helpers/__init__.py new file mode 100644 index 00000000..9e28ad28 --- /dev/null +++ b/dags/dug_helpers/__init__.py @@ -0,0 +1,3 @@ +from pathlib import Path + +DUG_DATA_DIR = Path(__file__).parent.resolve() / 'dug_data' diff --git a/dags/dug_helpers/dug_utils.py b/dags/dug_helpers/dug_utils.py new file mode 100644 index 00000000..52db9624 --- /dev/null +++ b/dags/dug_helpers/dug_utils.py @@ -0,0 +1,1019 @@ +import asyncio +import hashlib +import logging +import os +import re +import tarfile +import traceback +from functools import reduce +from io import StringIO +from pathlib import Path +from typing import Union, List + +import requests +from dug.core import get_parser, get_annotator, get_plugin_manager, DugConcept +from dug.core.annotators._base import Annotator +from dug.core.concept_expander import ConceptExpander +from dug.core.crawler import Crawler +from dug.core.factory import DugFactory +from dug.core.parsers import Parser, DugElement +from dug.core.async_search import Search +from dug.core.index import Index + +from roger.config import RogerConfig +from roger.core import storage +from roger.models.biolink import BiolinkModel +from roger.logger import get_logger +from utils.s3_utils import S3Utils + +log = get_logger() + + + +class Dug: + + def __init__(self, config: RogerConfig, to_string=True): + self.config = config + self.bl_toolkit = BiolinkModel() + dug_conf = config.to_dug_conf() + self.element_mapping = config.indexing.element_mapping + self.factory = DugFactory(dug_conf) + self.cached_session = self.factory.build_http_session() + self.event_loop = asyncio.new_event_loop() + if to_string: + self.log_stream = StringIO() + self.string_handler = logging.StreamHandler(self.log_stream) + log.addHandler(self.string_handler) + + self.annotator_name: str = config.annotation.annotator_type + + self.tranqlizer: ConceptExpander = self.factory.build_tranqlizer() + + graph_name = self.config["redisgraph"]["graph"] + source = f"redis:{graph_name}" + self.tranql_queries: dict = self.factory.build_tranql_queries(source) + self.node_to_element_queries: list = self.factory.build_element_extraction_parameters(source) + + indexing_config = config.indexing + self.variables_index = indexing_config.get('variables_index') + self.concepts_index = indexing_config.get('concepts_index') + self.kg_index = indexing_config.get('kg_index') + + self.search_obj: Search = self.factory.build_search_obj([ + self.variables_index, + self.concepts_index, + self.kg_index, + ]) + self.index_obj: Index = self.factory.build_indexer_obj([ + self.variables_index, + self.concepts_index, + self.kg_index, + + ]) + + def __enter__(self): + self.event_loop = asyncio.new_event_loop() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + # close elastic search connection + self.event_loop.run_until_complete(self.search_obj.es.close()) + # close async loop + if self.event_loop.is_running() and not self.event_loop.is_closed(): + self.event_loop.close() + if exc_type or exc_val or exc_tb: + traceback.print_exc() + log.error(f"{exc_val} {exc_val} {exc_tb}") + log.exception("Got an exception") + + def annotate_files(self, parser_name, parsable_files, + output_data_path=None): + """ + Annotates a Data element file using a Dug parser. + :param parser_name: Name of Dug parser to use. + :param parsable_files: Files to parse. + :return: None. 
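+        :param output_data_path: Optional output directory; defaults to
+            storage.dug_annotation_path('') when not provided.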
+ """ + dug_plugin_manager = get_plugin_manager() + parser: Parser = get_parser(dug_plugin_manager.hook, parser_name) + annotator: Annotator = get_annotator(dug_plugin_manager.hook, annotator_name=self.annotator_name, config=self.config.to_dug_conf()) + if not output_data_path: + output_data_path = storage.dug_annotation_path('') + log.info("Parsing files") + for parse_file in parsable_files: + log.debug("Creating Dug Crawler object") + crawler = Crawler( + crawl_file=parse_file, + parser=parser, + annotator=annotator, + tranqlizer='', + tranql_queries=[], + http_session=self.cached_session + ) + + # configure output space. + current_file_name = '.'.join(os.path.basename(parse_file).split('.')[:-1]) + elements_file_path = os.path.join(output_data_path, current_file_name) + elements_file_name = 'elements.pickle' + concepts_file_name = 'concepts.pickle' + + # create an empty elements file. This also creates output dir if it doesn't exist. + log.debug(f"Creating empty file: {elements_file_path}/element_file.json") + storage.write_object({}, os.path.join(elements_file_path, 'element_file.json')) + log.debug(parse_file) + log.debug(parser) + elements = parser(parse_file) + log.debug(elements) + crawler.elements = elements + + # @TODO propose for Dug to make this a crawler class init parameter(??) + crawler.crawlspace = elements_file_path + log.debug(f"Crawler annotator: {crawler.annotator}") + crawler.annotate_elements() + + # Extract out the concepts gotten out of annotation + # Extract out the elements + non_expanded_concepts = crawler.concepts + elements = crawler.elements + + # Write pickles of objects to file + log.info(f"Parsed and annotated: {parse_file}") + elements_out_file = os.path.join(elements_file_path, elements_file_name) + storage.write_object(elements, elements_out_file) + log.info(f"Pickled annotated elements to : {elements_file_path}/{elements_file_name}") + concepts_out_file = os.path.join(elements_file_path, concepts_file_name) + storage.write_object(non_expanded_concepts, concepts_out_file) + log.info(f"Pickled annotated concepts to : {elements_file_path}/{concepts_file_name}") + + def make_edge(self, + subj, + obj, + predicate='biolink:related_to', + predicate_label='related to', + relation='biolink:related_to', + relation_label='related to' + ): + """ + Create an edge between two nodes. + + :param subj: The identifier of the subject. + :param pred: The predicate linking the subject and object. + :param obj: The object of the relation. + :param predicate: Biolink compatible edge type. + :param predicate_label: Edge label. + :param relation: Ontological edge type. + :param relation_label: Ontological edge type label. + :returns: Returns and edge. + """ + edge_id = hashlib.md5(f'{subj}{predicate}{obj}'.encode('utf-8')).hexdigest() + return { + "subject": subj, + "predicate": predicate, + "predicate_label": predicate_label, + "id": edge_id, + "relation": relation, + "relation_label": relation_label, + "object": obj, + "provided_by": "renci.bdc.semanticsearch.annotator" + } + + def convert_to_kgx_json(self, elements, written_nodes=set()): + """ + Given an annotated and normalized set of study variables, + generate a KGX compliant graph given the normalized annotations. + Write that grpah to a graph database. + See BioLink Model for category descriptions. 
https://biolink.github.io/biolink-model/notes.html + """ + graph = { + "nodes": [], + "edges": [] + } + edges = graph['edges'] + nodes = graph['nodes'] + + for index, element in enumerate(elements): + # DugElement means a variable (Study variable...) + if not isinstance(element, DugElement): + continue + study_id = element.collection_id + if study_id not in written_nodes: + nodes.append({ + "id": study_id, + "category": ["biolink:Study"], + "name": study_id + }) + written_nodes.add(study_id) + """ connect the study and the variable. """ + edges.append(self.make_edge( + subj=element.id, + relation_label='part of', + relation='BFO:0000050', + obj=study_id, + predicate='biolink:part_of', + predicate_label='part of')) + edges.append(self.make_edge( + subj=study_id, + relation_label='has part', + relation="BFO:0000051", + obj=element.id, + predicate='biolink:has_part', + predicate_label='has part')) + + """ a node for the variable. Should be BL compatible """ + variable_node = { + "id": element.id, + "name": element.name, + "category": ["biolink:StudyVariable"], + "description": element.description.replace("'", '`').replace('\n', ' ') # bulk loader parsing issue + } + if element.id not in written_nodes: + nodes.append(variable_node) + written_nodes.add(element.id) + + for identifier, metadata in element.concepts.items(): + identifier_object = metadata.identifiers.get(identifier) + # This logic is treating DBGap files. + # First item in current DBGap xml files is a topmed tag, + # This is treated as a DugConcept Object. But since its not + # a concept we get from annotation (?) its never added to + # variable.concepts.items (Where variable is a DugElement object) + # The following logic is trying to extract types, and for the + # aformentioned topmed tag it adds `biolink:InfomrmationContentEntity` + # Maybe a better solution could be adding types on DugConcept objects + # More specifically Biolink compatible types (?) + # + if identifier_object: + category = identifier_object.types + elif identifier.startswith("TOPMED.TAG:"): + category = ["biolink:InformationContentEntity"] + else: + continue + if identifier not in written_nodes: + if isinstance(category, str): + bl_element = self.bl_toolkit.toolkit.get_element(category) + category = [bl_element.class_uri or bl_element.slot_uri] + nodes.append({ + "id": identifier, + "category": category, + "name": metadata.name + }) + written_nodes.add(identifier) + # related to edge + edges.append(self.make_edge( + subj=element.id, + obj=identifier + )) + # related to edge + edges.append(self.make_edge( + subj=identifier, + obj=element.id)) + return graph + + def make_tagged_kg(self, elements): + """ Make a Translator standard knowledge graph representing + tagged study variables. + :param variables: The variables to model. + :param tags: The tags characterizing the variables. + :returns: Returns dictionary with nodes and edges modeling a Translator/Biolink KG. + """ + graph = { + "nodes": [], + "edges": [] + } + edges = graph['edges'] + nodes = graph['nodes'] + studies = {} + + """ Create graph elements to model tags and their + links to identifiers gathered by semantic tagging. """ + tag_map = {} + # @TODO extract this into config or maybe dug ?? 
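+        # Only DugConcept objects with this type become tag nodes below; study variables are then added via convert_to_kgx_json.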
+ topmed_tag_concept_type = "TOPMed Phenotype Concept" + nodes_written = set() + for tag in elements: + if not (isinstance(tag, DugConcept) and tag.type == topmed_tag_concept_type): + continue + tag_id = tag.id + tag_map[tag_id] = tag + nodes.append({ + "id": tag_id, + "name": tag.name, + "description": tag.description.replace("'", "`"), + "category": ["biolink:InformationContentEntity"] + }) + """ Link ontology identifiers we've found for this tag via nlp. """ + for identifier, metadata in tag.identifiers.items(): + if isinstance(metadata.types, str): + bl_element = self.bl_toolkit.toolkit.get_element(metadata.types) + category = [bl_element.class_uri or bl_element.slot_uri] + else: + category = metadata.types + synonyms = metadata.synonyms if metadata.synonyms else [] + nodes.append({ + "id": identifier, + "name": metadata.label, + "category": category, + "synonyms": synonyms + }) + nodes_written.add(identifier) + edges.append(self.make_edge( + subj=tag_id, + obj=identifier)) + edges.append(self.make_edge( + subj=identifier, + obj=tag_id)) + + concepts_graph = self.convert_to_kgx_json(elements, written_nodes=nodes_written) + graph['nodes'] += concepts_graph['nodes'] + graph['edges'] += concepts_graph['edges'] + + return graph + + def index_elements(self, elements_file): + log.info(f"Indexing {elements_file}...") + elements = storage.read_object(elements_file) + count = 0 + total = len(elements) + # Index Annotated Elements + log.info(f"found {len(elements)} from elements files.") + for element in elements: + count += 1 + # Only index DugElements as concepts will be indexed differently in next step + if not isinstance(element, DugConcept): + # override data-type with mapping values + if element.type.lower() in self.element_mapping: + element.type = self.element_mapping[element.type.lower()] + self.index_obj.index_element(element, index=self.variables_index) + percent_complete = (count / total) * 100 + if percent_complete % 10 == 0: + log.info(f"{percent_complete} %") + log.info(f"Done indexing {elements_file}.") + + def validate_indexed_elements(self, elements_file): + elements = [x for x in storage.read_object(elements_file) if not isinstance(x, DugConcept)] + # Pick ~ 10 % + sample_size = int(len(elements) * 0.1) + test_elements = elements[:sample_size] # random.choices(elements, k=sample_size) + log.info(f"Picked {len(test_elements)} from {elements_file} for validation.") + for element in test_elements: + # Pick a concept + concepts = [element.concepts[curie] for curie in element.concepts if element.concepts[curie].name] + + if len(concepts): + # Pick the first concept + concept = concepts[0] + curie = concept.id + search_term = re.sub(r'[^a-zA-Z0-9_\ ]+', '', concept.name) + log.debug(f"Searching for Concept: {curie} and Search term: {search_term}") + all_elements_ids = self._search_elements(curie, search_term) + present = element.id in all_elements_ids + if not present: + log.error(f"Did not find expected variable {element.id} in search result.") + log.error(f"Concept id : {concept.id}, Search term: {search_term}") + raise Exception(f"Validation exception - did not find variable {element.id} " + f"from {str(elements_file)}" + f"when searching variable index with" + f" Concept ID : {concept.id} using Search Term : {search_term} ") + else: + log.info( + f"{element.id} has no concepts annotated. Skipping validation for it." 
+ ) + + def _search_elements(self, curie, search_term): + response = self.event_loop.run_until_complete(self.search_obj.search_vars_unscored( + concept=curie, + query=search_term + )) + ids_dict = [] + if 'total_items' in response: + if response['total_items'] == 0: + log.error(f"No search elements returned for variable search: {self.variables_index}.") + log.error(f"Concept id : {curie}, Search term: {search_term}") + raise Exception(f"Validation error - Did not find {curie} for" + f"Search term: {search_term}") + else: + del response['total_items'] + for element_type in response: + all_elements_ids = [e['id'] for e in + reduce(lambda x, y: x + y['elements'], response[element_type], [])] + ids_dict += all_elements_ids + return ids_dict + + def crawl_concepts(self, concepts, data_set_name): + """ + Adds tranql KG to Concepts, terms grabbed from KG are also added as search terms + :param concepts: + :param data_set_name: + :return: + """ + crawl_dir = storage.dug_crawl_path('crawl_output') + output_file_name = os.path.join(data_set_name, 'expanded_concepts.pickle') + extracted_dug_elements_file_name = os.path.join(data_set_name, 'extracted_graph_elements.pickle') + output_file = storage.dug_expanded_concepts_path(output_file_name) + extracted_output_file = storage.dug_expanded_concepts_path(extracted_dug_elements_file_name) + Path(crawl_dir).mkdir(parents=True, exist_ok=True) + extracted_dug_elements = [] + log.debug("Creating Dug Crawler object") + crawler = Crawler( + crawl_file="", + parser=None, + annotator=None, + tranqlizer=self.tranqlizer, + tranql_queries=self.tranql_queries, + http_session=self.cached_session, + ) + crawler.crawlspace = crawl_dir + counter = 0 + total = len(concepts) + for concept_id, concept in concepts.items(): + counter += 1 + try: + crawler.expand_concept(concept) + concept.set_search_terms() + concept.set_optional_terms() + except Exception as e: + log.error(concept) + raise e + for query in self.node_to_element_queries: + log.info(query) + casting_config = query['casting_config'] + tranql_source = query['tranql_source'] + dug_element_type = query['output_dug_type'] + new_elements = crawler.expand_to_dug_element( + concept=concept, + casting_config=casting_config, + dug_element_type=dug_element_type, + tranql_source=tranql_source + ) + log.debug("extracted:") + log.debug(str(list([el.get_searchable_dict() for el in new_elements]))) + extracted_dug_elements += new_elements + concept.clean() + percent_complete = int((counter / total) * 100) + if percent_complete % 10 == 0: + log.info(f"{percent_complete}%") + storage.write_object(obj=concepts, path=output_file) + storage.write_object(obj=extracted_dug_elements, path=extracted_output_file) + + def index_concepts(self, concepts): + log.info("Indexing Concepts") + total = len(concepts) + count = 0 + for concept_id, concept in concepts.items(): + count += 1 + self.index_obj.index_concept(concept, index=self.concepts_index) + # Index knowledge graph answers for each concept + for kg_answer_id, kg_answer in concept.kg_answers.items(): + self.index_obj.index_kg_answer( + concept_id=concept_id, + kg_answer=kg_answer, + index=self.kg_index, + id_suffix=kg_answer_id + ) + percent_complete = int((count / total) * 100) + if percent_complete % 10 == 0: + log.info(f"{percent_complete} %") + log.info("Done Indexing concepts") + + def validate_indexed_concepts(self, elements, concepts): + """ + Validates linked concepts are searchable + :param elements: Annotated dug elements + :param concepts: Crawled (expanded) concepts + 
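        :raises Exception: when validation fails, i.e. a sampled concept's search terms do not return the expected variables from the index.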
:return: + """ + # 1 . Find concepts with KG <= 10% of all concepts, + # <= because we might have no results for some concepts from tranql + sample_concepts = {key: value for key, value in concepts.items() if value.kg_answers} + if len(concepts) == 0: + log.info(f"No Concepts found.") + return + log.info( + f"Found only {len(sample_concepts)} Concepts with Knowledge graph out of {len(concepts)}. {(len(sample_concepts) / len(concepts)) * 100} %") + # 2. pick elements that have concepts in the sample concepts set + sample_elements = {} + for element in elements: + if isinstance(element, DugConcept): + continue + for concept in element.concepts: + # add elements that have kg + if concept in sample_concepts: + sample_elements[concept] = sample_elements.get(concept, set()) + sample_elements[concept].add(element.id) + + # Time for some validation + for curie in concepts: + concept = concepts[curie] + if not len(concept.kg_answers): + continue + search_terms = [] + for key in concept.kg_answers: + kg_object = concept.kg_answers[key] + search_terms += kg_object.get_node_names() + search_terms += kg_object.get_node_synonyms() + # reduce(lambda x,y: x + y, [[node.get("name")] + node.get("synonyms", []) + # for node in concept.kg_answers["knowledge_graph"]["nodes"]], []) + # validation here is that for any of these nodes we should get back + # the variable. + # make unique + search_terms_cap = 10 + search_terms = list(set(search_terms))[:search_terms_cap] + log.debug(f"Using {len(search_terms)} Search terms for concept {curie}") + for search_term in search_terms: + # avoids elastic failure due to some reserved characters + # 'search_phase_execution_exception', 'token_mgr_error: Lexical error ... + search_term = re.sub(r'[^a-zA-Z0-9_\ ]+', '', search_term) + + searched_element_ids = self._search_elements(curie, search_term) + + if curie not in sample_elements: + log.error(f"Did not find Curie id {curie} in Elements.") + log.error(f"Concept id : {concept.id}, Search term: {search_term}") + raise Exception(f"Validation error - Did not find {element.id} for" + f" Concept id : {concept.id}, Search term: {search_term}") + else: + present = bool(len([x for x in sample_elements[curie] if x in searched_element_ids])) + if not present: + log.error(f"Did not find expected variable {element.id} in search result.") + log.error(f"Concept id : {concept.id}, Search term: {search_term}") + raise Exception(f"Validation error - Did not find {element.id} for" + f" Concept id : {concept.id}, Search term: {search_term}") + + def clear_index(self, index_id): + exists = self.search_obj.es.indices.exists(index=index_id) + if exists: + log.info(f"Deleting index {index_id}") + response = self.event_loop.run_until_complete(self.search_obj.es.indices.delete(index=index_id)) + log.info(f"Cleared Elastic : {response}") + log.info("Re-initializing the indicies") + self.index_obj.init_indices() + + def clear_variables_index(self): + self.clear_index(self.variables_index) + + def clear_kg_index(self): + self.clear_index(self.kg_index) + + def clear_concepts_index(self): + self.clear_index(self.concepts_index) + + +class DugUtil(): + + @staticmethod + def clear_annotation_cached(config=None, to_string=False): + with Dug(config, to_string=to_string) as dug: + annotation_path = storage.dug_annotation_path("") + storage.clear_dir(annotation_path) + # Clear http session cache + if config.annotation.clear_http_cache: + dug.cached_session.cache.clear() + + @staticmethod + def annotate_db_gap_files(config=None, to_string=False, 
input_data_path=None, output_data_path=None): + with Dug(config, to_string=to_string) as dug: + if not input_data_path: + files = storage.dug_dd_xml_objects( + input_data_path=input_data_path) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) + parser_name = "DbGaP" + dug.annotate_files(parser_name=parser_name, + parsable_files=files, + output_data_path=output_data_path) + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + @staticmethod + def annotate_anvil_files(config=None, to_string=False, + input_data_path=None, output_data_path=None): + with Dug(config, to_string=to_string) as dug: + if not input_data_path: + files = storage.dug_anvil_objects( + input_data_path=input_data_path) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) + parser_name = "Anvil" + dug.annotate_files(parser_name=parser_name, + parsable_files=files, + output_data_path=output_data_path) + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + @staticmethod + def annotate_cancer_commons_files(config=None, to_string=False, + input_data_path=None, + output_data_path=None): + with Dug(config, to_string=to_string) as dug: + if not input_data_path: + files = storage.dug_crdc_objects( + input_data_path=input_data_path) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) + parser_name = "crdc" + dug.annotate_files(parser_name=parser_name, + parsable_files=files, + output_data_path=output_data_path) + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + @staticmethod + def annotate_kids_first_files(config=None, to_string=False, + input_data_path=None, output_data_path=None): + with Dug(config, to_string=to_string) as dug: + if not input_data_path: + files = storage.dug_kfdrc_objects( + input_data_path=input_data_path) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) + parser_name = "kfdrc" + dug.annotate_files(parser_name=parser_name, + parsable_files=files, + output_data_path=output_data_path) + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + @staticmethod + def annotate_nida_files(config=None, to_string=False, + input_data_path=None, output_data_path=None): + with Dug(config, to_string=to_string) as dug: + if not input_data_path: + files = storage.dug_nida_objects( + input_data_path=input_data_path) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) + parser_name = "NIDA" + dug.annotate_files(parser_name=parser_name, + parsable_files=files, + output_data_path=output_data_path) + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + @staticmethod + def annotate_sparc_files(config=None, to_string=False, + input_data_path=None, output_data_path=None): + with Dug(config, to_string=to_string) as dug: + if not input_data_path: + files = storage.dug_sparc_objects( + input_data_path=input_data_path) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) + parser_name = "SciCrunch" + dug.annotate_files(parser_name=parser_name, + parsable_files=files, + output_data_path=output_data_path) + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + @staticmethod + def annotate_sprint_files(config=None, to_string=False, + input_data_path=None, output_data_path=None): + with Dug(config, to_string=to_string) as dug: + if not input_data_path: + files = 
storage.dug_sprint_objects( + input_data_path=input_data_path) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) + parser_name = "SPRINT" + dug.annotate_files(parser_name=parser_name, + parsable_files=files, + output_data_path=output_data_path) + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + @staticmethod + def annotate_topmed_files(config=None, to_string=False, + input_data_path=None, output_data_path=None): + with Dug(config, to_string=to_string) as dug: + if not input_data_path: + files = storage.dug_topmed_objects( + input_data_path=None) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) + parser_name = "TOPMedTag" + log.info(files) + dug.annotate_files(parser_name=parser_name, + parsable_files=files, + output_data_path=output_data_path) + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + @staticmethod + def annotate_bacpac_files(config=None, to_string=False, + input_data_path=None, output_data_path=None): + + log.info(f"Input data path is: {input_data_path}") + with Dug(config, to_string=to_string) as dug: + files = storage.dug_bacpac_objects( + input_data_path=input_data_path) + + parser_name = "BACPAC" + log.info(files) + dug.annotate_files(parser_name=parser_name, + parsable_files=files, + output_data_path=output_data_path) + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + + @staticmethod + def annotate_heal_study_files(config=None, to_string=False, + input_data_path=None, output_data_path=None): + with Dug(config, to_string=to_string) as dug: + if not input_data_path: + files = storage.dug_heal_study_objects( + input_data_path=None) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) + + parser_name = "heal-studies" + log.info(files) + dug.annotate_files(parser_name=parser_name, + parsable_files=files, + output_data_path=output_data_path) + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + + @staticmethod + def annotate_heal_research_program_files(config=None, to_string=False, + input_data_path=None, + output_data_path=None): + with Dug(config, to_string=to_string) as dug: + if not input_data_path: + files = storage.dug_heal_research_program_objects( + input_data_path=None) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) + parser_name = "heal-research" + log.info(files) + dug.annotate_files(parser_name=parser_name, + parsable_files=files, + output_data_path=output_data_path) + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + @staticmethod + def make_kg_tagged(config=None, to_string=False, input_data_path=None, output_data_path=None): + with Dug(config, to_string=to_string) as dug: + output_base_path = output_data_path + if not output_data_path: + output_base_path = storage.dug_kgx_path("") + storage.clear_dir(output_base_path) + log.info("Starting building KGX files") + if not input_data_path: + elements_files = storage.dug_elements_objects() + else: + import glob + glob_pattern = str(input_data_path / "**" / 'elements.pickle') + elements_files = glob.glob(glob_pattern, recursive=True) + log.info(f"making kgx files for the following pickles: {elements_files}") + for file in elements_files: + elements = storage.read_object(file) + if "topmed_" in file: + kg = dug.make_tagged_kg(elements) + else: + kg = dug.convert_to_kgx_json(elements) + dug_base_file_name = 
file.split(os.path.sep)[-2] + output_file_path = os.path.join(output_base_path, dug_base_file_name + '_kgx.json') + storage.write_object(kg, output_file_path) + log.info(f"Wrote {len(kg['nodes'])} nodes and {len(kg['edges'])} edges, to {output_file_path}.") + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + @staticmethod + def index_variables(config=None, to_string=False): + with Dug(config, to_string=to_string) as dug: + dug.clear_variables_index() + elements_object_files = storage.dug_elements_objects() + for file in elements_object_files: + dug.index_elements(file) + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + @staticmethod + def index_extracted_elements(config=None, to_string=False): + with Dug(config, to_string=to_string) as dug: + elements_object_files = storage.dug_extracted_elements_objects() + for file in elements_object_files: + dug.index_elements(file) + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + @staticmethod + def index_concepts(config=None, to_string=False): + with Dug(config=config, to_string=to_string) as dug: + # These are concepts that have knowledge graphs from tranql + # clear out concepts and kg indicies from previous runs + dug.clear_concepts_index() + dug.clear_kg_index() + expanded_concepts_files = storage.dug_expanded_concept_objects() + for file in expanded_concepts_files: + concepts = storage.read_object(file) + dug.index_concepts(concepts=concepts) + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + @staticmethod + def validate_indexed_variables(config=None, to_string=False): + with Dug(config, to_string=to_string) as dug: + elements_object_files = storage.dug_elements_objects() + for elements_object_file in elements_object_files: + log.info(f"Validating {elements_object_file}") + dug.validate_indexed_elements(elements_object_file) + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + @staticmethod + def crawl_tranql(config=None, to_string=False): + log.info(config.dict) + with Dug(config, to_string=to_string) as dug: + concepts_files = storage.dug_concepts_objects() + crawl_dir = storage.dug_crawl_path('crawl_output') + log.info(f'Clearing crawl output dir {crawl_dir}') + storage.clear_dir(crawl_dir) + expanded_concepts_dir = storage.dug_expanded_concepts_path("") + log.info(f'Clearing expanded concepts dir: {expanded_concepts_dir}') + storage.clear_dir(expanded_concepts_dir) + log.info(f'Crawling Dug Concepts, found {len(concepts_files)} file(s).') + for file in concepts_files: + data_set = storage.read_object(file) + original_variables_dataset_name = os.path.split(os.path.dirname(file))[-1] + dug.crawl_concepts(concepts=data_set, + data_set_name=original_variables_dataset_name) + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + @staticmethod + def validate_indexed_concepts(config=None, to_string=False): + with Dug(config, to_string=to_string) as dug: + get_data_set_name = lambda file: os.path.split(os.path.dirname(file))[-1] + expanded_concepts_files_dict = { + get_data_set_name(file): file for file in storage.dug_expanded_concept_objects() + } + annotated_elements_files_dict = { + get_data_set_name(file): file for file in storage.dug_elements_objects() + } + try: + assert len(expanded_concepts_files_dict) == len(annotated_elements_files_dict) + except: + log.error("Files Annotated Elements files and Expanded concepts files, should be pairs") + 
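                # Work out which side of the pairing is missing so the error log points at the failing upstream task.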
if len(expanded_concepts_files_dict) > len(annotated_elements_files_dict): + log.error("Some Annotated Elements files (from load_and_annotate task) are missing") + else: + log.error("Some Expanded Concepts files (from crawl task) are missing") + log.error(f"Annotated Datasets : {list(annotated_elements_files_dict.keys())}") + log.error(f"Expanded Concepts Datasets: {list(expanded_concepts_files_dict.keys())}") + exit(-1) + for data_set_name in annotated_elements_files_dict: + log.debug(f"Reading concepts and elements for dataset {data_set_name}") + elements_file_path = annotated_elements_files_dict[data_set_name] + concepts_file_path = expanded_concepts_files_dict[data_set_name] + dug_elements = storage.read_object(elements_file_path) + dug_concepts = storage.read_object(concepts_file_path) + log.debug(f"Read {len(dug_elements)} elements, and {len(dug_concepts)} Concepts") + log.info(f"Validating {data_set_name}") + dug.validate_indexed_concepts(elements=dug_elements, concepts=dug_concepts) + output_log = dug.log_stream.getvalue() if to_string else '' + return output_log + + +class FileFetcher: + + def __init__( + self, + remote_host: str, + remote_dir: Union[str, Path], + local_dir: Union[str, Path] = "." + ): + self.remote_host = remote_host + self.remote_dir = remote_dir.rstrip("/") if isinstance(remote_dir, str) else str(remote_dir.as_posix()) + self.local_dir = Path(local_dir).resolve() + + def __call__(self, remote_file_path: Union[str, Path]) -> Path: + remote_path = self.remote_dir + "/" + remote_file_path + local_path = self.local_dir / remote_file_path + url = f"{self.remote_host}{remote_path}" + log.debug(f"Fetching {url}") + try: + response = requests.get(url, allow_redirects=True) + except Exception as e: + log.error(f"Unexpected {e.__class__.__name__}: {e}") + raise RuntimeError(f"Unable to fetch {url}") + else: + log.debug(f"Response: {response.status_code}") + if response.status_code == 200: + with local_path.open('wb') as file_obj: + file_obj.write(response.content) + return local_path + else: + log.debug(f"Unable to fetch {url}: {response.status_code}") + raise RuntimeError(f"Unable to fetch {url}") + + +def get_versioned_files(config: RogerConfig, data_format, output_file_path, data_store="s3", unzip=False): + """ + Fetches a dug inpu data files to input file directory + """ + meta_data = storage.read_relative_object("../../metadata.yaml") + output_dir: Path = storage.dug_input_files_path(output_file_path) + # clear dir + storage.clear_dir(output_dir) + data_sets = config.dug_inputs.data_sets + log.info(f"dataset: {data_sets}") + pulled_files = [] + s3_utils = S3Utils(config.s3_config) + for data_set in data_sets: + data_set_name, current_version = data_set.split(':') + for item in meta_data["dug_inputs"]["versions"]: + if item["version"] == current_version and item["name"] == data_set_name and item["format"] == data_format: + if data_store == "s3": + for filename in item["files"]["s3"]: + log.info(f"Fetching {filename}") + output_name = filename.split('/')[-1] + output_path = output_dir / output_name + s3_utils.get( + str(filename), + str(output_path), + ) + if unzip: + log.info(f"Unzipping {output_path}") + tar = tarfile.open(str(output_path)) + tar.extractall(path=output_dir) + pulled_files.append(output_path) + else: + for filename in item["files"]["stars"]: + log.info(f"Fetching {filename}") + # fetch from stars + remote_host = config.annotation_base_data_uri + fetch = FileFetcher( + remote_host=remote_host, + remote_dir=current_version, + local_dir=output_dir) + 
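                        # FileFetcher instances are callable; invoking one downloads the remote file and returns its local Path.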
output_path = fetch(filename) + if unzip: + log.info(f"Unzipping {output_path}") + tar = tarfile.open(str(output_path)) + tar.extractall(path=output_dir) + pulled_files.append(output_path) + return [str(filename) for filename in pulled_files] + + +def get_dbgap_files(config: RogerConfig, to_string=False) -> List[str]: + return get_versioned_files(config, 'dbGaP', 'db_gap', data_store=config.dug_inputs.data_source, unzip=True) + + +def get_nida_files(config: RogerConfig, to_string=False) -> List[str]: + return get_versioned_files(config, "nida", "nida", data_store=config.dug_inputs.data_source, unzip=True) + + +def get_sparc_files(config: RogerConfig, to_string=False) -> List[str]: + return get_versioned_files(config, "sparc", "sparc", data_store=config.dug_inputs.data_source, unzip=True) + + +def get_anvil_files(config: RogerConfig, to_string=False) -> List[str]: + return get_versioned_files(config, "anvil", "anvil", data_store=config.dug_inputs.data_source, unzip=True) + + +def get_kids_first_files(config: RogerConfig, to_string=False) -> List[str]: + return get_versioned_files(config, "kfdrc", "kfdrc", data_store=config.dug_inputs.data_source, unzip=True) + + +def get_cancer_data_commons_files(config: RogerConfig, to_string=False) -> List[str]: + return get_versioned_files(config, "crdc", "crdc", data_store=config.dug_inputs.data_source, unzip=True) + + +def get_sprint_files(config: RogerConfig, to_string=False) -> List[str]: + return get_versioned_files(config, "sprint", "sprint", data_store=config.dug_inputs.data_source, unzip=True) + +def get_bacpac_files(config: RogerConfig, to_string=False) -> List[str]: + return get_versioned_files(config, "bacpac", "bacpac", data_store=config.dug_inputs.data_source, unzip=True) + +def get_topmed_files(config: RogerConfig, to_string=False) -> List[str]: + return get_versioned_files(config, "topmed", "topmed", data_store=config.dug_inputs.data_source, unzip=False) + +def get_heal_study_files(config: RogerConfig, to_string=False) -> List[str]: + return get_versioned_files(config, "heal-studies", "heal-study-imports", data_store=config.dug_inputs.data_source, unzip=True) + +def get_heal_research_program_files(config: RogerConfig, to_string=False) -> List[str]: + return get_versioned_files(config, "heal-research", "heal-research-programs", data_store=config.dug_inputs.data_source, unzip=True) diff --git a/dags/knowledge_graph_build.py b/dags/knowledge_graph_build.py new file mode 100644 index 00000000..de28aa78 --- /dev/null +++ b/dags/knowledge_graph_build.py @@ -0,0 +1,102 @@ +# -*- coding: utf-8 -*- +# + +""" +An Airflow workflow for the Roger Translator KGX data pipeline. +""" + +from airflow.models import DAG +from airflow.operators.empty import EmptyOperator +import roger +from roger.tasks import default_args, create_python_task +from roger.config import config + +""" Build the workflow's tasks and DAG. """ +with DAG( + dag_id='knowledge_graph_build', + default_args=default_args, + schedule_interval=None +) as dag: + + """ Build the workflow tasks. """ + intro = EmptyOperator(task_id='Intro') + + # Merge nodes needs inputs from two sources + # 1. baseline and/or CDE KGX files from LakeFS (External repo) + # 2. 
Infer which local kgx files are needed based on dug_inputs and grab them from the current repo + + # build the annotate and index pipeline output locations + #lakefs://yk-heal/main/annotate_and_index/crdc_dataset_pipeline_task_group.make_kgx_crdc/ + working_repo = config.lakefs_config.repo + branch = config.lakefs_config.branch + kgx_repos = config.kgx.data_sets + input_repos = [{ + 'name': repo.split(':')[0], + 'branch': repo.split(':')[1], + 'path': '*' + } for repo in kgx_repos] + + # Figure out a way to extract paths + get_path_on_lakefs = lambda d: f"annotate_and_index/{d}_dataset_pipeline_task_group.make_kgx_{d}/" + + + for dataset in config.dug_inputs.data_sets: + dataset_name = dataset.split(":")[0] + # add datasets from the other pipeline + input_repos.append( + { + 'name': working_repo, + 'branch': branch, + 'path': get_path_on_lakefs(dataset_name) + } + ) + + merge_nodes = create_python_task (dag, name="MergeNodes", + a_callable=roger.merge_nodes, + external_repos=input_repos + ) + + # The rest of these guys can just operate on the local lakefs repo/branch + # we need to add input dir and output dir similar to what we did for dug tasks + + create_nodes_schema = create_python_task(dag, + name="CreateNodesSchema", + a_callable=roger.create_nodes_schema + ) + create_edges_schema = create_python_task(dag, + name="CreateEdgesSchema", + a_callable=roger.create_edges_schema) + + create_bulk_load_nodes = create_python_task(dag, + name="CreateBulkLoadNodes", + a_callable=roger.create_bulk_nodes) + create_bulk_load_edges = create_python_task(dag, + name="CreateBulkLoadEdges", + a_callable=roger.create_bulk_edges) + bulk_load = create_python_task(dag, + name="BulkLoad", + a_callable=roger.bulk_load, + no_output_files=True) + check_tranql = create_python_task(dag, + name="CheckTranql", + a_callable=roger.check_tranql, + no_output_files=True) + validate = create_python_task(dag, + name="Validate", + a_callable=roger.validate, + no_output_files=True) + + + """ Build the DAG. 
""" + merge_nodes.set_upstream(intro) + create_nodes_schema.set_upstream(merge_nodes) + create_edges_schema.set_upstream(merge_nodes) + create_bulk_load_nodes.set_upstream(create_nodes_schema) + create_bulk_load_nodes.set_upstream(merge_nodes) + create_bulk_load_edges.set_upstream(create_edges_schema) + create_bulk_load_edges.set_upstream(merge_nodes) + bulk_load.set_upstream(create_bulk_load_nodes) + bulk_load.set_upstream(create_bulk_load_edges) + validate.set_upstream(bulk_load) + check_tranql.set_upstream(bulk_load) + diff --git a/dags/metadata.yaml b/dags/metadata.yaml new file mode 100644 index 00000000..0cedb6a0 --- /dev/null +++ b/dags/metadata.yaml @@ -0,0 +1,206 @@ +kgx: + versions: + - files: + - biolink-v1.0.json + - ctd-v1.0.json + - gtopdb-v1.0.json + - hetio-v1.0.json + - hgnc-v1.0.json + - hmdb-v1.0.json + - kegg-v1.0.json + - mychem-v1.0.json + - ontological-hierarchy-v1.0.json + - panther-v1.0.json + - foodb-v1.0.json + - pharos-v1.0.json + - intact-v1.0.json + - human-goa-v1.0.json + - uberongraph-v1.0.json + - viral-proteome-v1.0.json + version: v1.0 + name: baseline-graph + format: json + - files: + - biolink-v2.0.json + - ctd-v2.0.json + - gtopdb-v2.0.json + - hetio-v2.0.json + - hgnc-v2.0.json + - hmdb-v2.0.json + - kegg-v2.0.json + - mychem-v2.0.json + - ontological-hierarchy-v2.0.json + - panther-v2.0.json + - foodb-v2.0.json + - pharos-v2.0.json + - intact-v2.0.json + - human-goa-v2.0.json + - uberongraph-v2.0.json + - viral-proteome-v2.0.json + version: v2.0 + name: baseline-graph + format: json + - files: + - heal/sparc/curation-export-processed.json + version: v2.0 + name: sparc-kgx + format: json + - files: + - Biolink_edges_v3.0.jsonl + - Biolink_nodes_v3.0.jsonl + - CTD_edges_v3.0.jsonl + - CTD_nodes_v3.0.jsonl + - DrugCentral_edges_v3.0.jsonl + - DrugCentral_nodes_v3.0.jsonl + - GtoPdb_edges_v3.0.jsonl + - GtoPdb_nodes_v3.0.jsonl + - Hetio_edges_v3.0.jsonl + - Hetio_nodes_v3.0.jsonl + - HGNC_edges_v3.0.jsonl + - HGNC_nodes_v3.0.jsonl + - HMDB_edges_v3.0.jsonl + - HMDB_nodes_v3.0.jsonl + - HumanGOA_edges_v3.0.jsonl + - HumanGOA_nodes_v3.0.jsonl + - IntAct_edges_v3.0.jsonl + - IntAct_nodes_v3.0.jsonl + - OntologicalHierarchy_edges_v3.0.jsonl + - OntologicalHierarchy_nodes_v3.0.jsonl + - PANTHER_edges_v3.0.jsonl + - PANTHER_nodes_v3.0.jsonl + - PHAROS_edges_v3.0.jsonl + - PHAROS_nodes_v3.0.jsonl + - UberGraph_edges_v3.0.jsonl + - UberGraph_nodes_v3.0.jsonl + version: v3.0 + name: baseline-graph + format: jsonl + - version: test + files: + - panther.json + name: test + - version: v3.0 + name: cde-graph + format: jsonl + files: + - cde/annotated_edges_v3.0.jsonl + - cde/annotated_nodes_v3.0.jsonl + - version: v4.0 + name: baseline-graph + format: jsonl + files: + - baseline-4.0/edges_v4.0.jsonl + - baseline-4.0/nodes_v4.0.jsonl + - version: v4.0 + name: cde-graph + format: jsonl + files: + - cde/annotated_edges_v4.0.jsonl + - cde/annotated_nodes_v4.0.jsonl + - version: v5.0 + name: baseline-graph + format: jsonl + files: + - baseline-5.0/edges_v5.0.jsonl + - baseline-5.0/nodes_v5.0.jsonl + - version: v5.0 + name: cde-graph + format: jsonl + files: + - cde/annotated_edges_v5.0.jsonl + - cde/annotated_nodes_v5.0.jsonl +dug_inputs: + versions: + - name: bdc + version: v1.0 + files: + s3: + - "bdc/v1.0/bdc_dbgap_data_dicts.tar.gz" + stars: + - "bdc_dbgap_data_dicts.tar.gz" + format: dbGaP + - name: bdc + version: v2.0 + files: + s3: + - "bdc/v2.0/bdc_dbgap_data_dicts.tar.gz" + stars: + - "bdc_dbgap_data_dicts.tar.gz" + format: dbGaP + - name: bdc + version: v3.0 + 
files: + s3: + - "bdc/v3.0/bdc_dbgap_data_dicts.tar.gz" + format: dbGaP + - name: nida + version: v1.0 + files: + s3: + - "nida/v1.0/nida-12studies.tar.gz" + stars: + - "nida-12studies.tar.gz" + format: nida + - name: sparc + version: v1.0 + files: + s3: + - "sparc/v1.0/sparc-dbgap-xml-formatted.tar.gz" + stars: + - "sparc-dbgap-xml-formatted.tar.gz" + format: sparc + - name: topmed + version: v2.0 + files: + s3: + - "topmed/v2.0/topmed_tags_v2.0.json" + - "topmed/v2.0/topmed_variables_v2.0.csv" + stars: + - topmed_variables_v2.0.csv + - topmed_tags_v2.0.json + format: topmed + - name: anvil + version: v1.0 + files: + s3: + - "bdc/v1.0/anvil_dbgap_data_dicts.tar.gz" + stars: + - "anvil_dbgap_data_dicts.tar.gz" + format: anvil + - name: kfdrc + version: v1.0 + files: + s3: + - "bdc/v1.0/KFDRC.tar.gz" + format: kfdrc + - name: crdc + version: v1.0 + files: + s3: + - "bdc/v1.0/CRDC.tar.gz" + format: crdc + - name: sprint + version: v1.0 + files: + s3: + - "sprint/v1.0/StanfordSPRINT_DataDictionary_2020-12-16.tar.gz" + format: sprint + - name: bacpac + version: v1.0 + files: + s3: + - "heal-datasets/bacpac/bacpac_baseline_do_measures.tar.gz" + format: bacpac + - name: heal-studies + version: v1.0 + files: + s3: + - heal-datasets/ingest-8-23/heal_studies.tar.gz + - heal-datasets/ingest-8-23/heal_mds_import.tar.gz + format: heal-studies + - name: heal-research-programs + version: v1.0 + files: + s3: + - heal-datasets/ingest-8-23/heal_research_programs.tar.gz + format: heal-research diff --git a/dags/roger/__init__.py b/dags/roger/__init__.py new file mode 100644 index 00000000..e950b109 --- /dev/null +++ b/dags/roger/__init__.py @@ -0,0 +1,20 @@ +"Roger: an automated graph data curation pipeline." + +from roger.core.base import ( + Roger, + roger_cli, + get_kgx, + create_schema, + create_edges_schema, + create_nodes_schema, + merge_nodes, + create_bulk_load, + create_bulk_nodes, + create_bulk_edges, + bulk_load, + validate, + check_tranql, +) + +if __name__ == "__main__": + roger_cli() diff --git a/dags/roger/components/__init__.py b/dags/roger/components/__init__.py new file mode 100644 index 00000000..49314f47 --- /dev/null +++ b/dags/roger/components/__init__.py @@ -0,0 +1 @@ +"Data conversion utilities" diff --git a/dags/roger/components/data_conversion.py b/dags/roger/components/data_conversion.py new file mode 100644 index 00000000..46dd61fe --- /dev/null +++ b/dags/roger/components/data_conversion.py @@ -0,0 +1,71 @@ +"Data conversion utility methods" + +from typing import Any + + +_type_map = { + list.__name__: { + 'priority': 0, + 'constructor': lambda x: list([x]) + }, + str.__name__: { + 'priority': 1, + 'constructor': lambda x: str(x) + }, + bool.__name__: { + 'priority': 2, + 'constructor': lambda x: True if x else False + }, + float.__name__: { + 'priority': 2, + 'constructor': lambda x: float(x), + }, + int.__name__: { + 'priority': 2, + 'constructor': lambda x: int(x) + }, + type(None).__name__: { + 'priority': 3, + 'constructor': lambda x: '', + } +} + +def cast(value: Any, to_type: str): + """ + Parses a value to dest type. + :param value: value to parse + :param to_type: destination type + :return: parsed value + """ + if to_type not in _type_map: + raise TypeError( + f'Type {to_type} not found in conversion map. 
' + f'Available types are {_type_map.keys()}') + dest_type_constructor = _type_map[to_type]['constructor'] + return dest_type_constructor(value) + +def compare_types(data_type: str, data_type_2: str): + """ + Of two python types selects the one we would like to upcast to. + :param data_type: + :param data_type_2: + :return: + """ + assert data_type in _type_map, ( + f"Unrecognised type {data_type} From types:" + f"{list(_type_map.keys())}") + + assert data_type_2 in _type_map, ( + f"Unrecognised type {data_type} From types: " + f"{list(_type_map.keys())}") + + d1_val = _type_map[data_type]['priority'] + d2_val = _type_map[data_type_2]['priority'] + + if data_type != data_type_2 and d1_val == d2_val: + # For float int and bool have same priority + # treat them as strings. + d1_val = (d1_val - 1) + data_type = str.__name__ + + return data_type if d1_val < d2_val else data_type_2 diff --git a/dags/roger/components/data_conversion_utils.py b/dags/roger/components/data_conversion_utils.py new file mode 100644 index 00000000..f6f60eb0 --- /dev/null +++ b/dags/roger/components/data_conversion_utils.py @@ -0,0 +1,69 @@ +from typing import Any + + +class TypeConversionUtil: + + type_map = { + list.__name__: { + 'priority': 0, + 'constructor': lambda x: list([x]) + }, + str.__name__: { + 'priority': 1, + 'constructor': lambda x: str(x) + }, + bool.__name__: { + 'priority': 2, + 'constructor': lambda x: True if x else False + }, + float.__name__: { + 'priority': 2, + 'constructor': lambda x: float(x), + }, + int.__name__: { + 'priority': 2, + 'constructor': lambda x: int(x) + }, + type(None).__name__: { + 'priority': 3, + 'constructor': lambda x: '', + } + } + + @staticmethod + def cast(value: Any, to_type: str): + """ + Parses a value to dest type. + :param value: value to parse + :param to_type: destination type + :return: parsed value + """ + if to_type not in TypeConversionUtil.type_map: + raise TypeError(f'Type {to_type} not found in conversion map. Available types are {TypeConversionUtil.type_map.keys()}') + dest_type_constructor = TypeConversionUtil.type_map[to_type]['constructor'] + return dest_type_constructor(value) + + @staticmethod + def compare_types(data_type: str, data_type_2: str): + """ + Of two python types selects the one we would like to upcast to. + :param data_type: + :param data_type_2: + :return: + """ + assert data_type in TypeConversionUtil.type_map, f"Unrecognised type {data_type} From types:" \ + f"{list(TypeConversionUtil.type_map.keys())}" + + assert data_type_2 in TypeConversionUtil.type_map, f"Unrecognised type {data_type} From types: " \ + f"{list(TypeConversionUtil.type_map.keys())}" + + d1_val = TypeConversionUtil.type_map[data_type]['priority'] + d2_val = TypeConversionUtil.type_map[data_type_2]['priority'] + + if data_type != data_type_2 and d1_val == d2_val: + # For float int and bool have same priority + # treat them as strings. 
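+            # Lowering d1_val by one gives it str's priority, so the comparison below selects str.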
+ d1_val = (d1_val - 1) + data_type = str.__name__ + + return data_type if d1_val < d2_val else data_type_2 diff --git a/dags/roger/config/__init__.py b/dags/roger/config/__init__.py new file mode 100644 index 00000000..71111f39 --- /dev/null +++ b/dags/roger/config/__init__.py @@ -0,0 +1,399 @@ +import json +import os +import warnings +from dataclasses import dataclass, field +from pathlib import Path +from typing import Dict, Optional, List + +import yaml +from dug.config import Config as DugConfig +from flatten_dict import flatten, unflatten + +from ._base import DictLike +from .s3_config import S3Config + +CONFIG_FILENAME = Path(__file__).parent.resolve() / "config.yaml" + +@dataclass +class RedisConfig(DictLike): + username: str = "" + password: str = "" + host: str = "redis" + graph: str = "test" + port: int = 6379 + + def __post_init__(self): + self.port = int(self.port) + + +@dataclass +class LakefsConfig(DictLike): + host: str + access_key_id: str + secret_access_key: str + branch: str + repo: str + enabled: bool = False + + def __post_init__(self): + if isinstance(self.enabled, str): + self.enabled = self.enabled.lower() == "true" + + + +@dataclass +class LoggingConfig(DictLike): + level: str = "DEBUG" + format: str = '[%(name)s][%(filename)s][%(lineno)d][%(funcName)20s] %(levelname)s: %(message)s' + + +@dataclass +class KgxConfig(DictLike): + biolink_model_version: str = "1.5.0" + merge_db_temp_dir: str = "workspace" + data_sets: List = field(default_factory=lambda: ['baseline-graph:v5.0']) + + def __post_init__(self): + # Convert strings to list. In cases where this is passed as env variable with a single value + # cast it to a list. eg ROGER_KGX_DATA__SET="spark,baseline-data" could be converted to + # config.kgx.data_set = ["spark", "baseline-data"] + self.data_sets = [data_set.strip(" ") for data_set in self.data_sets.split(",")] \ + if isinstance(self.data_sets, str) else self.data_sets + + +@dataclass +class DugInputsConfig(DictLike): + data_source: str = 'stars' + data_sets: List = field(default_factory=lambda: ['topmed', 'bdc']) + + def __post_init__(self): + # Convert strings to list. In cases where this is passed as env variable with a single value + # cast it to a list. 
eg ROGER_KGX_DATA__SET="spark,baseline-data" could be converted to + # config.kgx.data_set = ["spark", "baseline-data"] + self.data_sets = [data_set.strip(" ") for data_set in self.data_sets.split(",")] \ + if isinstance(self.data_sets, str) else self.data_sets + + +@dataclass +class BulkLoaderConfig(DictLike): + separator: str = "0x1E" + enforce_schema: bool = False + skip_invalid_nodes: bool = False + skip_invalid_edges: bool = False + quote: int = 0 + max_token_count: int = 1024 + max_buffer_size: int = 2048 + max_token_size: int = 500 + index: list = field(default_factory=list) + full_text_index: list = field(default_factory=list) + + +@dataclass +class AnnotationConfig(DictLike): + annotator_type: str = "monarch" + annotator_args: dict = field( + default_factory=lambda: { + "monarch": { + "url": "https://api.monarchinitiative.org/api/nlp/annotate/entities?min_length=4&longest_only=false&include_abbreviation=false&include_acronym=false&include_numbers=false&content=" + }, + "sapbert": { + "classification_url": "https://med-nemo.apps.renci.org/annotate/", + "annotator_url": "https://babel-sapbert.apps.renci.org/annotate/", + "score_threshold": 0.8, + "bagel": { + "enabled": False, + "url": "https://bagel.apps.renci.org/group_synonyms_openai", + "prompt": "bagel/ask_classes", + "llm_args": { + "llm_model_name": "gpt-4o-2024-05-13", + "organization": "", + "access_key": "", + "llm_model_args": { + "top_p": 0, + "temperature": 0.1 + } + } + } + }, + } + ) + normalizer: str = "https://nodenormalization-sri.renci.org/get_normalized_nodes?curie=" + synonym_service: str = "https://onto.renci.org/synonyms/" + ontology_metadata: str = "https://api.monarchinitiative.org/api/bioentity/" + clear_http_cache: bool = False + preprocessor: dict = field(default_factory=lambda: + { + "debreviator": { + "BMI": "body mass index" + }, + "stopwords": "the", + } + ) + + ontology_greenlist: List[str] = field(default_factory=lambda: [ + "PATO", "CHEBI", "MONDO", "UBERON", "HP", "MESH", "UMLS" + ]) + + def __post_init__(self): + self.annotator_args["sapbert"]["bagel"]["enabled"] = self.annotator_args["sapbert"]["bagel"][ + "enabled"].lower() == "true" + + +@dataclass +class IndexingConfig(DictLike): + variables_index: str = "variables_index" + concepts_index: str = "concepts_index" + kg_index: str = "kg_index" + tranql_min_score: float = 0.2 + excluded_identifiers: List[str] = field(default_factory=lambda: [ + "CHEBI:17336" + ]) + + queries: dict = field(default_factory=lambda: { + "disease": ["disease", "phenotypic_feature"], + "pheno": ["phenotypic_feature", "disease"], + "anat": ["disease", "anatomical_entity"], + "chem_to_disease": ["chemical_substance", "disease"], + "phen_to_anat": ["phenotypic_feature", "anatomical_entity"], + "anat_to_disease": ["anatomical_entity", "disease"], + "anat_to_pheno": ["anatomical_entity", "phenotypic_feature"], + }) + tranql_endpoint: str = "http://tranql-service/tranql/query?dynamic_id_resolution=true&asynchronous=false" + # by default skips node to element queries + node_to_element_queries: dict = field(default_factory=lambda: {}) + element_mapping: str = "" + def __post_init__(self): + # convert element mapping to dict + if self.element_mapping and len(self.element_mapping.split(',')): + final_element_mapping = {} + for mapping in self.element_mapping.split(','): + if not mapping: + continue + original_name = mapping.split(':')[0].lower().strip() + final_name = mapping.split(':')[1].strip() + final_element_mapping[original_name] = final_name + self.element_mapping = 
final_element_mapping + node_to_el_enabled = True if str(self.node_to_element_queries.get("enabled")).lower() == "true" else False + final_node_to_element_queries = {} + if node_to_el_enabled: + for key in filter(lambda k: k != "enabled", self.node_to_element_queries.keys()): + final_node_to_element_queries[key] = self.node_to_element_queries[key] + self.node_to_element_queries = final_node_to_element_queries + +@dataclass +class ElasticsearchConfig(DictLike): + host: str = "elasticsearch" + username: str = "elastic" + password: str = "" + nboost_host: str = "" + scheme: str = "http" + ca_path: str = "" + + + +class RogerConfig(DictLike): + + OS_VAR_PREFIX = "ROGER_" + + def __init__(self, **kwargs): + self.redisgraph = RedisConfig(**kwargs.pop('redisgraph', {})) + self.logging = LoggingConfig(**kwargs.pop('logging', {})) + self.kgx = KgxConfig(**kwargs.pop('kgx', {})) + self.dug_inputs = DugInputsConfig(**kwargs.pop('dug_inputs', {})) + self.bulk_loader = BulkLoaderConfig(**kwargs.pop('bulk_loader', {})) + self.annotation = AnnotationConfig(**kwargs.pop('annotation', {})) + self.indexing = IndexingConfig(**kwargs.pop('indexing', {})) + self.elasticsearch = ElasticsearchConfig(**kwargs.pop('elasticsearch')) + self.s3_config = S3Config(**kwargs.pop('s3', {})) + + self.data_root: str = kwargs.pop("data_root", "") + self.dug_data_root: str = kwargs.pop("dug_data_root", "") + self.kgx_base_data_uri: str = kwargs.pop("kgx_base_data_uri", "") + self.annotation_base_data_uri: str = kwargs.pop("annotation_base_data_uri", "") + self.validation = kwargs.pop("validation") + self.dag_run = kwargs.pop('dag_run', None) + self.lakefs_config = LakefsConfig(**kwargs.pop("lakefs_config")) + + def to_dug_conf(self) -> DugConfig: + return DugConfig( + elastic_host=self.elasticsearch.host, + elastic_password=self.elasticsearch.password, + elastic_username=self.elasticsearch.username, + elastic_scheme=self.elasticsearch.scheme, + elastic_ca_path=self.elasticsearch.ca_path, + redis_host=self.redisgraph.host, + redis_password=self.redisgraph.password, + redis_port=self.redisgraph.port, + nboost_host=self.elasticsearch.nboost_host, + preprocessor=self.annotation.preprocessor, + annotator_type=self.annotation.annotator_type, + annotator_args=self.annotation.annotator_args, + normalizer={ + 'url': self.annotation.normalizer, + }, + synonym_service={ + 'url': self.annotation.synonym_service, + }, + ontology_helper={ + 'url': self.annotation.ontology_metadata, + }, + tranql_exclude_identifiers=self.indexing.excluded_identifiers, + tranql_queries=self.indexing.queries, + concept_expander={ + 'url': self.indexing.tranql_endpoint, + 'min_tranql_score': self.indexing.tranql_min_score, + }, + ontology_greenlist=self.annotation.ontology_greenlist, + node_to_element_queries=self.indexing.node_to_element_queries, + ) + + @property + def dict(self): + output = {} + for key, value in self.__dict__.items(): + if hasattr(value, '__dict__'): + output[key] = value.__dict__ + else: + output[key] = value + return output + + @classmethod + def factory(cls, file_path: str): + file_path = Path(file_path).resolve() + with file_path.open() as config_file: + file_data = yaml.load(config_file, Loader=yaml.FullLoader) + + override_data = cls.get_override_data(cls.OS_VAR_PREFIX) + + combined_data = cls.merge_dicts(file_data, override_data) + + return RogerConfig(**combined_data) + + @staticmethod + def merge_dicts(dict_a, dict_b): + flat_a = flatten(dict_a, reducer='dot') + flat_b = flatten(dict_b, reducer='dot') + flat_a.update(flat_b) + 
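        # Values from dict_b win on duplicate (dotted) keys; the merged flat dict is then re-nested.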
return unflatten(flat_a, 'dot') + + @staticmethod + def get_override_data(prefix): + override_data = {} + os_var_keys = os.environ.keys() + keys_of_interest = filter(lambda x: x.startswith(prefix), os_var_keys) + for key in keys_of_interest: + value = os.environ.get(key) + var_name = key.replace(prefix, "", 1) + var_name = var_name.lstrip("_") + var_name = var_name.replace("__", "~") + var_name = var_name.replace("_", ".") + var_name = var_name.replace("~", "_") + var_name = var_name.lower() + override_data[var_name] = value + return unflatten(override_data, 'dot') + + +class Config: + """ + Singleton config wrapper + """ + __instance__: Optional[Dict] = None + os_var_prefix = "ROGERENV_" + + def __init__(self, file_name: str): + if not Config.__instance__: + Config.__instance__ = Config.read_config_file(file_name=file_name) + os_var_keys = os.environ.keys() + keys_of_interest = [x for x in os_var_keys if x.startswith(Config.os_var_prefix)] + for key in keys_of_interest: + new_key = key.replace(Config.os_var_prefix, "") + value = os.environ[key] + new_dict = Config.os_var_to_dict(new_key, value) + try: + Config.update(new_dict) + except ValueError as e: + warnings.warn(f"{e} encountered trying to assign string from " + f"OS variable `{key}` to a dictionary object." + f"Please specify inner keys.") + + @staticmethod + def os_var_to_dict(var_name, value): + var_name = var_name.replace("__", "~") + var_name = var_name.replace("_", ".") + var_name = var_name.replace("~", "_") + var_name = var_name.lower() + m = {var_name: value} + result = unflatten(m, "dot") + return result + + @staticmethod + def read_config_file(file_name: str): + return yaml.load(open(file_name), Loader=yaml.FullLoader) + + def __getattr__(self, item): + """ + Proxies calls to instance dict. + Note: dict.update is overridden to do partial updates. + Refer to Config.update method. + :param item: method called + :return: proxied method + """ + if item == 'update': + # overrides default dict update method + return self.update + return getattr(Config.__instance__, item) + + def __getitem__(self, item): + """ + Makes config object subscriptable + :param item: key to lookup + :return: value stored in key + """ + return self.__instance__.get(item) + + @staticmethod + def update(new_value: Dict): + """ + Updates dictionary partially. 
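+        Only the leaf keys present in new_value are overwritten; every other key keeps its existing value.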
+ Given a config {'name': {'first': 'name', 'last': 'name'}} + and a partial update {'name': {'first': 'new name'} } + result would be {'name': {'first': 'new name', 'last': 'name'}} + :param new_value: parts to update + :return: updated dict + """ + config_flat = flatten(Config.__instance__) + new_value_flat = flatten(new_value) + config_flat.update(new_value_flat) + Config.__instance__ = unflatten(config_flat) + return Config.__instance__ + + def __str__(self): + flat = flatten(Config.__instance__) + for k in flat: + if 'PASSWORD' in k or 'password' in k or 'key' in k.lower(): + flat[k] = '******' + flat = unflatten(flat) + result = json.dumps(flat) + return f"""{result}""" + + +def get_default_config(file_name: str = CONFIG_FILENAME) -> RogerConfig: + """ + Get config as a dictionary + + Parameters + ---------- + file_name: str + The filename with all the configuration + + Returns + ------- + dict + A dictionary containing all the entries from the config YAML + + """ + config_instance = RogerConfig.factory(file_name) + return config_instance + + +config: RogerConfig = get_default_config() diff --git a/dags/roger/config/_base.py b/dags/roger/config/_base.py new file mode 100644 index 00000000..77309666 --- /dev/null +++ b/dags/roger/config/_base.py @@ -0,0 +1,11 @@ +class DictLike: + def __getitem__(self, item): + if not hasattr(self, item): + raise KeyError(item) + return getattr(self, item) + + def __setitem__(self, key, value): + setattr(self, key, value) + + def get(self, key, default=None): + return getattr(self, key, default) \ No newline at end of file diff --git a/dags/roger/config/config.yaml b/dags/roger/config/config.yaml new file mode 100644 index 00000000..c407555f --- /dev/null +++ b/dags/roger/config/config.yaml @@ -0,0 +1,178 @@ +redisgraph: + username: "" + password: "weak" + host: localhost + graph: test + port: 6379 + +logging: + level: DEBUG + format: '[%(name)s][%(filename)s][%(lineno)d][%(funcName)20s] %(levelname)s: %(message)s' + +data_root: roger/data + +kgx_base_data_uri: https://stars.renci.org/var/kgx_data/ +annotation_base_data_uri: https://stars.renci.org/var/dug/ + +kgx: + biolink_model_version: v3.1.2 + merge_db_temp_dir: workspace + data_sets: + - baseline-graph:v5.0 + +dug_inputs: + data_source: s3 + data_sets: + - topmed:v1.0 + - bdc:v1.0 + - anvil:v1.0 + +#https://github.com/RedisGraph/redisgraph-bulk-loader/blob/master/redisgraph_bulk_loader/bulk_insert.py#L43 +bulk_loader: + separator: 0x1E + enforce_schema: False + skip_invalid_nodes: False + skip_invalid_edges: False + quote: 0 + max_token_count: 1024 + max_buffer_size: 2048 + max_token_size: 500 + index: [] + full_text_index: [] + +annotation: + clear_http_cache: false + annotator_type: sapbert + annotator_args: + monarch: + url: "https://api.monarchinitiative.org/api/nlp/annotate/entities?min_length=4&longest_only=false&include_abbreviation=false&include_acronym=false&include_numbers=false&content=" + sapbert: + classification_url: "https://med-nemo.apps.renci.org/annotate/" + annotator_url: "https://sap-qdrant.apps.renci.org/annotate/" + score_threshold: 0.8 + bagel: + enabled: false + url: "http://localhost:9099/group_synonyms_openai" + prompt: "bagel/ask_classes" + llm_args: + llm_model_name: "gpt-4o-2024-05-13" + organization: + access_key: + llm_model_args: + top_p: 0 + temperature: 0.1 + normalizer: "https://nodenormalization-dev.apps.renci.org/get_normalized_nodes?conflate=false&description=true&curie=" + synonym_service: "https://name-resolution-sri.renci.org/reverse_lookup" + 
ontology_metadata: "https://api.monarchinitiative.org/api/bioentity/" + + preprocessor: + debreviator: + BMI: "body mass index" + stopwords: "the" + ontology_greenlist: ["PATO", "CHEBI", "MONDO", "UBERON", "HP", "MESH", "UMLS"] + +indexing: + # colon seperated mappings list by comma + # eg : dbgap:Non-HEAL Studies,bacpac:HEAL Research Programs + element_mapping: "" + variables_index: "variables_index" + concepts_index: "concepts_index" + kg_index: "kg_index" + tranql_min_score: 0.2 + excluded_identifiers: + - "CHEBI:17336" + queries: + "disease": ["disease", "phenotypic_feature"] + "pheno": ["phenotypic_feature", "disease"] + "anat": ["disease", "anatomical_entity"] + "chem_to_disease": ["chemical_entity", "disease"] + "small_molecule_to_disease": ["small_molecule", "disease"] + "chemical_mixture_to_disease": ["chemical_mixture", "disease"] + "phen_to_anat": ["phenotypic_feature", "anatomical_entity"] + tranql_endpoint: "http://tranql-service/tranql/query?dynamic_id_resolution=true&asynchronous=false" + node_to_element_queries: + enabled: false + cde: + node_type: biolink:Publication + curie_prefix: "HEALCDE" + list_field_choose_first: + - "files" + attribute_mapping: + name: "name" + desc: "summary" + collection_name: "cde_category" + collection_id: "cde_category" + action: "files" + +elasticsearch: + host: localhost + username: elastic + password: "12345" + nboost_host: "" + scheme: "http" + ca_path: "" + +validation: + queries: + count_nodes: + name: "Count Nodes" + query: "MATCH (a) RETURN COUNT(a)" + count_edges: + name: "Count Edges" + query: "MATCH (a)-[e]-(b) RETURN COUNT(e)" + connectivity: + name: TOPMED Connectivity + query: "MATCH (a { id : '$var' })--(b) RETURN a.category, b.id" + args: + - var: TOPMED.TAG:8 + - var: TOPMED.VAR:phv00000484.v1.p10 + - var: TOPMED.VAR:phv00000487.v1.p10 + - var: TOPMED.VAR:phv00000496.v1.p10 + - var: TOPMED.VAR:phv00000517.v1.p10 + - var: TOPMED.VAR:phv00000518.v1.p10 + - var: TOPMED.VAR:phv00000528.v1.p10 + - var: TOPMED.VAR:phv00000529.v1.p10 + - var: TOPMED.VAR:phv00000530.v1.p10 + - var: TOPMED.VAR:phv00000531.v1.p10 + count_connected_nodes: + name: Count Connected Nodes + query: "MATCH (a)-[e]-(b) RETURN count(a), count(b)" + query_by_type: + name: Query by Type + query: "MATCH (a:gene)-[e]-(b) WHERE 'chemical_substance' IN b.category RETURN count(distinct(a)), count(distinct(b))" + smiles_values: + name: Query Chemicals with smiles that look like arrays + query: "Match (a: chemical_substance { simple_smiles: '$var' }) RETURN a.id" + args: + - var: "[Os+6]" + - var: "[SiH2]" + - var: "[CH]" + - var: "[S-2]" + - var: "[Ti+4]" + - var: "[P-3]" + - var: "[Ca+2]" + - var: "[Au+3]" + - var: "[TeH2]" + - var: "[Pb]" + - var: "[B+]" + - var: "[AsH]" + - var: "[O-][I+2]([O-])[O-]" + - var: "[He+]" + - var: "[Mo+6]" + - var: "[N-]=[N+]=[N-]" + - var: "[Ag+]" + - var: "[Zn+2]" + - var: "[C-]#[O+]" +s3: + host: "" + bucket: "" + access_key: "" + secret_key: "" + +lakefs_config: + enabled: false + access_key_id: "" + secret_access_key: "" + host: "" + branch: "" + repo: "" diff --git a/dags/roger/config/dev-config.yaml b/dags/roger/config/dev-config.yaml new file mode 100644 index 00000000..bece11a8 --- /dev/null +++ b/dags/roger/config/dev-config.yaml @@ -0,0 +1,118 @@ +redisgraph: + username: "" + password: "" + host: redis + graph: test + port: 6379 + +logging: + level: DEBUG + format: '[%(name)s][%(filename)s][%(funcName)20s] %(levelname)s: %(message)s' + +data_root: "/Users/schreepc/Projects/helxplatform/roger/roger/test/data" +dug_data_root: 
dug_helpers/dug_data/topmed_data +base_data_uri: https://stars.renci.org/var/kgx_data/trapi-1.0/ +kgx: + biolink_model_version: test + +#https://github.com/RedisGraph/redisgraph-bulk-loader/blob/master/redisgraph_bulk_loader/bulk_insert.py#L43 +bulk_loader: + separator: 0x1E + enforce_schema: False + skip_invalid_nodes: False + skip_invalid_edges: False + quote: 0 + max_token_count: 1024 + max_buffer_size: 2048 + max_token_size: 500 + index: [] + full_text_index: [] + +annotation: + annotator: "https://api.monarchinitiative.org/api/nlp/annotate/entities?min_length=4&longest_only=false&include_abbreviation=false&include_acronym=false&include_numbers=false&content=" + normalizer: "https://nodenormalization-sri.renci.org/get_normalized_nodes?curie=" + synonym_service: "https://onto.renci.org/synonyms/" + ontology_metadata: "https://api.monarchinitiative.org/api/ontology/term/" + # The following are neo4j params that would not be used + # need to remove them from annotator constructor. + db_url: "" + username: "" + password: "" + +indexing: + variables_index: "variables_index" + concepts_index: "concepts_index" + kg_index: "kg_index" + tranql_min_score: 0.2 + excluded_identifiers: + - "CHEBI:17336" + queries: + "disease": ["disease", "phenotypic_feature"] + "pheno": ["phenotypic_feature", "disease"] + "anat": ["disease", "anatomical_entity"] + "chem_to_disease": ["chemical_substance", "disease"] + "phen_to_anat": ["phenotypic_feature", "anatomical_entity"] + "anat_to_disease": ["anatomical_entity", "disease"] + "anat_to_pheno": ["anatomical_entity", "phenotypic_feature"] + tranql_endpoint: "http://tranql-service/tranql/query?dynamic_id_resolution=true&asynchronous=false" + +elasticsearch: + host: elasticsearch + username: elastic + # temporary + password: "13431" + nboost_host: "" + + + +validation: + queries: + count_nodes: + name: "Count Nodes" + query: "MATCH (a) RETURN COUNT(a)" + count_edges: + name: "Count Edges" + query: "MATCH (a)-[e]-(b) RETURN COUNT(e)" + connectivity: + name: TOPMED Connectivity + query: "MATCH (a { id : '$var' })--(b) RETURN a.category, b.id" + args: + - var: TOPMED.TAG:8 + - var: TOPMED.VAR:phv00000484.v1.p10 + - var: TOPMED.VAR:phv00000487.v1.p10 + - var: TOPMED.VAR:phv00000496.v1.p10 + - var: TOPMED.VAR:phv00000517.v1.p10 + - var: TOPMED.VAR:phv00000518.v1.p10 + - var: TOPMED.VAR:phv00000528.v1.p10 + - var: TOPMED.VAR:phv00000529.v1.p10 + - var: TOPMED.VAR:phv00000530.v1.p10 + - var: TOPMED.VAR:phv00000531.v1.p10 + count_connected_nodes: + name: Count Connected Nodes + query: "MATCH (a)-[e]-(b) RETURN count(a), count(b)" + query_by_type: + name: Query by Type + query: "MATCH (a:gene)-[e]-(b) WHERE 'chemical_substance' IN b.category RETURN count(distinct(a)), count(distinct(b))" + smiles_values: + name: Query Chemicals with smiles that look like arrays + query: "Match (a: chemical_substance { simple_smiles: '$var' }) RETURN a.id" + args: + - var: "[Os+6]" + - var: "[SiH2]" + - var: "[CH]" + - var: "[S-2]" + - var: "[Ti+4]" + - var: "[P-3]" + - var: "[Ca+2]" + - var: "[Au+3]" + - var: "[TeH2]" + - var: "[Pb]" + - var: "[B+]" + - var: "[AsH]" + - var: "[O-][I+2]([O-])[O-]" + - var: "[He+]" + - var: "[Mo+6]" + - var: "[N-]=[N+]=[N-]" + - var: "[Ag+]" + - var: "[Zn+2]" + - var: "[C-]#[O+]" diff --git a/dags/roger/config/s3_config.py b/dags/roger/config/s3_config.py new file mode 100644 index 00000000..41fcccab --- /dev/null +++ b/dags/roger/config/s3_config.py @@ -0,0 +1,11 @@ +from dataclasses import dataclass + +from ._base import DictLike + + +@dataclass +class 
S3Config(DictLike): + host: str = "" + bucket: str = "" + access_key: str = "" + secret_key: str = "" \ No newline at end of file diff --git a/dags/roger/core/__init__.py b/dags/roger/core/__init__.py new file mode 100644 index 00000000..5a5c5e90 --- /dev/null +++ b/dags/roger/core/__init__.py @@ -0,0 +1,4 @@ +"Core roger modules, now broken out into a submodule" + +from roger.core.enums import SchemaType, FileFormat +from roger.core.bulkload import BulkLoad diff --git a/dags/roger/core/base.py b/dags/roger/core/base.py new file mode 100644 index 00000000..7ba9409a --- /dev/null +++ b/dags/roger/core/base.py @@ -0,0 +1,217 @@ +"Core Roger object and utilities" + +import argparse +import sys +from io import StringIO +import logging + +from roger.config import get_default_config as get_config +from roger.logger import get_logger +from roger.core.bulkload import BulkLoad +from roger.models.kgx import KGXModel +from roger.models.biolink import BiolinkModel + +log = get_logger() + +class Roger: + """ Consolidate Roger functionality for a cleaner interface. """ + + def __init__(self, to_string=False, config=None): + """ Initialize. + :param to_string: Log to str, available as self.log_stream.getvalue() + after execution completes. + """ + self.has_string_handler = to_string + if not config: + config = get_config() + self.config = config + if to_string: + # Add a stream handler to enable to_string. + self.log_stream = StringIO() + self.string_handler = logging.StreamHandler (self.log_stream) + log.addHandler (self.string_handler) + log.debug("config is %s", config.kgx.biolink_model_version) + self.biolink = BiolinkModel (config.kgx.biolink_model_version) + self.kgx = KGXModel (self.biolink, config=config) + self.bulk = BulkLoad (self.biolink, config=config) + + def __enter__(self): + """ Implement Python's Context Manager interface. """ + return self + + def __exit__(self, exception_type, exception_value, traceback): + """ Implement Python's Context Manager interface. We use this finalizer + to detach the stream handler appended in the constructor. + :param exception_type: Type of exception, if one occurred. + :param exception_value: The exception, if one occurred. + :param traceback: The stack trace explaining the exception. + """ + if exception_type or exception_value or traceback: + log.error (msg="Error:", + exc_info=(exception_type, exception_value, traceback)) + if self.has_string_handler: + log.removeHandler (self.string_handler) + +# interfaces abstracting Roger's inner workings to make it easier to +# incorporate into external tools like workflow engines. 
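+# A minimal, hypothetical usage sketch of this pattern (assumes `config` is a
+# RogerConfig obtained from get_config()); log output can be captured because
+# to_string=True attaches a StringIO handler:
+#
+#     with Roger(to_string=True, config=config) as roger:
+#         roger.kgx.get(dataset_version="v1.0")
+#         captured_log = roger.log_stream.getvalue()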
+ +def get_kgx (to_string=False, config=None): + "get KGX dataset" + output = None + log.debug("Getting KGX method called.") + with Roger (to_string, config=config) as roger: + dataset_version=config.get('kgx', {}).get('dataset_version') + log.debug("dataset_version is %s", dataset_version) + roger.kgx.get(dataset_version=dataset_version) + output = roger.log_stream.getvalue() if to_string else None + return output + +def create_schema(to_string=False, config=None): + "Create noders and edges schemata" + o1 = create_nodes_schema(to_string=to_string, config=config) + o2 = create_edges_schema(to_string=to_string, config=config) + output = (o1 + o2 ) if to_string else None + return output + +def create_edges_schema(to_string=False, config=None, input_data_path=None, output_data_path=None): + "Create edges schema on KGX object" + output = None + with Roger(to_string, config=config) as roger: + roger.kgx.create_edges_schema( + input_data_path=input_data_path, + output_data_path=output_data_path + ) + output = roger.log_stream.getvalue() if to_string else None + return output + +def create_nodes_schema(to_string=False, config=None, input_data_path=None, output_data_path=None): + "Create nodes schema on KGX object" + output = None + with Roger(to_string, config=config) as roger: + roger.kgx.create_nodes_schema(input_data_path=input_data_path, + output_data_path=output_data_path) + output = roger.log_stream.getvalue() if to_string else None + return output + +def merge_nodes(to_string=False, config=None, input_data_path=None, output_data_path=None): + "Run KGX merge" + output = None + with Roger (to_string, config=config) as roger: + roger.kgx.merge(input_path=input_data_path, output_path=output_data_path) + output = roger.log_stream.getvalue () if to_string else None + return output + +def create_bulk_load(to_string=False, config=None, input_data_path=None, output_data_path=None): + "Generate bulk load files" + o1 = create_bulk_nodes(to_string=to_string, config=config) + o2 = create_bulk_edges(to_string=to_string, config=config) + output = (o1 + o2) if to_string else None + return output + +def create_bulk_nodes(to_string=False, config=None, input_data_path=None, output_data_path=None): + "Generate bulk node CSV file" + output = None + with Roger(to_string, config=config) as roger: + log.info("input path: %s", input_data_path) + log.info("output path: %s", output_data_path) + roger.bulk.create_nodes_csv_file(input_data_path, output_data_path) + output = roger.log_stream.getvalue() if to_string else None + return output + +def create_bulk_edges(to_string=False, config=None, input_data_path=None, output_data_path=None): + "Create bulk edges CSV file" + output = None + with Roger(to_string, config=config) as roger: + roger.bulk.create_edges_csv_file(input_data_path, output_data_path) + output = roger.log_stream.getvalue() if to_string else None + return output + +def bulk_load(to_string=False, config=None, input_data_path=None, output_data_path=None): + "Run bulk load insert process" + output = None + with Roger (to_string, config=config) as roger: + roger.bulk.insert(input_data_path=input_data_path) + output = roger.log_stream.getvalue () if to_string else None + return output + +def validate (to_string=False, config=None, input_data_path=None, output_data_path=None): + "Run bulk validate process" + output = None + with Roger (to_string, config=config) as roger: + roger.bulk.validate() + output = roger.log_stream.getvalue () if to_string else None + return output + +def 
check_tranql(to_string=False, config=None, input_data_path=None, output_data_path=None): + "Tranql server smoke check" + output = None + with Roger(to_string, config=config) as roger: + roger.bulk.wait_for_tranql() + output = roger.log_stream.getvalue() if to_string else None + return output + +def roger_cli(): + " Roger CLI. " + parser = argparse.ArgumentParser(description='Roger') + parser.add_argument('-v', + '--dataset-version', + help="Dataset version.", + default="v1.0") + parser.add_argument('-d', + '--data-root', + help="Root of data hierarchy", + default=None) + parser.add_argument('-g', + '--get-kgx', + help="Get KGX objects", + action='store_true') + parser.add_argument('-l', + '--load-kgx', + help="Load via KGX", + action='store_true') + parser.add_argument('-s', + '--create-schema', + help="Infer schema", + action='store_true') + parser.add_argument('-m', + '--merge-kgx', + help="Merge KGX nodes", + action='store_true') + parser.add_argument('-b', + '--create-bulk', + help="Create bulk load", + action='store_true') + parser.add_argument('-i', + '--insert', + help="Do the bulk insert", + action='store_true') + parser.add_argument('-a', + '--validate', + help="Validate the insert", + action='store_true') + args = parser.parse_args () + + biolink = BiolinkModel () + kgx = KGXModel (biolink) + bulk = BulkLoad (biolink) + if args.data_root is not None: + config = get_config() + data_root = args.data_root + config.update({'data_root': data_root}) + log.info("data root: %s", data_root) + if args.get_kgx: + kgx.get (dataset_version=args.dataset_version) + if args.load_kgx: + kgx.load () + if args.merge_kgx: + kgx.merge () + if args.create_schema: + kgx.create_schema () + if args.create_bulk: + bulk.create () + if args.insert: + bulk.insert () + if args.validate: + bulk.validate () + + sys.exit (0) diff --git a/dags/roger/core/bulkload.py b/dags/roger/core/bulkload.py new file mode 100644 index 00000000..a8ddbc15 --- /dev/null +++ b/dags/roger/core/bulkload.py @@ -0,0 +1,432 @@ +"Bulk loader for Roger" + +import os +import glob +import shutil +from collections import defaultdict +from functools import reduce +from string import Template +import time + +import requests +import redis +from falkordb_bulk_loader.bulk_insert import bulk_insert + +from roger.config import get_default_config as get_config +from roger.logger import get_logger +from roger.core.redis_graph import RedisGraph +from roger.core.enums import SchemaType +from roger.models.biolink import BiolinkModel +from roger.components.data_conversion import cast +from roger.core import storage + +log = get_logger() + +class BulkLoad: + """ Tools for creating a Redisgraph bulk load dataset. """ + def __init__(self, biolink, config=None): + self.biolink = biolink + if not config: + config = get_config() + self.config = config + separator = self.config.get('bulk_loader',{}).get('separator', '|') + self.separator =(chr(separator) if isinstance(separator, int) + else separator) + + def create (self): + """Used in the CLI on args.create_bulk""" + self.create_nodes_csv_file() + self.create_edges_csv_file() + + def create_nodes_csv_file(self, input_data_path=None, output_data_path=None): + # clear out previous data + bulk_path = storage.bulk_path("nodes", output_data_path) + if os.path.exists(bulk_path): + shutil.rmtree(bulk_path) + categories_schema = storage.read_schema (SchemaType.CATEGORY, input_data_path) + state = defaultdict(lambda: None) + log.info(f"processing nodes") + """ Write node data for bulk load. 
""" + + categories = defaultdict(lambda: []) + category_error_nodes = set() + merged_nodes_file = storage.merged_objects('nodes', input_data_path) + counter = 1 + for node in storage.json_line_iter(merged_nodes_file): + if not node.get('category'): + category_error_nodes.add(node['id']) + node['category'] = [BiolinkModel.root_type] + index = self.biolink.get_leaf_class(node['category']) + categories[index].append(node) + if category_error_nodes: + log.error( + f"some nodes didn't have category assigned. " + f"KGX file has errors. " + f"Nodes {len(category_error_nodes)}. " + f"They will be typed {BiolinkModel.root_type}. " + f"Showing first 10: {list(category_error_nodes)[:10]}.") + # flush every 100K + if counter % 100_000 == 0: + self.write_bulk(storage.bulk_path("nodes", output_data_path), + categories, categories_schema, + state=state, is_relation=False) + # reset variables. + category_error_nodes = set() + categories = defaultdict(lambda: []) + counter += 1 + # write back if any thing left. + if len(categories): + self.write_bulk(storage.bulk_path("nodes", output_data_path), + categories, categories_schema, + state=state, is_relation=False) + + def create_edges_csv_file(self, input_data_path=None, output_data_path=None): + """ Write predicate data for bulk load. """ + # Clear out previous data + bulk_path = storage.bulk_path("edges", output_data_path) + if os.path.exists(bulk_path): + shutil.rmtree(bulk_path) + predicates_schema = storage.read_schema(SchemaType.PREDICATE, input_data_path) + predicates = defaultdict(lambda: []) + edges_file = storage.merged_objects('edges', input_data_path) + counter = 1 + state = {} + for edge in storage.json_line_iter(edges_file): + predicates[edge['predicate']].append(edge) + # write out every 100K , to avoid large predicate dict. 
+ if counter % 100_000 == 0: + self.write_bulk( + storage.bulk_path("edges", output_data_path),predicates, predicates_schema, + state=state, is_relation=True) + predicates = defaultdict(lambda : []) + counter += 1 + # if there are some items left (if loop ended before counter reached the + # specified value) + if len(predicates): + self.write_bulk(storage.bulk_path("edges", output_data_path), predicates, + predicates_schema,state=state, is_relation=True) + + @staticmethod + def create_redis_schema_header(attributes: dict, is_relation=False): + """Creates col headers for csv to be used by redis bulk loader + + Column headers are generated by assigning redis types + :param attributes: dict of data labels with values as python type strs + :param separator: CSV separator + :return: list of attrs, each item is attributeLabel:redisGraphDataType + """ + redis_type_conversion_map = { + 'str': 'STRING', + 'float': 'FLOAT', # Do we need to handle double + 'int': 'INT', + 'bool': 'BOOL', + 'list': 'ARRAY' + } + col_headers = [] + def format_for_redis(label, typ): + return f'{label}:{typ}' + for attribute, attribute_type in attributes.items(): + col_headers.append(format_for_redis( + attribute, redis_type_conversion_map[attribute_type])) + # Note this two fields are only important to bulk loader + # they will not be members of the graph + # https://github.com/RedisGraph/redisgraph-bulk-loader/tree/master#input-schemas + if is_relation: + col_headers.append('internal_start_id:START_ID') + col_headers.append('internal_end_id:END_ID') + # replace id:STRING with id:ID + col_headers.append('id:ID') + col_headers = list(filter(lambda x: x != 'id:STRING', col_headers)) + return col_headers + + @staticmethod + def group_items_by_attributes_set(objects: list, processed_object_ids: set): + """ Groups items into a dictionary + + The keys the output dictionary are sets of attributes set for all + items accessed in that key. + + Eg.: + { set(id,name,category): [{id:'xx0',name:'bbb', 'category':['type']}.... + {id:'xx1', name:'bb2', category: ['type1']}] } + :param objects: list of nodes or edges + :param processed_object_ids: ids to skip since they are processed. + :return: dictionary grouping based on set attributes + """ + clustered_by_set_values = {} + improper_keys = set() + def value_set_test(val): + "Converted from lambda function, is this just 'if x:'?" + if (val is not None and val != [] and val != ''): + return True + return False + for obj in objects: + # redis bulk loader needs columns not to include ':' + # till backticks are implemented we should avoid these. + def key_filter(key): + # Make sure no colons in key names + return ':' not in key + keys_with_values = frozenset( + [k for k in obj.keys() + if value_set_test(obj[k]) and key_filter(k)]) + for key in [k for k in obj.keys() if obj[k] and not key_filter(k)]: + improper_keys.add(key) + # group by attributes that have values. # Why? + # Redis bulk loader has one issue + # imagine we have: + # + #{'name': 'x'} , {'name': 'y', 'is_metabolite': true} + # + # we have a common schema name:STRING,is_metabolite: + # + # BOOL values `x,` and `y,true` + # + # but x not having value for is_metabolite is not handled well, + # redis bulk loader says we should give it default if we were to + # enforce schema but due to the nature of the data assigning + # defaults is very not an option. hence grouping data into several + # csv's might be the right way (?) 
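+ # Each distinct frozenset of populated keys becomes its own group here;
+ # write_bulk later emits one CSV part file per group, so no column ever
+ # needs a default value.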
+ if obj['id'] not in processed_object_ids: + val_list = clustered_by_set_values.get(keys_with_values, []) + val_list.append(obj) + clustered_by_set_values[keys_with_values] = val_list + return clustered_by_set_values, improper_keys + + def write_bulk(self, bulk_path, obj_map, schema, state={}, + is_relation=False): + """ Write a bulk load group of objects. + :param bulk_path: Path to the bulk loader object to write. + :param obj_map: A map of biolink type to list of objects. + :param schema: The schema (nodes or predicates) containing identifiers. + :param state: Track state of already written objs to avoid duplicates. + """ + + os.makedirs (bulk_path, exist_ok=True) + processed_objects_id = state.get('processed_id', set()) + called_x_times = state.get('called_times', 0) + called_x_times += 1 + for key, objects in obj_map.items (): + if len(objects) == 0: + continue + try: + all_keys = schema[key] + except Exception as e: + log.error(f"{key} not in {schema.keys()} " ) + raise Exception("error") from e + """ Make all objects conform to the schema. """ + clustered_by_set_values, improper_redis_keys = ( + self.group_items_by_attributes_set(objects, + processed_objects_id)) + + if improper_redis_keys: + log.warning( + "The following keys were skipped since they include " + "conflicting `:` that would cause errors while bulk " + "loading to redis. [%s]", str(improper_redis_keys)) + for index, set_attributes in enumerate( + clustered_by_set_values.keys()): + items = clustered_by_set_values[set_attributes] + # When parted files are saved let the file names be collected + # here + state['file_paths'] = state.get('file_paths', {}) + state['file_paths'][key] = state['file_paths'].get(key, {}) + out_file = state['file_paths'][key][set_attributes] = ( + state['file_paths'].get(key, {}).get(set_attributes, '')) + + # When calling write bulk , lets say we have processed some + # chemicals from file 1 and we start processing file 2 if we are + # using just index then we might (rather will) end up adding + # records to the wrong file so we need this to be unique as + # possible by adding called_x_times , if we already found + # out-file from state obj we are sure that the schemas match. + + # biolink: is not valid name so we need to remove : + file_key = key.replace(':', '~') + + out_file = ( + f"{bulk_path}/{file_key}.csv-{index}-{called_x_times}" + if not out_file + else out_file) + # store back file name + state['file_paths'][key][set_attributes] = out_file + new_file = not os.path.exists(out_file) + keys_for_header = {x: all_keys[x] for x in all_keys + if x in set_attributes} + redis_schema_header = self.create_redis_schema_header( + keys_for_header, is_relation) + with open(out_file, "a", encoding='utf-8') as stream: + if new_file: + state['file_paths'][key][set_attributes] = out_file + log.info(f" --creating {out_file}") + stream.write(self.separator.join(redis_schema_header)) + stream.write("\n") + else: + log.info(f" --appending to {out_file}") + + # Write fields, skipping duplicate objects. + for obj in items: + oid = str(obj['id']) + if oid in processed_objects_id: + continue + processed_objects_id.add(oid) + + # Add ID / START_ID / END_ID depending + internal_id_fields = { + 'internal_id': obj['id'] + } + if is_relation: + internal_id_fields.update({ + 'internal_start_id': obj['subject'], + 'internal_end_id': obj['object'] + }) + obj.update(internal_id_fields) + values = [] + + # uses redis schema header to preserve order when + # writing lines out. 
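+ # Hypothetical illustration: with a header of name:STRING|id:ID, the row
+ # for {'id': 'X1', 'name': 'aspirin'} is written as "aspirin<sep>X1";
+ # values follow the header's column order and are cast to the schema type
+ # when the stored value's type does not match.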
+ for column_name in redis_schema_header: + # last key is the type + obj_key = ':'.join(column_name.split(':')[:-1]) + value = obj[obj_key] + + if obj_key not in internal_id_fields: + current_type = type(value).__name__ + expected_type = all_keys[obj_key] + # cast it if it doesn't match type in schema + # keys i.e all_keys + value = ( + cast(obj[obj_key], all_keys[obj_key]) + if expected_type != current_type + else value) + # escape quotes . + values.append(str(value).replace("\"", "\\\"")) + s = self.separator.join(values) + stream.write(s) + stream.write("\n") + state['processed_id'] = processed_objects_id + state['called_times'] = called_x_times + + def insert (self, input_data_path=None): + redisgraph = self.config.redisgraph + nodes = sorted(glob.glob (storage.bulk_path ("**/nodes/**.csv*", input_data_path), recursive=True)) + edges = sorted(glob.glob (storage.bulk_path ("**/edges/**.csv*", input_data_path), recursive=True)) + graph = redisgraph['graph'] + log.info(f"bulk loading \n nodes: {nodes} \n edges: {edges}") + + try: + log.info (f"deleting graph {graph} in preparation for bulk load.") + db = self.get_redisgraph() + db.redis_graph.delete () + except redis.exceptions.ResponseError: + log.info("no graph to delete") + + log.info ("bulk loading graph: %s", str(graph)) + args = [] + if len(nodes) > 0: + bulk_path_root = glob.glob(storage.bulk_path('**/nodes', path=input_data_path), recursive=True)[0] + os.path.sep + nodes_with_type = [] + collect_labels = set() + for x in nodes: + """ + These lines prep nodes bulk load by: + 1) appending to labels 'biolink.' + 2) combine labels to create a multilabel redis node i.e. "biolink.OrganismalEntity:biolink.SubjectOfInvestigation" + """ + file_name_type_part = x.replace(bulk_path_root, '').split('.')[0].split('~')[1] + all_labels = "biolink." + file_name_type_part + ":" + ":".join([f'biolink.{v.lstrip("biolink:")}' for v in self.biolink.toolkit.get_ancestors("biolink:" + file_name_type_part, reflexive=False, formatted=True )] ) + collect_labels.add("biolink." 
+ file_name_type_part) + for v in self.biolink.toolkit.get_ancestors("biolink:" + file_name_type_part, reflexive=False, + formatted=True): + collect_labels.add(f'biolink.{v.lstrip("biolink:")}') + nodes_with_type.append(f"{all_labels} {x}") + args.extend(("-N " + " -N ".join(nodes_with_type)).split()) + if len(edges) > 0: + bulk_path_root = glob.glob(storage.bulk_path('**/edges', path=input_data_path), recursive=True)[0] + os.path.sep + edges_with_type = [f"biolink.{x.replace(bulk_path_root, '').strip(os.path.sep).split('.')[0].split('~')[1]} {x}" + for x in edges] + # Edge label now no longer has 'biolink:' + args.extend(("-R " + " -R ".join(edges_with_type)).split()) + args.extend([f"--separator={self.separator}"]) + args.extend([f"--server-url=redis://:{redisgraph['password']}@{redisgraph['host']}:{redisgraph['port']}"]) + args.extend(['--enforce-schema']) + args.extend(['-e']) + for lbl in collect_labels: + args.extend([f'-i `{lbl}`:id', f'-f {lbl}:name', f'-f {lbl}:synonyms']) + args.extend([f"{redisgraph['graph']}"]) + """ standalone_mode=False tells click not to sys.exit() """ + log.debug(f"Calling bulk_insert with extended args: {args}") + try: + bulk_insert(args, standalone_mode=False) + # self.add_indexes() + except Exception as e: + log.error(f"Unexpected {e.__class__.__name__}: {e}") + raise + + def add_indexes(self): + redis_connection = self.get_redisgraph() + all_labels = redis_connection.query( + "Match (c) return distinct labels(c)").result_set + all_labels = reduce(lambda x, y: x + y, all_labels, []) + id_index_queries = [ + f'CREATE INDEX on :`{label}`(id)' for label in all_labels + ] + name_index_queries = ( + "CALL db.labels() YIELD label " + "CALL db.idx.fulltext.createNodeIndex(label, 'name', 'synonyms')") + + for query in id_index_queries: + redis_connection.query(query=query) + redis_connection.query(query=name_index_queries) + log.info(f"Indexes created for {len(all_labels)} labels.") + + def get_redisgraph(self): + return RedisGraph( + host=self.config.redisgraph.host, + port=self.config.redisgraph.port, + password=self.config.redisgraph.password, + graph=self.config.redisgraph.graph, + ) + + def validate(self): + + db = self.get_redisgraph() + validation_queries = self.config.get( + 'validation', {}).get('queries', []) + for key, query in validation_queries.items (): + text = query['query'] + name = query['name'] + args = query.get('args', [{}]) + for arg in args: + start = storage.current_time_in_millis () + instance = Template (text).safe_substitute (arg) + db.query (instance) + duration = storage.current_time_in_millis () - start + log.info (f"Query {key}:{name} ran in {duration}ms: {instance}") + + def wait_for_tranql(self): + retry_secs = 3 + tranql_endpoint = self.config.indexing.tranql_endpoint + log.info(f"Contacting {tranql_endpoint}") + graph_name = self.config["redisgraph"]["graph"] + test_query = "SELECT disease-> phenotypic_feature " \ + f"FROM 'redis:{graph_name}'" \ + f"WHERE disease='MONDO:0004979'" + is_done_loading = False + try: + while not is_done_loading: + response = requests.post(tranql_endpoint, data=test_query) + response_code = response.status_code + response = response.json() + is_done_loading = "message" in response and response_code == 200 + if is_done_loading: + break + else: + log.info(f"Tranql responsed with response: {response}") + log.info(f"Retrying in {retry_secs} secs...") + time.sleep(retry_secs) + except ConnectionError as e: + # convert exception to be more readable. 
+ raise ConnectionError( + f"Attempting to contact {tranql_endpoint} " + f"failed due to connection error. " + f"Please check status of Tranql server.") from e diff --git a/dags/roger/core/enums.py b/dags/roger/core/enums.py new file mode 100644 index 00000000..b44323af --- /dev/null +++ b/dags/roger/core/enums.py @@ -0,0 +1,15 @@ +"Enums for Roger" + +from enum import Enum + +class SchemaType(Enum): + """ High level semantic metatdata concepts. + Categories are classes in an ontological model like Biolink. + Predicates are links between nodes. """ + CATEGORY = "category" + PREDICATE = "predicate" + +class FileFormat(Enum): + """ File formats this module knows about. """ + JSON = "json" + YAML = "yaml" diff --git a/roger/roger_db.py b/dags/roger/core/redis_graph.py similarity index 87% rename from roger/roger_db.py rename to dags/roger/core/redis_graph.py index 75b94e6b..ca65ddce 100644 --- a/roger/roger_db.py +++ b/dags/roger/core/redis_graph.py @@ -1,18 +1,22 @@ -import logging +import copy + import redis -from redisgraph import Node, Edge, Graph -from redis.exceptions import ResponseError -from roger.roger_util import get_config, get_logger +# from redisgraph import Node, Edge, Graph +# https://redis-py.readthedocs.io/en/v4.5.1/redismodules.html#redisgraph-commands +from redis.commands.graph.node import Node +from redis.commands.graph.edge import Edge + +from roger.logger import get_logger logger = get_logger () class RedisGraph: """ Graph abstraction over RedisGraph. A thin wrapper but provides us some options. """ - def __init__(self, host='localhost', port=6379, graph='default'): + def __init__(self, host='localhost', port=6379, graph='default', password=''): """ Construct a connection to Redis Graph. """ - self.r = redis.Redis(host=host, port=port) - self.redis_graph = Graph(graph, self.r) + self.r = redis.Redis(host=host, port=port, password=password) + self.redis_graph = self.r.graph(graph) def add_node (self, identifier=None, label=None, properties=None): """ Add a node with the given label and properties. """ @@ -57,7 +61,7 @@ def commit (self): def query (self, query): """ Query and return result set. """ result = self.redis_graph.query(query) - result.pretty_print() + print(result) return result def delete (self): @@ -84,4 +88,4 @@ def test (): rg.delete () # rg.query ("""MATCH (a { id : 'chemical_substance' }) RETURN a""") -#test () +#test () \ No newline at end of file diff --git a/dags/roger/core/storage.py b/dags/roger/core/storage.py new file mode 100644 index 00000000..b4869758 --- /dev/null +++ b/dags/roger/core/storage.py @@ -0,0 +1,502 @@ +""" utils for roger + +This is home to the utilities that were formerly in dags/roger/core.py:Util +""" + +import os +import glob +import time +import pathlib +import pickle +import shutil +import yaml +import orjson as json +import requests +from urllib.request import urlretrieve +from pathlib import Path + +from roger.logger import get_logger +from roger.config import get_default_config as get_config +from roger.core import SchemaType + +log = get_logger() +config = get_config() + +data_dir_env_value = os.getenv("ROGER_DATA_DIR") + +if data_dir_env_value is None: + ROGER_DATA_DIR = Path(__file__).parent.resolve() / 'data' +else: + ROGER_DATA_DIR = Path(data_dir_env_value) + + +def current_time_in_millis(): + """ + Get current time in milliseconds. + + Returns + ------- + int + Time in milliseconds + + """ + return int(round(time.time() * 1000)) + +# A just do it approach to getting data. +def read_file(path): + """ Read a file. 
+ :param path: Path to a file. + """ + text = None + with open(path, "r", encoding='utf-8') as stream: + text = stream.read() + return text + +def read_url(url): + """ Read data from a URL. + :param url: The URL to read. """ + return requests.get(url, timeout=60).text + +def read_data(path): + """ Read data from a URL or File. HTTP(S) is the only supported protocol. + :param path: A URL or file path. """ + text = None + if is_web(path): + text = read_url(path) + else: + text = read_file(path) + return text + +def read_object(path, key=None): + """ Read on object from a path. + :param path: A URL or file path. + Supports YAML and JSON depending on extension. + :param key: A configuration key. This is prepended to the path if present. + :raises ValueError: If the key is not in the configuration. """ + if key is not None: + prefix = config[key] + path = f"{prefix}/{path}" if is_web(prefix) \ + else os.path.join (prefix, path) + obj = None + if path.endswith(".yaml") or path.endswith (".yml"): + obj = yaml.safe_load (read_data (path)) + elif path.endswith(".json"): + obj = json.loads (read_data (path)) + elif path.endswith(".pickle"): + with open(file=path, mode="rb") as stream: + obj = pickle.load(stream) + elif path.endswith(".jsonl") or path.endswith('.txt'): + obj = read_data(path) + return obj + +def is_web (uri): + """ The URI is a web URI (starts with http or https). + :param uri: A URI """ + return uri.startswith("http://") or uri.startswith ("https://") + +def write_object (obj, path, key=None): + """ Write an object to a path. YAML and JSON supported based on extension. + :param obj: The object to write. + :param path: The path to write to. + :param key: The configuration key to prepend to the path. + """ + # Prepend a prefix from the configuration file if a key is given. + if key is not None: + prefix = config[key] + path = (f"{prefix}/{path}" if is_web(prefix) + else os.path.join (prefix, path)) + + # Ensure the directory to be written to exists. + dirname = os.path.dirname (path) + if not os.path.exists (dirname): + os.makedirs (dirname, exist_ok=True) + + # Write the file in the specified format. + if path.endswith (".yaml") or path.endswith (".yml"): + with open(path, 'w') as outfile: + yaml.dump (obj, outfile) + elif path.endswith (".json"): + with open (path, "w", encoding='utf-8') as stream: + stream.write(str(json.dumps (obj, option=json.OPT_INDENT_2).decode('utf-8'))) + elif path.endswith(".pickle"): + with open (path, "wb") as stream: + pickle.dump(obj, file=stream) + elif path.endswith(".jsonl") or path.endswith('.txt'): + with open (path, "w", encoding="utf-8") as stream: + stream.write(obj) + else: + # Raise an exception if invalid. + raise ValueError (f"Unrecognized extension: {path}") + +def mkdir(path, is_dir=False): + directory = os.path.dirname(path) if not is_dir else path + if not os.path.exists(directory): + os.makedirs(directory) + +def remove(path): + if os.path.exists(path): + if os.path.isdir(path): + shutil.rmtree(path) + else: + os.remove(path) + +def clear_dir(path): + remove(path) + mkdir(path, is_dir=True) + +###################### +# Path methods + +def kgx_path(name): + """ Form a KGX object path. + :path name: Name of the KGX object. """ + return str(ROGER_DATA_DIR / "kgx" / name) + +def kgx_objects(format_="json", path=None): + """ A list of KGX objects. 
""" + kgx_pattern = kgx_path(f"**.{format_}") + if path: + kgx_pattern = f"{path}/**/*.{format_}" + return sorted(glob.glob (kgx_pattern, recursive=True)) + +def merge_path(name, path: Path=None): + """ Form a merged KGX object path. + :path name: Name of the merged KGX object. """ + if path is None: + # create output dir + if not os.path.exists(ROGER_DATA_DIR / 'merge'): + os.makedirs(ROGER_DATA_DIR / 'merge') + return str(ROGER_DATA_DIR / 'merge' / name) + if not os.path.exists(path): + os.makedirs(path) + + return str(path.joinpath(name)) + +def merged_objects(file_type, path=None): + """ A list of merged KGX objects. """ + if not path: + merged_pattern = merge_path(f"**/{file_type}.jsonl") + else: + merged_pattern = merge_path(f"**/{file_type}.jsonl", path=path) + # this thing should always return one edges or nodes file (based on file_type) + try: + return sorted(glob.glob(merged_pattern, recursive=True))[0] + except IndexError: + raise ValueError(f"Could not find merged KGX of type {file_type} in {merged_pattern}") + + +def schema_path(name, path=None): + """ Path to a schema object. + :param name: Name of the object to get a path for. """ + if not path: + return str(ROGER_DATA_DIR / 'schema' / name) + return str (path / 'schema' / name) + +def bulk_path(name, path=None): + """ Path to a bulk load object. + :param name: Name of the object. """ + if not path: + return str(ROGER_DATA_DIR / 'bulk' / name) + else: + return str(path / name) + +def metrics_path(name): + """ + Path to write metrics to + :param name: + :return: + """ + return str(ROGER_DATA_DIR / "metrics" / name) + +def dug_kgx_path(name): + return str(ROGER_DATA_DIR / "dug" / "kgx" / name) + +def dug_annotation_path(name): + return str(ROGER_DATA_DIR / "dug" / "annotations" / name) + +def dug_expanded_concepts_path(name): + return str(ROGER_DATA_DIR / 'dug' / 'expanded_concepts' / name) + +def dug_expanded_concept_objects(data_path=None, format="pickle"): + "Return a list of files containing expaneded concept objects" + if data_path: + file_pattern = os.path.join(data_path, '**', f'expanded_concepts.{format}') + else: + file_pattern = dug_expanded_concepts_path( + os.path.join('*',f'expanded_concepts.{format}')) + return sorted(glob.glob(file_pattern, recursive=True)) + +def dug_extracted_elements_objects(data_path=None, format="txt"): + if data_path: + file_pattern = os.path.join(data_path, '**', f'extracted_graph_elements.{format}') + else: + file_pattern = dug_expanded_concepts_path( + os.path.join('*', f'extracted_graph_elements.{format}')) + return sorted(glob.glob(file_pattern, recursive=True)) + +def dug_crawl_path(name): + return str(ROGER_DATA_DIR / 'dug' / 'crawl' / name) + +def dug_kgx_objects(): + """ A list of dug KGX objects. """ + dug_kgx_pattern = dug_kgx_path("**.json") + return sorted(glob.glob(dug_kgx_pattern)) + +def dug_concepts_objects(data_path, format="pickle"): + """ A list of dug annotation Objects. """ + if not data_path: + concepts_file_path = dug_annotation_path( + os.path.join('*',f'concepts.{format}')) + else: + concepts_file_path = os.path.join( + data_path, '**', f'concepts.{format}') + return sorted(glob.glob(concepts_file_path, recursive=True)) + +def dug_elements_objects(data_path=None, format='pickle'): + """ A list of dug annotation Objects. 
""" + if not data_path: + concepts_file_pattern = dug_annotation_path( + os.path.join('*', f'elements.{format}')) + else: + concepts_file_pattern = os.path.join( + data_path, '**', f'elements.{format}') + return sorted(glob.glob(concepts_file_pattern, recursive=True)) + +def dug_input_files_path(name) -> pathlib.Path: + path = ROGER_DATA_DIR / "dug" / "input_files" / name + if not path.exists(): + log.info(f"Input file path: {path} does not exist, creating") + path.mkdir(parents=True, exist_ok=True) + else: + log.info(f"Input file path: {path} already exists") + return path + +def dug_topmed_objects(input_data_path=None): + "Return list of TOPMed source files" + if not input_data_path: + input_data_path = str(dug_input_files_path('topmed')) + topmed_file_pattern = os.path.join(input_data_path, "topmed_*.csv") + return sorted(glob.glob(topmed_file_pattern)) + +def dug_anvil_path(): + """Anvil source files""" + return dug_input_files_path('anvil') + +def dug_sprint_path(): + """Anvil source files""" + return dug_input_files_path('sprint') + +def dug_bacpac_path(): + """Anvil source files""" + return dug_input_files_path('bacpac') + +def dug_heal_mds_path(): + """HEAL MDS source files""" + return dug_input_files_path('heal-mds-imports') + +def dug_heal_research_program_path(): + """HEAL research programs source files""" + return dug_input_files_path('heal-research-programs') + +def dug_heal_study_path(): + """HEAL study source files""" + return dug_input_files_path('heal-study-imports') + +def dug_crdc_path(): + """Anvil source files""" + return dug_input_files_path('crdc') + +def dug_kfdrc_path(): + """Anvil source files""" + return dug_input_files_path('kfdrc') + +def dug_nida_objects(input_data_path=None): + "Return list of NIDA source files" + if not input_data_path: + input_data_path = str(dug_input_files_path('nida')) + nida_file_pattern = os.path.join(input_data_path, "NIDA-*.xml") + return sorted(glob.glob(nida_file_pattern)) + +def dug_sparc_objects(input_data_path=None): + if not input_data_path: + input_data_path = str(dug_input_files_path('sparc')) + file_pattern = os.path.join(input_data_path, "scicrunch/*.xml") + return sorted(glob.glob(file_pattern)) + +def dug_anvil_objects(input_data_path=None): + if not input_data_path: + input_data_path = dug_anvil_path() + files = get_files_recursive( + lambda file_name: ( + not file_name.startswith('GapExchange_') + and file_name.endswith('.xml')), + input_data_path) + return sorted([str(f) for f in files]) + +def dug_sprint_objects(input_data_path=None): + if not input_data_path: + input_data_path = dug_sprint_path() + files = get_files_recursive( + lambda file_name: file_name.endswith('.xml'), input_data_path) + return sorted([str(f) for f in files]) + +def dug_bacpac_objects(input_data_path=None): + "Return list of BACPAC source files" + if not input_data_path: + input_data_path = dug_bacpac_path() + files = get_files_recursive( + lambda file_name: file_name.endswith('.xml'), input_data_path) + return sorted([str(f) for f in files]) + +def dug_crdc_objects(input_data_path=None): + if not input_data_path: + input_data_path = dug_crdc_path() + files = get_files_recursive( + lambda file_name: ( + not file_name.startswith('GapExchange_') + and file_name.endswith('.xml')), + input_data_path) + return sorted([str(f) for f in files]) + +def dug_heal_study_objects(input_data_path=None): + "Return list of HEAL study source files" + if not input_data_path: + input_data_path = dug_heal_study_path() + files = get_files_recursive(lambda file_name : 
file_name.endswith('.xml'), + input_data_path) + return sorted([str(f) for f in files]) + +def dug_heal_research_program_objects(input_data_path=None): + "Return list of HEAL research program source files" + if not input_data_path: + input_data_path = dug_heal_research_program_path() + files = get_files_recursive(lambda file_name : file_name.endswith('.xml'), + input_data_path) + return sorted([str(f) for f in files]) + +def dug_kfdrc_objects(input_data_path=None): + if not input_data_path: + input_data_path = dug_kfdrc_path() + files = get_files_recursive( + lambda file_name: ( + not file_name.startswith('GapExchange_') + and file_name.endswith('.xml')), + input_data_path) + return sorted([str(f) for f in files]) + + +def dug_dd_xml_path(): + """ Topmed source files""" + return dug_input_files_path('db_gap') + +def get_files_recursive(file_name_filter, current_dir): + file_paths = [] + for child in current_dir.iterdir(): + if child.is_dir(): + file_paths += get_files_recursive(file_name_filter, child) + continue + if not file_name_filter(child.name): + continue + else: + file_paths += [child] + return file_paths + +def dug_dd_xml_objects(input_data_path=None): + if not input_data_path: + input_data_path = dug_dd_xml_path() + files = get_files_recursive( + lambda file_name: ( + not file_name.startswith('._') + and file_name.endswith('.xml')), + input_data_path) + return sorted([str(f) for f in files]) + +def copy_file_to_dir(file_location, dir_name): + return shutil.copy(file_location, dir_name) + +def read_schema (schema_type: SchemaType, path=None): + """ Read a schema object. + :param schema_type: Schema type of the object to read. """ + if path is not None: + path = path / '**' + location = glob.glob(schema_path (f"{schema_type.value}-schema.json", path=path), recursive=True)[0] + return read_object (location) + +def get_uri (path, key): + """ Build a URI. + :param path: The path of an object. + :param key: The key of a configuration value to prepend to the object. """ + # Incase config has http://..../ or http://... remove / and add back to + # avoid double http://...// + root_url = config[key].rstrip('/') + return f"{root_url}/{path}" + +def get_relative_path (path): + return os.path.join (os.path.dirname (__file__), path) + +def read_relative_object (path): + return read_object (get_relative_path(path)) + +def trunc(text, limit): + return ('..' 
+ text[-limit-2:]) if len(text) > limit else text + + + +def json_line_iter(jsonl_file_path): + f = open(file=jsonl_file_path, mode='r', encoding='utf-8') + for line in f: + yield json.loads(line) + f.close() + +def jsonl_iter(file_name): + # iterating over jsonl files + with open(file_name) as stream: + for line in stream: + # yield on line at time + yield json.loads(line) + +def json_iter(json_file,entity_key): + with open(json_file) as stream: + data = json.loads(stream.read()) + return data[entity_key] + +def downloadfile(thread_num, inputq, doneq): + url = "" + t0 = 0 + pct = 0 + + def downloadprogress(blocknumber, readsize, totalfilesize): + nonlocal thread_num + nonlocal url, t0, pct + blocks_expected = ( + int(totalfilesize/readsize) + + (1 if totalfilesize%readsize != 0 else 0)) + t1 = int(current_time_in_millis()/1000) + elapsed_delta = t1 - t0 + pct = int(100 * blocknumber / blocks_expected) + if elapsed_delta >= 30: # every n seconds + log.info(f"thread-{thread_num} {pct}% of size:{totalfilesize} " + f"({blocknumber}/{blocks_expected}) url:{url}") + t0 = t1 + + num_files_processed = 0 + while inputq.empty() is False: + t0 = int(current_time_in_millis()/1000) + url, dst = inputq.get() + num_files_processed += 1 + log.info(f"thread-{thread_num} downloading {url}") + try: + path, httpMessage = urlretrieve( + url, dst, reporthook=downloadprogress) + if pct < 100: + httpMessageKeys = httpMessage.keys() + log.info(f"thread-{thread_num} urlretrieve path:'{path}' " + f"http-keys:{httpMessageKeys} " + f"httpMessage:'{httpMessage.as_string()}") + except Exception as e: + log.error(f"thread-{thread_num} downloadfile excepton: {e}") + continue + log.info(f"thread-{thread_num} downloaded {dst}") + doneq.put((thread_num,num_files_processed)) + log.info(f"thread-{thread_num} done!") + return diff --git a/dags/roger/logger.py b/dags/roger/logger.py new file mode 100644 index 00000000..04e47897 --- /dev/null +++ b/dags/roger/logger.py @@ -0,0 +1,34 @@ +import logging +import sys +from typing import Optional +from roger.config import get_default_config + +logger: Optional[logging.Logger] = None + + +def get_logger(name: str = 'roger') -> logging.Logger: + """ + Get an instance of logger. 
+ + Parameters + ---------- + name: str + The name of logger + + Returns + ------- + logging.Logger + An instance of logging.Logger + + """ + global logger + if logger is None: + config = get_default_config() + logger = logging.getLogger(name) + handler = logging.StreamHandler(sys.stdout) + formatter = logging.Formatter(config['logging']['format']) + handler.setFormatter(formatter) + logger.addHandler(handler) + logger.setLevel(config['logging']['level']) + logger.propagate = True + return logger diff --git a/dags/roger/models/__init__.py b/dags/roger/models/__init__.py new file mode 100644 index 00000000..3a994ff8 --- /dev/null +++ b/dags/roger/models/__init__.py @@ -0,0 +1,4 @@ +"Data models for Roger" + +from roger.models.kgx import KGXModel +from roger.models.biolink import BiolinkModel diff --git a/dags/roger/models/biolink.py b/dags/roger/models/biolink.py new file mode 100644 index 00000000..43102188 --- /dev/null +++ b/dags/roger/models/biolink.py @@ -0,0 +1,48 @@ +"Biolink data model for Roger" + +from bmt import Toolkit +from roger.logger import get_logger + +log = get_logger() + +class BiolinkModel: + "Biolink data model for Roger" + root_type = 'biolink:NamedThing' + + def __init__(self, bl_version='v3.1.2'): + self.bl_url = (f'https://raw.githubusercontent.com/biolink' + f'/biolink-model/{bl_version}/biolink-model.yaml') + log.info("bl_url is %s", self.bl_url) + self.toolkit = Toolkit() + + def find_biolink_leaves(self, biolink_concepts): + """Given list of concepts, returns leaves minus any parent concepts + :param biolink_concepts: list of biolink concepts + :return: leave concepts. + """ + ancestry_set = set() + all_concepts = set(biolink_concepts) + unknown_elements = set() + + for x in all_concepts: + current_element = self.toolkit.get_element(x) + if not current_element: + unknown_elements.add(x) + ancestors = set(self.toolkit.get_ancestors( + x, mixin=True, reflexive=False, formatted=True)) + ancestry_set = ancestry_set.union(ancestors) + leaf_set = all_concepts - ancestry_set - unknown_elements + return leaf_set + + def get_leaf_class (self, names): + """ Return the leaf classes in the provided list of names. """ + leaves = list(self.find_biolink_leaves(names)) + return leaves[0] + + def get_label(self, class_name): + "Return the label for the given class name" + element = self.toolkit.get_element(class_name) + if element: + name = element.name + return name + return class_name.replace("biolink:", "").replace("_", " ") diff --git a/dags/roger/models/kgx.py b/dags/roger/models/kgx.py new file mode 100644 index 00000000..e35ab07e --- /dev/null +++ b/dags/roger/models/kgx.py @@ -0,0 +1,503 @@ +"KGX data model for Roger" + +import os +import time +import queue +from itertools import chain +import threading +from collections import defaultdict +from xxhash import xxh64_hexdigest +import orjson as json +import ntpath +from kg_utils.merging import DiskGraphMerger +from kg_utils.constants import * + +from roger.config import get_default_config +from roger.logger import get_logger +from roger.components.data_conversion import compare_types +from roger.core import storage +from roger.models.biolink import BiolinkModel +from roger.core.enums import SchemaType + +log = get_logger() + +class KGXModel: + """ Abstractions for transforming KGX formatted data. 
+ + KGX stands for Knowledge Graph Exchange + """ + def __init__(self, biolink=None, config=None): + if not config: + config = get_default_config() + self.config = config + + # We need a temp director for the DiskGraphMerger + self.temp_directory = storage.merge_path( + self.config.kgx.merge_db_temp_dir) + log.debug(f"Setting temp_directory to : {self.temp_directory}") + isExist = os.path.exists(self.temp_directory) + if not isExist: + os.makedirs(self.temp_directory) + + self.merger = DiskGraphMerger(temp_directory=self.temp_directory, + chunk_size=5_000_000) + self.biolink_version = self.config.kgx.biolink_model_version + log.debug(f"Trying to get biolink version : {self.biolink_version}") + if biolink is None: + self.biolink = BiolinkModel(self.biolink_version) + else: + self.biolink = biolink + self.enable_metrics = self.config.get('enable_metrics', False) + + def get_kgx_json_format(self, files: list, dataset_version: str): + """Gets Json formatted kgx files. + + These files have a the following structure: + {"nodes": [{"id":"..."},...], "edges": [{"id":...},...}] } + + Parameters + ---------- + files : list of file names + dataset_version : dataset version from dataset meta-data information + + Returns None + ------- + + """ + file_tuple_q = queue.Queue() + thread_done_q = queue.Queue() + for nfile, file_name in enumerate(files): + # file_url or skip + file_name = dataset_version + "/" + file_name + file_url = storage.get_uri(file_name, "kgx_base_data_uri") + subgraph_basename = os.path.basename(file_name) + subgraph_path = storage.kgx_path(subgraph_basename) + if os.path.exists(subgraph_path): + log.info(f"cached kgx: {subgraph_path}") + continue + log.debug("#{}/{} to get: {}".format( + nfile+1, len(files), file_url)) + # folder + dirname = os.path.dirname (subgraph_path) + if not os.path.exists (dirname): + os.makedirs (dirname, exist_ok=True) + # add to queue + file_tuple_q.put((file_url,subgraph_path)) + + # start threads for each file download + threads = [] + for thread_num in range(len(files)): # len(files) + th = threading.Thread( + target=storage.downloadfile, + args=(thread_num, file_tuple_q, thread_done_q)) + th.start() + threads.append(th) + + # wait for each thread to complete + for nwait in range(len(threads)): + thread_num, num_files_processed = thread_done_q.get() + th = threads[thread_num] + th.join() + log.info(f"#{nwait+1}/{len(threads)} joined: " + f"thread-{thread_num} processed: " + f"{num_files_processed} file(s)") + + all_kgx_files = [] + for nfile, file_name in enumerate(files): + start = storage.current_time_in_millis() + file_name = dataset_version + "/" + file_name + file_url = storage.get_uri(file_name, "kgx_base_data_uri") + subgraph_basename = os.path.basename(file_name) + subgraph_path = storage.kgx_path(subgraph_basename) + all_kgx_files.append(subgraph_path) + if os.path.exists(subgraph_path): + log.info(f"cached kgx: {subgraph_path}") + continue + log.info ("#{}/{} read: {}".format(nfile+1, len(files), file_url)) + subgraph = storage.read_object(file_url) + storage.write_object(subgraph, subgraph_path) + total_time = storage.current_time_in_millis() - start + edges = len(subgraph['edges']) + nodes = len(subgraph['nodes']) + log.info( + "#{}/{} edges:{:>7} nodes: {:>7} time:{:>8} wrote: {}".format( + nfile+1, len(files), edges, nodes, + total_time/1000, subgraph_path)) + return all_kgx_files + + def get_kgx_jsonl_format(self, files, dataset_version): + """gets pairs of jsonl formatted kgx files. + + Files is expected to have all the pairs. 
+ + I.e if kgx_1_nodes.jsonl exists its expected that kgx_1_edges.jsonl + exists in the same path. + File names should have strings *nodes*.jsonl and *edges*.jsonl. + Parameters + ---------- + files + dataset_version + + Returns + ------- + + """ + # make a paired list + paired_up = [] + log.info(f"getting {files}") + for file_name in files: + if "nodes" in file_name: + paired_up.append( + [file_name, file_name.replace('nodes', 'edges')]) + error = False + # validate that all pairs exist + if len(files) / 2 != len(paired_up): + log.error("Error paired up kgx jsonl files don't match " + "list of files specified in metadata.yaml") + error = True + for pairs in paired_up: + if pairs[0] not in files: + log.error( + f"{pairs[0]} not in original list " + f"of files from metadata.yaml") + error = True + if pairs[1] not in files: + error = True + log.error( + f"{pairs[1]} not in original list " + f"of files from metadata.yaml") + if error: + raise Exception("Metadata.yaml has inconsistent jsonl files") + + file_tuple_q = queue.Queue() + thread_done_q = queue.Queue() + for npairs, pairs in enumerate(paired_up): + for npair, p in enumerate(pairs): + file_name = dataset_version + "/" + p + file_url = storage.get_uri(file_name, "kgx_base_data_uri") + subgraph_basename = os.path.basename(file_name) + subgraph_path = storage.kgx_path(subgraph_basename) + if os.path.exists(subgraph_path): + log.info(f"skip cached kgx: {subgraph_path}") + continue + log.info ("#{}.{}/{} read: {}".format( + npairs+1, npair+1, len(paired_up), file_url)) + # folder + dirname = os.path.dirname (subgraph_path) + if not os.path.exists (dirname): + os.makedirs (dirname, exist_ok=True) + # add to queue + file_tuple_q.put((file_url,subgraph_path)) + + # start threads for each file download + threads = [] + for thread_num in range(file_tuple_q.qsize()): + th = threading.Thread( + target=storage.downloadfile, + args=(thread_num, file_tuple_q, thread_done_q)) + th.start() + threads.append(th) + + # wait for each thread to complete + for nwait in range(len(threads)): + thread_num, num_files_processed = thread_done_q.get() + th = threads[thread_num] + th.join() + log.info(f"#{nwait+1}/{len(threads)} joined: " + f"thread-{thread_num} processed: " + f"{num_files_processed} file(s)") + + all_kgx_files = [] + for pairs in paired_up: + nodes = 0 + edges = 0 + start = storage.current_time_in_millis() + for p in pairs: + file_name = dataset_version + "/" + p + file_url = storage.get_uri(file_name, "kgx_base_data_uri") + subgraph_basename = os.path.basename(file_name) + subgraph_path = storage.kgx_path(subgraph_basename) + all_kgx_files.append(subgraph_path) + if os.path.exists(subgraph_path): + log.info(f"cached kgx: {subgraph_path}") + continue + data = storage.read_object(file_url) + storage.write_object(data, subgraph_path) + if "edges" in p: + edges = len(data.split('\n')) + else: + nodes = len(data.split('\n')) + total_time = storage.current_time_in_millis() - start + log.info( + "wrote {:>45}: edges:{:>7} nodes: {:>7} time:{:>8}".format( + storage.trunc(subgraph_path, 45), edges, nodes, total_time)) + return all_kgx_files + + def get (self, dataset_version = "v1.0"): + """ Read metadata for KGX files and downloads them locally. + :param dataset_version: Data version to operate on. 
+ """ + metadata = storage.read_relative_object ("../../metadata.yaml") + data_set_list = self.config.kgx.data_sets + kgx_files_remote = [] + for item in metadata['kgx']['versions']: + if (item['version'] == dataset_version and + item['name'] in data_set_list): + log.info(f"Getting KGX dataset {item['name']}, " + f"version {item['version']}") + if item['format'] == 'json': + kgx_files_remote += self.get_kgx_json_format( + item['files'], item['version']) + elif item['format'] == 'jsonl': + kgx_files_remote += self.get_kgx_jsonl_format( + item['files'], item['version']) + else: + raise ValueError( + f"Unrecognized format in metadata.yaml: " + f"{item['format']}, valid formats are `json` " + f"and `jsonl`.") + # Fetchs kgx generated from Dug Annotation workflow. + new_files = self.fetch_dug_kgx() + kgx_files_remote + all_files_in_dir = ( + storage.kgx_objects("json") + + storage.kgx_objects("jsonl")) + files_to_remove = [x for x in all_files_in_dir + if x not in new_files] + if len(files_to_remove): + log.info( + "Found some old files to remove from kgx dir : %s", + files_to_remove) + for file in files_to_remove: + storage.remove(file) + log.info("removed %s", file) + log.info("Done.") + + + + def fetch_dug_kgx(self): + """ + Copies files from dug output dir to roger kgx dir. + :return: + """ + dug_kgx_files = storage.dug_kgx_objects() + all_kgx_files = [] + log.info("Copying dug KGX files to %s. Found %d kgx files to copy.", + storage.kgx_path(''), len(dug_kgx_files)) + for file in dug_kgx_files: + file_name = ntpath.basename(file) + dest = storage.kgx_path(file_name) + all_kgx_files.append(dest) + storage.write_object({}, dest) + log.info(f"Copying from {file} to {dest}.") + storage.copy_file_to_dir(file, dest) + log.info("Done copying dug KGX files.") + return all_kgx_files + + def create_nodes_schema(self, input_data_path=None, output_data_path=None): + """ + Extracts schema for nodes based on biolink leaf types + :return: + """ + + category_schemas = defaultdict(lambda: None) + category_error_nodes = set() + merged_nodes_file = storage.merged_objects("nodes", input_data_path) + log.info(f"Processing : {merged_nodes_file}") + counter = 0 + for node in storage.json_line_iter(merged_nodes_file): + # Debuging code + if counter % 10000 == 0: + log.info(f"Processing node : {node} counter : {counter}") + counter += 1 + + if not node.get('category'): + category_error_nodes.add(node['id']) + node['category'] = [BiolinkModel.root_type] + + # Get all leaf types of this node + node_types = list( + self.biolink.find_biolink_leaves(node['category'])) + # pick the fist one to work on + node_type = node_types[0] + + + # make sure it is defined in the final dict + category_schemas[node_type] = category_schemas.get(node_type, {}) + + # compute full list of attributes and the value types of the + # attributes for that type. + for k in node.keys(): + current_type = type(node[k]).__name__ + if k not in category_schemas[node_type]: + category_schemas[node_type][k] = current_type + else: + previous_type = category_schemas[node_type][k] + category_schemas[node_type][k] = compare_types( + previous_type, current_type) + + # copy over final result to every other leaf type + for tp in node_types: + category_schemas[tp] = category_schemas[node_type] + + + if len(category_error_nodes): + log.warning(f"some nodes didn't have category assigned. " + f"KGX file has errors." + f"Nodes {len(category_error_nodes)}." + f"Showing first 10: {list(category_error_nodes)[:10]}." 
+ f"These will be treated as {BiolinkModel.root_type}.") + + # Write node schemas. + self.write_schema(category_schemas, SchemaType.CATEGORY, output_path=output_data_path) + + def create_edges_schema(self, input_data_path=None, output_data_path=None): + """ + Create unified schema for all edges in an edges jsonl file. + :return: + """ + predicate_schemas = defaultdict(lambda: None) + merged_edges_file = storage.merged_objects("edges", input_data_path) + """ Infer predicate schemas. """ + for edge in storage.json_line_iter(merged_edges_file): + predicate = edge['predicate'] + predicate_schemas[predicate] = predicate_schemas.get(predicate, + {}) + for k in edge.keys(): + current_type = type(edge[k]).__name__ + if k not in predicate_schemas[predicate]: + predicate_schemas[predicate][k] = current_type + else: + previous_type = predicate_schemas[predicate][k] + predicate_schemas[predicate][k] = compare_types( + previous_type, current_type) + self.write_schema(predicate_schemas, SchemaType.PREDICATE, output_path=output_data_path) + + def create_schema (self): + """Determine the schema of each type of object. + + We have to do this to make it possible to write tabular data. Need to + know all possible columns in advance and correct missing fields. + """ + if self.schema_up_to_date(): + log.info (f"schema is up to date.") + return + + self.create_nodes_schema() + self.create_edges_schema() + + def schema_up_to_date (self): + return storage.is_up_to_date ( + source=storage.kgx_objects(), + targets=[ + storage.schema_path ( + f"{SchemaType.PREDICATE.value}-schema.json"), + storage.schema_path ( + f"{SchemaType.PREDICATE.value}-schema.json") + ]) + + def write_schema(self, schema, schema_type: SchemaType ,output_path=None): + """ Output the schema file. + + :param schema: Schema to get keys from. + :param schema_type: Type of schema to write. 
+ """ + file_name = storage.schema_path (f"{schema_type.value}-schema.json", output_path) + log.info("writing schema: %s", file_name) + dictionary = { k : v for k, v in schema.items () } + storage.write_object (dictionary, file_name) + + def merge(self, input_path=None, output_path=None): + """ This version uses the disk merging from the kg_utils module """ + + metrics = {} + start = time.time() + + log.info(f"Input path = {input_path}, Output path = {output_path}") + + if input_path: + json_format_files = storage.kgx_objects("json", input_path) + jsonl_format_files = storage.kgx_objects("jsonl", input_path) + else: + json_format_files = storage.kgx_objects("json") + jsonl_format_files = storage.kgx_objects("jsonl") + + # Create lists of the nodes and edges files in both json and jsonl + # formats + jsonl_node_files = {file for file in jsonl_format_files + if "node" in file.split('/')[-1]} + jsonl_edge_files = {file for file in jsonl_format_files + if "edge" in file.split('/')[-1]} + log.info(f"Jsonl edge files : {jsonl_edge_files}") + log.info(f"Jsonl node files : {jsonl_node_files}") + + # Create all the needed iterators and sets thereof + jsonl_node_iterators = [storage.jsonl_iter(file_name) + for file_name in jsonl_node_files] + jsonl_edge_iterators = [storage.jsonl_iter(file_name) + for file_name in jsonl_edge_files] + json_node_iterators = [storage.json_iter(file_name, 'nodes') + for file_name in json_format_files] + json_edge_iterators = [storage.json_iter(file_name, 'edges') + for file_name in json_format_files] + all_node_iterators = json_node_iterators + jsonl_node_iterators + all_edge_iterators = json_edge_iterators + jsonl_edge_iterators + + # chain the iterators together + node_iterators = chain(*all_node_iterators) + edge_iterators = chain(*all_edge_iterators) + + # now do the merge + self.merger.merge_nodes(node_iterators) + merged_nodes = self.merger.get_merged_nodes_jsonl() + + + self.merger.merge_edges(edge_iterators) + merged_edges = self.merger.get_merged_edges_jsonl() + + write_merge_metric = {} + t = time.time() + start_nodes_jsonl = time.time() + + + nodes_file_path = storage.merge_path("nodes.jsonl", output_path) + + # stream out nodes to nodes.jsonl file + with open(nodes_file_path, 'w') as stream: + for nodes in merged_nodes: + stream.write(nodes) + + time_difference = time.time() - start_nodes_jsonl + log.info("writing nodes took : %s", str(time_difference)) + write_merge_metric['nodes_writing_time'] = time_difference + start_edge_jsonl = time.time() + + # stream out edges to edges.jsonl file + edges_file_path = storage.merge_path("edges.jsonl", output_path) + with open(edges_file_path, 'w') as stream: + for edges in merged_edges: + edges = json.loads(edges) + # Add an id field for the edges as some of the downstream + # processing expects it. 
+ edges['id'] = xxh64_hexdigest( + edges['subject'] + edges['predicate'] + + edges['object'] + + edges.get("biolink:primary_knowledge_source", "")) + keys_to_del = set() + for key in edges: + if key.startswith('biolink:'): + keys_to_del.add(key) + for k in keys_to_del: + edges[k.replace('biolink:', '')] = edges[k] + del edges[k] + stream.write(json.dumps(edges).decode('utf-8') + '\n') + + write_merge_metric['edges_writing_time'] = time.time() - start_edge_jsonl + log.info(f"writing edges took: {time.time() - start_edge_jsonl}") + write_merge_metric['total_time'] = time.time() - t + metrics['write_jsonl'] = write_merge_metric + metrics['total_time'] = time.time() - start + log.info(f"total took: {time.time() - start}") + if self.enable_metrics: + metricsfile_path = storage.metrics_path('merge_metrics.yaml') + storage.write_object(metrics, metricsfile_path) + diff --git a/dags/roger/pipelines/README.md b/dags/roger/pipelines/README.md new file mode 100644 index 00000000..e77e6a29 --- /dev/null +++ b/dags/roger/pipelines/README.md @@ -0,0 +1,99 @@ +# Building custom Dug data pipelines + +The pipelines submodule is where data pipelines can be defined for specific data +sets with specific, custom behaviors for each one. In previous versions of the +code, customizations for each pipeline were spread across several modules. With +this instantiation, the customizations for each data set pipeline are +consolidated into a single overridden subclass of the DataPipeline class. + +## What the base pipeline does + +The function `roger.tasks.create_pipeline_taskgroup`, when called with the given +data pipeline class, will emit an Airflow task group with the following +structure. If Airflow is not being used, another executor should use a similarly +structured set of calls and dependencies to ensure that the task pipeline +executes fully and in order. + +```mermaid +graph TD; + annotate-->index_variables; + annotate-->validate_index_variables; + index_variables-->validate_index_variables; + annotate-->make_kg_tagged; + annotate-->crawl_tranql; + annotate-->index_concepts; + crawl_tranql-->validate_index_concepts; + index_concepts-->validate_index_concepts; + annotate-->validate_index_concepts; +``` +The pipeline steps are briefly described below + +### annotate + +By default, `annotate` will call the `get_objects` method to collect a list of +parsable files. For each of these files, a Dug Crawler object will be created +which will apply the parser returned by the pipeline class's `get_parser_name` +method. (This by default will return `parser_name` if it's defined, or will fall +back to `pipeline_name`.) The results will be written to `elements.json` and +`concepts.json` as appropriate. + +### index_variables + +This will load the `elements.json` files from `annotate` and pass them to the +indexer built from a DugFactory object. (This is sending them to ElasticSearch +for indexing under the hood.) + +### make_kg_tagged + +All `elements.json` files will be loaded, and based on the annotations, a +Translator-compliant knowledge graph will be written to a `_kgx.json` file. + +### index_concepts + +The `concepts.json` files are read and submitted to ElasticSearch using the +indexer object derived from the embedded DugFactory object. + +### validate_index_concepts + +Concepts from `concepts.json` are double-checked to ensure that the ES indexing +process actually worked. 
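+
+If Airflow is not being used, the same ordering can be reproduced with plain
+method calls on a pipeline instance. The sketch below is illustrative only: it
+assumes `get_default_config()` returns a fully populated `RogerConfig`, uses the
+BACPAC pipeline (defined in the next section) as a stand-in, and relies on the
+context-manager support in `roger/pipelines/base.py:DugPipeline`.
+
+```python
+from roger.config import get_default_config
+from roger.pipelines.bacpac import BacPacPipeline
+
+# Sequential stand-in for the Airflow task group: each call mirrors one node
+# of the diagram above and runs only after the steps it depends on.
+with BacPacPipeline(get_default_config()) as pipeline:
+    pipeline.annotate()                    # parse and annotate source files
+    pipeline.index_variables()             # send elements.json to ElasticSearch
+    pipeline.validate_indexed_variables()  # spot-check the variables index
+    pipeline.make_kg_tagged()              # write *_kgx.json knowledge graphs
+    pipeline.crawl_tranql()                # expand concepts via TranQL
+    pipeline.index_concepts()              # send expanded concepts to ElasticSearch
+    pipeline.validate_indexed_concepts()   # spot-check the concepts index
+```
+
+Note that actually running these steps requires the ElasticSearch, Redis, and
+TranQL services configured for Roger to be reachable; the sketch only
+illustrates the call order that an alternative orchestrator should preserve.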
+ +## Defining a basic pipeline, with no customizations + +Simple pipelines, such as that for the BACPAC dataset, need very little +customization. All pipelines must define a `pipeline_name`, which will be used +as the default value for a number of other parameters if they are not +defined. In the case of BACPAC, a difference in case means that both the +`pipeline_name` and the `parser_name` need to be defined. + +```python +from roger.pipelines import DugPipeline + +class BacPacPipeline(DugPipeline): + "Pipeline for BACPAC data set" + pipeline_name = "bacpac" + parser_name = "BACPAC" +``` + +This is the full extent of the code needed to adapt the DugPipeline object to +BACPAC. Other data sets have more specific customizations that need more custom +code or variables defined. + +## More extensive customization + +Because the base pipeline (defined in `roger/pipelines/base.py:DugPipeline`) is +inherited as a subclass for customizing, effectively any part of the pipeline +that isn't part of Dug proper can be overriden. Here are some common +customizations that are expected to be necessary for many parts of the process: + +### get_objects + +The `get_objects` method by default looks in the `input_data_path` that is +passed to it, and if that is None, loads the default from the `ROGER_DATA_DIR` +environment variable. By default, it reads all files with the `.xml` extension +recursively anywhere in that directory or its subdirectories. + +One example customization is the anvil data pipeline, which additionally +excludes any file that starts with 'GapExchange_'. Any overriden method should +accept an optional `input_data_path` parameter and return a list of files, +sorted in the order that they should be processed. diff --git a/dags/roger/pipelines/__init__.py b/dags/roger/pipelines/__init__.py new file mode 100644 index 00000000..d6664b8e --- /dev/null +++ b/dags/roger/pipelines/__init__.py @@ -0,0 +1,28 @@ +"Modules for individual datasets" + +import pkgutil +from pathlib import Path +import importlib + +from .base import DugPipeline + +def get_pipeline_classes(pipeline_names_dict): + """Return a list of all defined pipeline classes + """ + + base_path = Path(__file__).resolve().parent + + for (_, mod_name, _) in pkgutil.iter_modules([base_path]): + if mod_name == 'base': + continue + + # No need to actuall get the module symbol, once it's imported, it will + # show up below in __subclasses__. 
+ importlib.import_module(f"{__name__}.{mod_name}") + pipeline_list = [] + + for subclass in DugPipeline.__subclasses__(): + if getattr(subclass, 'pipeline_name') and getattr(subclass, 'pipeline_name') in pipeline_names_dict.keys(): + subclass.input_version = pipeline_names_dict[getattr(subclass, 'pipeline_name')] + pipeline_list.append(subclass) + return pipeline_list diff --git a/dags/roger/pipelines/anvil.py b/dags/roger/pipelines/anvil.py new file mode 100644 index 00000000..baa82c05 --- /dev/null +++ b/dags/roger/pipelines/anvil.py @@ -0,0 +1,24 @@ +"Pipeline for anvil data" + +from roger.pipelines import DugPipeline +from roger.core import storage + +class AnvilPipeline(DugPipeline): + "Pipeline for Anvil data set" + pipeline_name = 'anvil' + parser_name = 'Anvil' + + def get_objects(self, input_data_path=None): + """Retrieve anvil objects + + This code is imported from roger.core.storage.dug_anvil_objects + """ + if not input_data_path: + input_data_path = storage.dug_input_files_path( + self.files_dir) + files = storage.get_files_recursive( + lambda file_name: ( + not file_name.startswith('GapExchange_') + and file_name.endswith('.xml')), + input_data_path) + return sorted([str(f) for f in files]) diff --git a/dags/roger/pipelines/bacpac.py b/dags/roger/pipelines/bacpac.py new file mode 100644 index 00000000..495ba3b9 --- /dev/null +++ b/dags/roger/pipelines/bacpac.py @@ -0,0 +1,8 @@ +"Pipeline for BACPAC data" + +from roger.pipelines import DugPipeline + +class BacPacPipeline(DugPipeline): + "Pipeline for BACPAC data set" + pipeline_name = "bacpac" + parser_name = "BACPAC" diff --git a/dags/roger/pipelines/base.py b/dags/roger/pipelines/base.py new file mode 100644 index 00000000..e108443e --- /dev/null +++ b/dags/roger/pipelines/base.py @@ -0,0 +1,994 @@ +"Base class for implementing a dataset annotate, crawl, and index pipeline" + +import os +import asyncio +from io import StringIO +import logging +import re +import hashlib +import traceback +from functools import reduce +from pathlib import Path +import tarfile +from typing import Union +import jsonpickle + +import requests + +from dug.core import get_parser, get_annotator, get_plugin_manager, DugConcept +from dug.core.concept_expander import ConceptExpander +from dug.core.crawler import Crawler +from dug.core.factory import DugFactory +from dug.core.parsers import Parser, DugElement +from dug.core.annotators import Annotator +from dug.core.async_search import Search +from dug.core.index import Index + +from roger.config import RogerConfig +from roger.core import storage +from roger.models.biolink import BiolinkModel +from roger.logger import get_logger + +from utils.s3_utils import S3Utils + +log = get_logger() + +class PipelineException(Exception): + "Exception raised from DugPipeline and related classes" + +def make_edge(subj, + obj, + predicate='biolink:related_to', + predicate_label='related to', + relation='biolink:related_to', + relation_label='related to' + ): + """Create an edge between two nodes. + + :param subj: The identifier of the subject. + :param pred: The predicate linking the subject and object. + :param obj: The object of the relation. + :param predicate: Biolink compatible edge type. + :param predicate_label: Edge label. + :param relation: Ontological edge type. + :param relation_label: Ontological edge type label. + :returns: Returns and edge. 
+ """ + edge_id = hashlib.md5( + f'{subj}{predicate}{obj}'.encode('utf-8')).hexdigest() + return { + "subject": subj, + "predicate": predicate, + "predicate_label": predicate_label, + "id": edge_id, + "relation": relation, + "relation_label": relation_label, + "object": obj, + "provided_by": "renci.bdc.semanticsearch.annotator" + } + +class FileFetcher: + """A basic remote file fetcher class + """ + + def __init__( + self, + remote_host: str, + remote_dir: Union[str, Path], + local_dir: Union[str, Path] = "." + ): + self.remote_host = remote_host + if isinstance(remote_dir, str): + self.remote_dir = remote_dir.rstrip("/") + else: + self.remote_dir = str(remote_dir.as_posix()) + self.local_dir = Path(local_dir).resolve() + + def __call__(self, remote_file_path: Union[str, Path]) -> Path: + remote_path = self.remote_dir + "/" + remote_file_path + local_path = self.local_dir / remote_file_path + url = f"{self.remote_host}{remote_path}" + log.debug("Fetching %s", url) + try: + response = requests.get(url, allow_redirects=True, timeout=60) + except Exception as e: + log.error("Unexpected %s: %s", e.__class__.__name__, str(e)) + raise RuntimeError(f"Unable to fetch {url}") from e + + log.debug("Response: %d", response.status_code) + if response.status_code != 200: + log.debug("Unable to fetch %s: %d", url, response.status_code) + raise RuntimeError(f"Unable to fetch {url}") + + with local_path.open('wb') as file_obj: + file_obj.write(response.content) + return local_path + +class DugPipeline(): + "Base class for dataset pipelines" + + pipeline_name = None + unzip_source = True + input_version = "" + + def __init__(self, config: RogerConfig, to_string=False): + "Set instance variables and check to make sure we're overriden" + if not self.pipeline_name: + raise PipelineException( + "Subclass must at least define pipeline_name as class var") + self.config = config + self.bl_toolkit = BiolinkModel() + dug_conf = config.to_dug_conf() + self.element_mapping = config.indexing.element_mapping + self.factory = DugFactory(dug_conf) + self.cached_session = self.factory.build_http_session() + self.event_loop = asyncio.new_event_loop() + self.log_stream = StringIO() + if to_string: + self.string_handler = logging.StreamHandler(self.log_stream) + log.addHandler(self.string_handler) + self.s3_utils = S3Utils(self.config.s3_config) + + self.tranqlizer: ConceptExpander = self.factory.build_tranqlizer() + + graph_name = self.config["redisgraph"]["graph"] + source = f"redis:{graph_name}" + self.tranql_queries: dict = self.factory.build_tranql_queries(source) + self.node_to_element_queries: list = ( + self.factory.build_element_extraction_parameters(source)) + + indexing_config = config.indexing + self.variables_index = indexing_config.get('variables_index') + self.concepts_index = indexing_config.get('concepts_index') + self.kg_index = indexing_config.get('kg_index') + + self.search_obj: Search = self.factory.build_search_obj([ + self.variables_index, + self.concepts_index, + self.kg_index, + ]) + self.index_obj: Index = self.factory.build_indexer_obj([ + self.variables_index, + self.concepts_index, + self.kg_index, + + ]) + + def __enter__(self): + self.event_loop = asyncio.new_event_loop() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + # close elastic search connection + self.event_loop.run_until_complete(self.search_obj.es.close()) + # close async loop + if self.event_loop.is_running() and not self.event_loop.is_closed(): + self.event_loop.close() + if exc_type or exc_val or exc_tb: + 
traceback.print_exc() + log.error("%s %s %s", exc_val, exc_val, exc_tb) + log.exception("Got an exception") + + def get_data_format(self): + """Access method for data_format parameter + + Defaults to pipeline_name unless self.data_format is set. This method + can also be overriden + """ + return getattr(self, 'data_format', self.pipeline_name) + + def get_files_dir(self): + """Access method for files_dir parameter + + Defaults to pipeline_name unless self.files_dir is set. This method can + also be overriden. + """ + return getattr(self, 'files_dir', self.pipeline_name) + + def get_parser_name(self): + """Access method for parser_name + + Defaults to pipeline_name unless self.parser_name is set. This method + can also be overriden. + """ + return getattr(self, 'parser_name', self.pipeline_name) + + def get_annotator_name(self): + """ + Access method for annotator_name + Defaults to annotator_monarch unless specified using annotation.annotator_type in the configuration file. + """ + return self.config.annotation.annotator_type + + + def get_parser(self): + dug_plugin_manager = get_plugin_manager() + parser: Parser = get_parser(dug_plugin_manager.hook, + self.get_parser_name()) + return parser + + def get_annotator(self): + dug_plugin_manager = get_plugin_manager() + annotator: Annotator = get_annotator( + dug_plugin_manager.hook, + self.get_annotator_name(), + self.config.to_dug_conf() + ) + return annotator + + def init_annotator(self, max_retries=5, base_delay=1, max_delay=10): + attempt = 0 + while attempt < max_retries: + try: + log.info("Initializing annotator") + annotator = self.get_annotator() + return annotator # success + except Exception as e: + attempt += 1 + if attempt == max_retries: + log.error("Max retries reached when creating annotator. Failing with error: %s", e) + raise + delay = min(base_delay * (2 ** (attempt - 1)), max_delay) + delay += random.uniform(0, 1) # add jitter + log.warning("Error occurred: %s. Retrying in %.2f seconds...", e, delay) + time.sleep(delay) + + + def annotate_files(self, parsable_files, output_data_path=None): + """ + Annotates a Data element file using a Dug parser. + :param parser_name: Name of Dug parser to use. + :param parsable_files: Files to parse. + :return: None. + """ + if not output_data_path: + output_data_path = storage.dug_annotation_path('') + log.info("Parsing files") + log.info("Intializing parser") + parser = self.get_parser() + log.info("Done intializing parser") + annotator = self.init_annotator() + log.info("Done intializing annotator") + for _, parse_file in enumerate(parsable_files): + log.debug("Creating Dug Crawler object on parse_file %s at %d of %d", + parse_file, _ , len(parsable_files)) + crawler = Crawler( + crawl_file=parse_file, + parser=parser, + annotator=annotator, + tranqlizer='', + tranql_queries=[], + http_session=self.cached_session + ) + + # configure output space. + current_file_name = '.'.join( + os.path.basename(parse_file).split('.')[:-1]) + elements_file_path = os.path.join( + output_data_path, current_file_name) + elements_file = os.path.join(elements_file_path, 'elements.txt') + concepts_file = os.path.join(elements_file_path, 'concepts.txt') + + # Use the specified parser to parse the parse_file into elements. + log.debug("Parser is %s", str(parser)) + elements = parser(parse_file) + log.debug("Parsed elements: %s", str(elements)) + + # This inserts the list of elements into the crawler where + # annotate_elements expects to find it. 
Maybe in some future version + # of Dug this could be a parameter instead of an attribute? + crawler.elements = elements + + # @TODO propose for Dug to make this a crawler class init param(??) + crawler.crawlspace = elements_file_path + log.debug("Crawler annotator: %s", str(crawler.annotator)) + crawler.annotate_elements() + + # Extract out the concepts gotten out of annotation + # Extract out the elements + non_expanded_concepts = crawler.concepts + # The elements object will have been modified by annotate_elements, + # so we want to make sure to catch those modifications. + elements = crawler.elements + + # Write pickles of objects to file + log.info("Parsed and annotated: %s", parse_file) + + storage.write_object(jsonpickle.encode(elements, indent=2), elements_file) + log.info("Serialized annotated elements to : %s", elements_file) + + storage.write_object(jsonpickle.encode(non_expanded_concepts, indent=2), concepts_file) + log.info("Serialized annotated concepts to : %s", concepts_file) + + def convert_to_kgx_json(self, elements, written_nodes=None): + """ + Given an annotated and normalized set of study variables, + generate a KGX compliant graph given the normalized annotations. + Write that grpah to a graph database. + See BioLink Model for category descriptions. + https://biolink.github.io/biolink-model/notes.html + """ + if written_nodes is None: + written_nodes = set() + graph = { + "nodes": [], + "edges": [] + } + edges = graph['edges'] + nodes = graph['nodes'] + + for _, element in enumerate(elements): + # DugElement means a variable (Study variable...) + if not isinstance(element, DugElement): + continue + study_id = element.collection_id + study_link = element.collection_action + study_desc = element.collection_desc + study_name = element.collection_name or element.collection_id + + + if study_id not in written_nodes: + nodes.append({ + "id": study_id, + "category": ["biolink:Study"], + "name": study_name, + "url": study_link, + "description": study_desc + }) + written_nodes.add(study_id) + + # connect the study and the variable. + edges.append(make_edge( + subj=element.id, + relation_label='part of', + relation='BFO:0000050', + obj=study_id, + predicate='biolink:part_of', + predicate_label='part of')) + edges.append(make_edge( + subj=study_id, + relation_label='has part', + relation="BFO:0000051", + obj=element.id, + predicate='biolink:has_part', + predicate_label='has part')) + + # a node for the variable. Should be BL compatible + variable_node = { + "id": element.id, + "name": element.name, + "category": ["biolink:StudyVariable"], + # bulk loader parsing issue + "description": ( + element.description.replace("'", '`').replace('\n', ' ')) + } + if element.id not in written_nodes: + nodes.append(variable_node) + written_nodes.add(element.id) + + for identifier, metadata in element.concepts.items(): + identifier_object = metadata.identifiers.get(identifier) + # This logic is treating DBGap files. + # First item in current DBGap xml files is a topmed tag, + # This is treated as a DugConcept Object. But since its not + # a concept we get from annotation (?) its never added to + # variable.concepts.items (Where variable is a DugElement obj) + # The following logic is trying to extract types, and for the + # aformentioned topmed tag it adds + # `biolink:InfomrmationContentEntity` + # Maybe a better solution could be adding types on + # DugConcept objects + # More specifically Biolink compatible types (?) 
+ # + if identifier_object: + category = identifier_object.types + elif identifier.startswith("TOPMED.TAG:"): + category = ["biolink:InformationContentEntity"] + else: + continue + if identifier not in written_nodes: + if isinstance(category, str): + bl_element = self.bl_toolkit.toolkit.get_element( + category) + category = [bl_element.class_uri or bl_element.slot_uri] + nodes.append({ + "id": identifier, + "category": category, + "name": metadata.name + }) + written_nodes.add(identifier) + # related to edge + edges.append(make_edge( + subj=element.id, + obj=identifier + )) + # related to edge + edges.append(make_edge( + subj=identifier, + obj=element.id)) + return graph + + def make_tagged_kg(self, elements): + """ Make a Translator standard knowledge graph representing + tagged study variables. + :param variables: The variables to model. + :param tags: The tags characterizing the variables. + :returns: dict with nodes and edges modeling a Translator/Biolink KG. + """ + graph = { + "nodes": [], + "edges": [] + } + edges = graph['edges'] + nodes = graph['nodes'] + + # Create graph elements to model tags and their + # links to identifiers gathered by semantic tagging + tag_map = {} + # @TODO extract this into config or maybe dug ?? + topmed_tag_concept_type = "TOPMed Phenotype Concept" + nodes_written = set() + for tag in elements: + if not (isinstance(tag, DugConcept) + and tag.type == topmed_tag_concept_type): + continue + tag_id = tag.id + tag_map[tag_id] = tag + nodes.append({ + "id": tag_id, + "name": tag.name, + "description": tag.description.replace("'", "`"), + "category": ["biolink:InformationContentEntity"] + }) + + # Link ontology identifiers we've found for this tag via nlp. + for identifier, metadata in tag.identifiers.items(): + if isinstance(metadata.types, str): + bl_element = self.bl_toolkit.toolkit.get_element( + metadata.types) + category = [bl_element.class_uri or bl_element.slot_uri] + else: + category = metadata.types + synonyms = metadata.synonyms if metadata.synonyms else [] + nodes.append({ + "id": identifier, + "name": metadata.label, + "category": category, + "synonyms": synonyms + }) + nodes_written.add(identifier) + edges.append(make_edge( + subj=tag_id, + obj=identifier)) + edges.append(make_edge( + subj=identifier, + obj=tag_id)) + + concepts_graph = self.convert_to_kgx_json(elements, + written_nodes=nodes_written) + graph['nodes'] += concepts_graph['nodes'] + graph['edges'] += concepts_graph['edges'] + + return graph + + def index_elements(self, elements_file): + "Submit elements_file to ElasticSearch for indexing " + log.info("Indexing %s...", str(elements_file)) + elements =jsonpickle.decode(storage.read_object(elements_file)) + count = 0 + total = len(elements) + # Index Annotated Elements + log.info("found %d from elements files.", len(elements)) + for element in elements: + count += 1 + # Only index DugElements as concepts will be + # indexed differently in next step + if not isinstance(element, DugConcept): + # override data-type with mapping values + if element.type.lower() in self.element_mapping: + element.type = self.element_mapping[element.type.lower()] + if not element.id: + # no id no indexing + continue + # Use the Dug Index object to submit the element to ES + self.index_obj.index_element( + element, index=self.variables_index) + percent_complete = (count / total) * 100 + if percent_complete % 10 == 0: + log.info("%d %%", percent_complete) + log.info("Done indexing %s.", elements_file) + + def validate_indexed_element_file(self, elements_file): 
+ "After submitting elements for indexing, verify that they're available" + elements = [x for x in jsonpickle.decode(storage.read_object(elements_file)) + if not isinstance(x, DugConcept)] + # Pick ~ 10 % + sample_size = int(len(elements) * 0.1) + + # random.choices(elements, k=sample_size) + test_elements = elements[:sample_size] + log.info("Picked %d from %s for validation.", len(test_elements), + elements_file) + for element in test_elements: + # Pick a concept + concepts = [element.concepts[curie] for curie in element.concepts + if element.concepts[curie].name] + + if len(concepts): + # Pick the first concept + concept = concepts[0] + curie = concept.id + search_term = re.sub(r'[^a-zA-Z0-9_\ ]+', '', concept.name) + log.debug("Searching for Concept: %s and Search term: %s", + str(curie), search_term) + all_elements_ids = self._search_elements(curie, search_term) + present = element.id in all_elements_ids + if not present: + log.error("Did not find expected variable %s in search " + "result.", str(element.id)) + log.error("Concept id : %s, Search term: %s", + str(concept.id), search_term) + raise PipelineException( + f"Validation exception - did not find variable " + f"{element.id} from {str(elements_file)}" + f"when searching variable index with Concept ID : " + f"{concept.id} using Search Term : {search_term} ") + else: + log.info( + "%s has no concepts annotated. Skipping validation for it.", + str(element.id)) + + def _search_elements(self, curie, search_term): + "Asynchronously call a search on the curie and search term" + response = self.event_loop.run_until_complete(self.search_obj.search_vars_unscored( + concept=curie, + query=search_term + )) + ids_dict = [] + if 'total_items' in response: + if response['total_items'] == 0: + log.error(f"No search elements returned for variable search: {self.variables_index}.") + log.error(f"Concept id : {curie}, Search term: {search_term}") + raise Exception(f"Validation error - Did not find {curie} for" + f"Search term: {search_term}") + else: + del response['total_items'] + for element_type in response: + all_elements_ids = [e['id'] for e in + reduce(lambda x, y: x + y['elements'], response[element_type], [])] + ids_dict += all_elements_ids + return ids_dict + + def crawl_concepts(self, concepts, data_set_name, output_path=None): + """Adds tranql KG to Concepts + + Terms grabbed from KG are also added as search terms + :param concepts: + :param data_set_name: + :return: + """ + # TODO crawl dir seems to be storaing crawling info to avoid re-crawling, but is that consting us much? , it was when tranql was slow, but + # might right to consider getting rid of it. 
+ crawl_dir = storage.dug_crawl_path('crawl_output') + output_file_name = os.path.join(data_set_name, + 'expanded_concepts.txt') + extracted_dug_elements_file_name = os.path.join(data_set_name, + 'extracted_graph_elements.txt') + if not output_path: + output_file = storage.dug_expanded_concepts_path(output_file_name) + extracted_output_file = storage.dug_expanded_concepts_path( + extracted_dug_elements_file_name + ) + else: + output_file = os.path.join(output_path, output_file_name) + extracted_output_file = os.path.join( output_path, extracted_dug_elements_file_name) + + Path(crawl_dir).mkdir(parents=True, exist_ok=True) + extracted_dug_elements = [] + log.debug("Creating Dug Crawler object") + crawler = Crawler( + crawl_file="", + parser=None, + annotator=None, + tranqlizer=self.tranqlizer, + tranql_queries=self.tranql_queries, + http_session=self.cached_session, + ) + crawler.crawlspace = crawl_dir + counter = 0 + total = len(concepts) + for concept in concepts.values(): + counter += 1 + try: + crawler.expand_concept(concept) + concept.set_search_terms() + concept.set_optional_terms() + except Exception as e: + log.error(concept) + raise e + for query in self.node_to_element_queries: + log.info(query) + casting_config = query['casting_config'] + tranql_source = query['tranql_source'] + dug_element_type = query['output_dug_type'] + extracted_dug_elements += crawler.expand_to_dug_element( + concept=concept, + casting_config=casting_config, + dug_element_type=dug_element_type, + tranql_source=tranql_source + ) + concept.clean() + percent_complete = int((counter / total) * 100) + if percent_complete % 10 == 0: + log.info("%d%%", percent_complete) + log.info("Crawling %s done", data_set_name) + storage.write_object(obj=jsonpickle.encode(concepts, indent=2), path=output_file) + log.info ("Concepts serialized to %s", output_file) + storage.write_object(obj=jsonpickle.encode(extracted_dug_elements, indent=2), + path=extracted_output_file) + log.info("Extracted elements serialized to %s", extracted_output_file) + + def _index_concepts(self, concepts): + "Submit concepts to ElasticSearch for indexing" + log.info("Indexing Concepts") + total = len(concepts) + count = 0 + for concept_id, concept in concepts.items(): + count += 1 + self.index_obj.index_concept(concept, index=self.concepts_index) + # Index knowledge graph answers for each concept + for kg_answer_id, kg_answer in concept.kg_answers.items(): + self.index_obj.index_kg_answer( + concept_id=concept_id, + kg_answer=kg_answer, + index=self.kg_index, + id_suffix=kg_answer_id + ) + percent_complete = int((count / total) * 100) + if percent_complete % 10 == 0: + log.info("%s %%", percent_complete) + log.info("Done Indexing concepts") + + def _validate_indexed_concepts(self, elements, concepts): + """ + Validates linked concepts are searchable + :param elements: Annotated dug elements + :param concepts: Crawled (expanded) concepts + :return: + """ + # 1 . Find concepts with KG <= 10% of all concepts, + # <= because we might have no results for some concepts from tranql + sample_concepts = {key: value for key, value + in concepts.items() if value.kg_answers} + if len(concepts) == 0: + log.info("No Concepts found.") + return + log.info("Found only %d Concepts with Knowledge graph out of %d. %d%%", + len(sample_concepts), len(concepts), + (len(sample_concepts) / len(concepts)) * 100) + # 2. 
pick elements that have concepts in the sample concepts set + sample_elements = {} + for element in elements: + if isinstance(element, DugConcept): + continue + for concept in element.concepts: + # add elements that have kg + if concept in sample_concepts: + sample_elements[concept] = sample_elements.get( + concept, set()) + sample_elements[concept].add(element.id) + + # Time for some validation + for curie in concepts: + concept = concepts[curie] + if not concept.kg_answers: + continue + search_terms = [] + for key in concept.kg_answers: + kg_object = concept.kg_answers[key] + search_terms += kg_object.get_node_names() + search_terms += kg_object.get_node_synonyms() + # reduce(lambda x,y: x + y, [[node.get("name")] + # + node.get("synonyms", []) + # for node in concept.kg_answers[ + # "knowledge_graph"]["nodes"]], []) + # validation here is that for any of these nodes we should get back + # the variable. + # make unique + search_terms_cap = 10 + search_terms = list(set(search_terms))[:search_terms_cap] + log.debug("Using %d Search terms for concept %s", len(search_terms), + str(curie)) + for search_term in search_terms: + # avoids elastic failure due to some reserved characters + # 'search_phase_execution_exception', + # 'token_mgr_error: Lexical error ... + search_term = re.sub(r'[^a-zA-Z0-9_\ ]+', '', search_term) + + searched_element_ids = self._search_elements(curie, search_term) + + if curie not in sample_elements: + log.error("Did not find Curie id %s in Elements.", + str(curie)) + log.error("Concept id : %s, Search term: %s", + str(concept.id), search_term) + raise PipelineException( + f"Validation error - Did not find {curie} for " + f"Concept id : {concept.id}, " + f"Search term: {search_term}") + + present = bool([x for x in sample_elements[curie] + if x in searched_element_ids]) + if not present: + log.error("Did not find expected variable %s " + "in search result.", + str(curie)) + log.error("Concept id : %s, Search term: %s", + str(concept.id), search_term) + raise PipelineException( + f"Validation error - Did not find {curie} for" + f" Concept id : {concept.id}, " + f"Search term: {search_term}") + + def clear_index(self, index_id): + "Delete the index specified by index_id from ES" + exists = self.event_loop.run_until_complete(self.search_obj.es.indices.exists(index=index_id)) + if exists: + log.info("Deleting index %s", str(index_id)) + response = self.event_loop.run_until_complete( + self.search_obj.es.indices.delete(index=index_id)) + log.info("Cleared Elastic : %s", str(response)) + log.info("Re-initializing the indicies") + self.index_obj.init_indices() + + def clear_variables_index(self): + "Delete the variables index from ES" + self.clear_index(self.variables_index) + + def clear_kg_index(self): + "Delete the KG index from ES" + self.clear_index(self.kg_index) + + def clear_concepts_index(self): + "Delete the concepts index from ES" + self.clear_index(self.concepts_index) + + #### + # Methods above this are directly from what used to be + # dug_helpers.dug_utils.Dug. Methods below are consolidated from what used + # to be dug_helpers.dug_utils.DugUtil. These are intented to be the "top + # level" interface to Roger, which Airflow DAGs or other orchestrators can + # call directly. 
+ + def _fetch_s3_file(self, filename, output_dir): + "Fetch a file from s3 to output_dir" + log.info("Fetching %s", filename) + output_name = filename.split('/')[-1] + output_path = output_dir / output_name + self.s3_utils.get( + str(filename), + str(output_path), + ) + if self.unzip_source: + log.info("Unzipping %s", str(output_path)) + with tarfile.open(str(output_path)) as tar: + tar.extractall(path=output_dir) + return output_path + + def _fetch_remote_file(self, filename, output_dir, current_version): + "Fetch a file from a location using FileFetcher" + log.info("Fetching %s", filename) + # fetch from stars + remote_host = self.config.annotation_base_data_uri + fetch = FileFetcher( + remote_host=remote_host, + remote_dir=current_version, + local_dir=output_dir) + output_path = fetch(filename) + if self.unzip_source: + log.info("Unzipping %s", str(output_path)) + with tarfile.open(str(output_path)) as tar: + tar.extractall(path=output_dir) + return output_path + + def get_versioned_files(self): + """ Fetches a dug input data files to input file directory + """ + meta_data = storage.read_relative_object("../../metadata.yaml") + output_dir: Path = storage.dug_input_files_path( + self.get_files_dir()) + data_store = self.config.dug_inputs.data_source + + # clear dir + storage.clear_dir(output_dir) + data_sets = self.config.dug_inputs.data_sets + log.info("dataset: %s", data_sets) + pulled_files = [] + for data_set in data_sets: + data_set_name, current_version = data_set.split(':') + for item in meta_data["dug_inputs"]["versions"]: + if (item["version"] == current_version and + item["name"] == data_set_name and + item["format"] == self.get_data_format()): + if data_store == "s3": + for filename in item["files"]["s3"]: + pulled_files.append( + self._fetch_s3_file(filename, output_dir)) + else: + for filename in item["files"]["stars"]: + pulled_files.append( + self.fetch_remote_file(filename, output_dir, + current_version)) + return [str(filename) for filename in pulled_files] + + def get_objects(self, input_data_path=None): + """Retrieve initial source objects for parsing + + This is a default method that will be overridden by subclasses + frequently, it is expected. + """ + if not input_data_path: + input_data_path = storage.dug_input_files_path( + self.get_files_dir()) + files = storage.get_files_recursive( + lambda file_name: file_name.endswith('.xml'), + input_data_path) + return sorted([str(f) for f in files]) + + def annotate(self, to_string=False, files=None, input_data_path=None, + output_data_path=None): + "Annotate files with the appropriate parsers and crawlers" + if files is None: + files = self.get_objects(input_data_path=input_data_path) + self.annotate_files(parsable_files=files, + output_data_path=output_data_path) + output_log = self.log_stream.getvalue() if to_string else '' + return output_log + + def index_variables(self, to_string=False, element_object_files=None, + input_data_path=None, output_data_path=None): + """Index variables from element object files for pipeline + + if element_object_files is specified, only those files are + indexed. Otherwise, if the input_data_path is supplied, elements files + under that path are indexed. If neither is supplied, the default + directory is searched for index files and those are indexed. 
+ """ + # self.clear_variables_index() + if element_object_files is None: + element_object_files = storage.dug_elements_objects(input_data_path,format='txt') + for file_ in element_object_files: + self.index_elements(file_) + output_log = self.log_stream.getvalue() if to_string else '' + return output_log + + def validate_indexed_variables(self, to_string=None, + element_object_files=None, + input_data_path=None, + output_data_path=None): + "Validate output from index variables task for pipeline" + if not element_object_files: + element_object_files = storage.dug_elements_objects(input_data_path, format='txt') + for file_ in element_object_files: + log.info("Validating %s", str(file_)) + self.validate_indexed_element_file(file_) + output_log = self.log_stream.getvalue() if to_string else '' + return output_log + + def validate_indexed_concepts(self, config=None, to_string=None, input_data_path=None, output_data_path=None): + """ + Entry for validate concepts + """ + get_data_set_name = lambda file: os.path.split(os.path.dirname(file))[-1] + expanded_concepts_files_dict = { + get_data_set_name(file): file for file in storage.dug_expanded_concept_objects(data_path=input_data_path, format='txt') + } + annotated_elements_files_dict = { + get_data_set_name(file): file for file in storage.dug_elements_objects(data_path=input_data_path, format='txt') + } + try: + assert len(expanded_concepts_files_dict) == len(annotated_elements_files_dict) + except: + log.error("Files Annotated Elements files and Expanded concepts files, should be pairs") + if len(expanded_concepts_files_dict) > len(annotated_elements_files_dict): + log.error("Some Annotated Elements files (from load_and_annotate task) are missing") + else: + log.error("Some Expanded Concepts files (from crawl task) are missing") + log.error(f"Annotated Datasets : {list(annotated_elements_files_dict.keys())}") + log.error(f"Expanded Concepts Datasets: {list(expanded_concepts_files_dict.keys())}") + exit(-1) + for data_set_name in annotated_elements_files_dict: + log.debug(f"Reading concepts and elements for dataset {data_set_name}") + elements_file_path = annotated_elements_files_dict[data_set_name] + concepts_file_path = expanded_concepts_files_dict[data_set_name] + dug_elements = jsonpickle.decode(storage.read_object(elements_file_path)) + dug_concepts = jsonpickle.decode(storage.read_object(concepts_file_path)) + log.debug(f"Read {len(dug_elements)} elements, and {len(dug_concepts)} Concepts") + log.info(f"Validating {data_set_name}") + self._validate_indexed_concepts(elements=dug_elements, concepts=dug_concepts) + output_log = self.log_stream.getvalue() if to_string else '' + return output_log + + def make_kg_tagged(self, to_string=False, elements_files=None, + input_data_path=None, output_data_path=None): + "Create tagged knowledge graphs from elements" + if not output_data_path: + output_data_path = storage.dug_kgx_path("") + storage.clear_dir(output_data_path) + log.info("Starting building KGX files") + + if not elements_files: + elements_files = storage.dug_elements_objects(input_data_path, format='txt') + log.info(f"found {len(elements_files)} files : {elements_files}") + for file_ in elements_files: + elements = jsonpickle.decode(storage.read_object(file_)) + if "topmed_" in file_: + kg = self.make_tagged_kg(elements) + else: + kg = self.convert_to_kgx_json(elements) + dug_base_file_name = file_.split(os.path.sep)[-2] + output_file_path = os.path.join(output_data_path, + dug_base_file_name + '_kgx.json') + storage.write_object(kg, 
output_file_path) + log.info("Wrote %d and %d edges, to %s", len(kg['nodes']), + len(kg['edges']), output_file_path) + output_log = self.log_stream.getvalue() if to_string else '' + return output_log + + def crawl_tranql(self, to_string=False, concept_files=None, + input_data_path=None, output_data_path=None): + "Perform the tranql crawl" + if not concept_files: + concept_files = storage.dug_concepts_objects(input_data_path, format='txt') + + if output_data_path: + crawl_dir = os.path.join(output_data_path, 'crawl_output') + expanded_concepts_dir = os.path.join(output_data_path, + 'expanded_concepts') + else: + crawl_dir = storage.dug_crawl_path('crawl_output') + expanded_concepts_dir = storage.dug_expanded_concepts_path("") + log.info("Clearing crawl output dir %s", crawl_dir) + storage.clear_dir(crawl_dir) + + log.info("Clearing expanded concepts dir: %s", expanded_concepts_dir) + storage.clear_dir(expanded_concepts_dir) + + log.info("Crawling Dug Concepts, found %d file(s).", + len(concept_files)) + for file_ in concept_files: + objects = storage.read_object(file_) + objects = objects or {} + if not objects: + log.info(f'no concepts in {file_}') + data_set = jsonpickle.decode(objects) + original_variables_dataset_name = os.path.split( + os.path.dirname(file_))[-1] + self.crawl_concepts(concepts=data_set, + data_set_name=original_variables_dataset_name, output_path= output_data_path) + output_log = self.log_stream.getvalue() if to_string else '' + return output_log + + def index_concepts(self, to_string=False, + input_data_path=None, output_data_path=None): + "Index concepts from expanded concept files" + # These are concepts that have knowledge graphs from tranql + # clear out concepts and kg indicies from previous runs + # self.clear_concepts_index() + # self.clear_kg_index() + expanded_concepts_files = storage.dug_expanded_concept_objects( + input_data_path, format="txt") + for file_ in expanded_concepts_files: + concepts = jsonpickle.decode(storage.read_object(file_)) + self._index_concepts(concepts=concepts) + + if self.config.indexing.node_to_element_queries: + log.info("*******************") + + extracted_elements_files = storage.dug_extracted_elements_objects(data_path=input_data_path) + log.info(f"{extracted_elements_files}") + for file_ in extracted_elements_files: + log.info(f"reading file {file_}") + self.index_elements(file_) + output_log = self.log_stream.getvalue() if to_string else '' + return output_log diff --git a/dags/roger/pipelines/bdc.py b/dags/roger/pipelines/bdc.py new file mode 100644 index 00000000..bc30cf44 --- /dev/null +++ b/dags/roger/pipelines/bdc.py @@ -0,0 +1,19 @@ +"Pipeline for BDC-dbGap data" + +from roger.pipelines import DugPipeline +from roger.core import storage + +class bdcPipeline(DugPipeline): + "Pipeline for BDC-dbGap data set" + pipeline_name = "bdc" + parser_name = "dbgap" + + def get_objects(self, input_data_path=None): + if not input_data_path: + input_data_path = storage.dug_dd_xml_path() + files = storage.get_files_recursive( + lambda file_name: ( + not file_name.startswith('._') + and file_name.endswith('.xml')), + input_data_path) + return sorted([str(f) for f in files]) diff --git a/dags/roger/pipelines/bdc_pipelines.py b/dags/roger/pipelines/bdc_pipelines.py new file mode 100644 index 00000000..d4c6436d --- /dev/null +++ b/dags/roger/pipelines/bdc_pipelines.py @@ -0,0 +1,58 @@ +"Dug pipeline for dbGaP data set" + +from roger.pipelines import DugPipeline + +class BIOLINCCdbGaPPipeline(DugPipeline): + "Pipeline for the dbGaP data set" 
+ pipeline_name = 'bdc-biolincc' + parser_name = 'biolincc' + + +class covid19dbGaPPipeline(DugPipeline): + "Pipeline for the dbGaP data set" + pipeline_name = 'bdc-covid19' + parser_name = 'covid19' + +class dirDbGaPPipeline(DugPipeline): + pipeline_name = "bdc-dir" + parser_name = "dir" + +class LungMapDbGaPPipeline(DugPipeline): + pipeline_name = "bdc-lungmap" + parser_name = "lungmap" + +class nsrrDbGaPPipeline(DugPipeline): + pipeline_name = "bdc-nsrr" + parser_name = "nsrr" + +class ParentDbGaPPipeline(DugPipeline): + pipeline_name = "bdc-parent" + parser_name = "parent" + +class PCGCDbGaPPipeline(DugPipeline): + pipeline_name = "pcgc-dbgap" + parser_name = "pcgc" + +class RecoverDbGaPPipeline(DugPipeline): + pipeline_name = "bdc-recover" + parser_name = "recover" + +class TopmedDBGaPPipeline(DugPipeline): + pipeline_name = "bdc-topmed" + parser_name = "topmeddbgap" + +class CureSCPipeline(DugPipeline): + pipeline_name = "bdc-curesc" + parser_name = "curesc" + +class HeartFailurePipeline(DugPipeline): + pipeline_name = "bdc-heartfailure" + parser_name = "heartfailure" + +class ImagingPipeline(DugPipeline): + pipeline_name = "bdc-imaging" + parser_name = "imaging" + +class RedsPipeline(DugPipeline): + pipeline_name = "bdc-reds" + parser_name = "reds" \ No newline at end of file diff --git a/dags/roger/pipelines/crdc.py b/dags/roger/pipelines/crdc.py new file mode 100644 index 00000000..2143cf7b --- /dev/null +++ b/dags/roger/pipelines/crdc.py @@ -0,0 +1,19 @@ +"Pipeline for Cancer Commons data" + +from roger.pipelines import DugPipeline +from roger.core import storage + +class CRDCPipeline(DugPipeline): + "Pipeline for Cancer Commons data set" + pipeline_name = "crdc" + parser_name = "crdc" + + def get_objects(self, input_data_path=None): + if not input_data_path: + input_data_path = storage.dug_crdc_path() + files = storage.get_files_recursive( + lambda file_name: ( + not file_name.startswith('GapExchange_') + and file_name.endswith('.xml')), + input_data_path) + return sorted([str(f) for f in files]) diff --git a/dags/roger/pipelines/ctn.py b/dags/roger/pipelines/ctn.py new file mode 100644 index 00000000..25918062 --- /dev/null +++ b/dags/roger/pipelines/ctn.py @@ -0,0 +1,10 @@ +"Pipeline for Clinical trials network data" + +from roger.pipelines import DugPipeline + +class CTNPipeline(DugPipeline): + "Pipeline for Clinical trials nework data set" + pipeline_name = "ctn" + parser_name = "ctn" + + diff --git a/dags/roger/pipelines/db_gap.py b/dags/roger/pipelines/db_gap.py new file mode 100644 index 00000000..7c1db504 --- /dev/null +++ b/dags/roger/pipelines/db_gap.py @@ -0,0 +1,10 @@ +"Dug pipeline for dbGaP data set" + +from roger.pipelines import DugPipeline + +class dbGaPPipeline(DugPipeline): + "Pipeline for the dbGaP data set" + + pipeline_name = 'dbGaP' + parser_name = 'DbGaP' + files_dir = 'db_gap' diff --git a/dags/roger/pipelines/heal_research_programs.py b/dags/roger/pipelines/heal_research_programs.py new file mode 100644 index 00000000..bfec3f83 --- /dev/null +++ b/dags/roger/pipelines/heal_research_programs.py @@ -0,0 +1,16 @@ +"Pipeline for Heal-studies data" + +from roger.pipelines import DugPipeline +from roger.core import storage + +class HealResearchProgramPipeline(DugPipeline): + "Pipeline for Heal-research-programs data set" + pipeline_name = "heal-mds-research-networks" + parser_name = "heal-research" + + def get_objects(self, input_data_path=None): + if not input_data_path: + input_data_path = storage.dug_heal_research_program_path() + files = 
storage.get_files_recursive(lambda file_name: file_name.endswith('.xml'), + input_data_path) + return sorted([str(f) for f in files]) \ No newline at end of file diff --git a/dags/roger/pipelines/heal_studies.py b/dags/roger/pipelines/heal_studies.py new file mode 100644 index 00000000..a08e8115 --- /dev/null +++ b/dags/roger/pipelines/heal_studies.py @@ -0,0 +1,16 @@ +"Pipeline for Heal-studies data" + +from roger.pipelines import DugPipeline +from roger.core import storage + +class HealStudiesPipeline(DugPipeline): + "Pipeline for Heal-studies data set" + pipeline_name = "heal-mds-studies" + parser_name = "heal-studies" + + def get_objects(self, input_data_path=None): + if not input_data_path: + input_data_path = storage.dug_heal_study_path() + files = storage.get_files_recursive(lambda file_name: file_name.endswith('.xml'), + input_data_path) + return sorted([str(f) for f in files]) diff --git a/dags/roger/pipelines/kfdrc.py b/dags/roger/pipelines/kfdrc.py new file mode 100644 index 00000000..bcb0b7ac --- /dev/null +++ b/dags/roger/pipelines/kfdrc.py @@ -0,0 +1,19 @@ +"Pipeline for KDFRC data" + +from roger.pipelines import DugPipeline +from roger.core import storage + +class kfdrcPipeline(DugPipeline): + "Pipeline for KDFRC data set" + pipeline_name = "kfdrc" + parser_name = "kfdrc" + + def get_objects(self, input_data_path=None): + if not input_data_path: + input_data_path = storage.dug_kfdrc_path() + files = storage.get_files_recursive( + lambda file_name: ( + not file_name.startswith('GapExchange_') + and file_name.endswith('.xml')), + input_data_path) + return sorted([str(f) for f in files]) diff --git a/dags/roger/pipelines/nida.py b/dags/roger/pipelines/nida.py new file mode 100644 index 00000000..b2e841bd --- /dev/null +++ b/dags/roger/pipelines/nida.py @@ -0,0 +1,18 @@ +"NIDA data set pipeline definition" + +from roger.pipelines import DugPipeline +from roger.core import storage + +class NIDAPipeline(DugPipeline): + "NIDA data pipeline" + + pipeline_name = 'nida' + parser_name = 'NIDA' + + def get_objects(self, input_data_path=None): + "Return list of NIDA source files" + if not input_data_path: + input_data_path = storage.dug_input_files_path( + self.get_files_dir()) + files = sorted(storage.get_files_recursive(lambda x: 'NIDA-' in x , input_data_path)) + return files diff --git a/dags/roger/pipelines/picsure_test.py b/dags/roger/pipelines/picsure_test.py new file mode 100644 index 00000000..bea4469f --- /dev/null +++ b/dags/roger/pipelines/picsure_test.py @@ -0,0 +1,26 @@ +from roger.pipelines import DugPipeline +from roger.core import storage +from roger.logger import logger + + +class PicSure(DugPipeline): + "Pipeline for BACPAC data set" + pipeline_name = "bdc-test6" #lakefs + parser_name = "dbgap" + + def get_objects(self, input_data_path=None): + """Retrieve anvil objects + + This code is imported from roger.core.storage.dug_anvil_objects + """ + if not input_data_path: + input_data_path = storage.dug_input_files_path( + self.files_dir) + files = storage.get_files_recursive( + lambda file_name: ( + not file_name.startswith('GapExchange_') + and file_name.endswith('.xml')), + input_data_path) + logger.info("**********") + logger.info(files) + return sorted([str(f) for f in files]) \ No newline at end of file diff --git a/dags/roger/pipelines/radx.py b/dags/roger/pipelines/radx.py new file mode 100644 index 00000000..7ffae159 --- /dev/null +++ b/dags/roger/pipelines/radx.py @@ -0,0 +1,18 @@ +"Pipeline for BACPAC data" + +from roger.pipelines import DugPipeline +from 
roger.core import storage + + +class RadxPipeline(DugPipeline): + "Pipeline for Radx data set" + pipeline_name = "radx" + parser_name = "radx" + + def get_objects(self, input_data_path=None): + if not input_data_path: + input_data_path = storage.dug_kfdrc_path() + files = storage.get_files_recursive( + lambda file_name: file_name.endswith('.json'), + input_data_path) + return sorted([str(f) for f in files]) diff --git a/dags/roger/pipelines/sparc.py b/dags/roger/pipelines/sparc.py new file mode 100644 index 00000000..d1c9c950 --- /dev/null +++ b/dags/roger/pipelines/sparc.py @@ -0,0 +1,17 @@ +"Pipeline for Sparc data" + +from roger.pipelines import DugPipeline +from roger.core import storage + +class SparcPipeline(DugPipeline): + "Pipeline for Sparc data set" + pipeline_name = "sparc" + parser_name = "SciCrunch" + + def get_objects(self, input_data_path=None): + if not input_data_path: + input_data_path = storage.dug_heal_study_path() + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) + return sorted([str(f) for f in files]) diff --git a/dags/roger/pipelines/topmed.py b/dags/roger/pipelines/topmed.py new file mode 100644 index 00000000..90b3e515 --- /dev/null +++ b/dags/roger/pipelines/topmed.py @@ -0,0 +1,41 @@ +"Pipeline for Topmed data" + +from roger.pipelines import DugPipeline +from roger.pipelines.base import log, os +import jsonpickle +from roger.core import storage +from roger.logger import logger +class TopmedPipeline(DugPipeline): + "Pipeline for Topmed data set" + pipeline_name = "topmed" + parser_name = "TOPMedTag" + + def get_objects(self, input_data_path=None): + if not input_data_path: + input_data_path = str(storage.dug_input_files_path('topmed')) + files =storage.get_files_recursive( + lambda file_name: file_name.endswith('.csv'), + input_data_path) + return sorted([str(x) for x in files]) + + def make_kg_tagged(self, to_string=False, elements_files=None, + input_data_path=None, output_data_path=None): + "Create tagged knowledge graphs from elements" + log.info("Override base.make_kg_tagged called") + if not output_data_path: + output_data_path = storage.dug_kgx_path("") + storage.clear_dir(output_data_path) + if not elements_files: + elements_files = storage.dug_elements_objects(input_data_path, format='txt') + for file_ in elements_files: + elements = jsonpickle.decode(storage.read_object(file_)) + kg = self.make_tagged_kg(elements) + dug_base_file_name = file_.split(os.path.sep)[-2] + output_file_path = os.path.join(output_data_path, + dug_base_file_name + '_kgx.json') + storage.write_object(kg, output_file_path) + log.info("Wrote %d and %d edges, to %s", len(kg['nodes']), + len(kg['edges']), output_file_path) + output_log = self.log_stream.getvalue() if to_string else '' + return output_log + diff --git a/dags/roger/pvc.yaml b/dags/roger/pvc.yaml new file mode 100644 index 00000000..691fed1b --- /dev/null +++ b/dags/roger/pvc.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: search-data +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 10Mi diff --git a/dags/roger/tasks.py b/dags/roger/tasks.py new file mode 100755 index 00000000..5fe5ff90 --- /dev/null +++ b/dags/roger/tasks.py @@ -0,0 +1,449 @@ +"Tasks and methods related to Airflow implementations of Roger" + +import os + +from airflow.operators.python import PythonOperator +from airflow.operators.empty import EmptyOperator +from airflow.utils.task_group import TaskGroup +from airflow.utils.dates import days_ago +from 
airflow.models import DAG +from airflow.models.dag import DagContext +from airflow.models.taskinstance import TaskInstance +from airflow.operators.bash import BashOperator +from typing import Union +from pathlib import Path +import glob +import shutil + + +from roger.config import config, RogerConfig +from roger.logger import get_logger +from roger.pipelines.base import DugPipeline +from avalon.mainoperations import put_files, LakeFsWrapper, get_files +from lakefs_sdk.configuration import Configuration +from lakefs_sdk.models.merge import Merge +from functools import partial + +logger = get_logger() + +default_args = { + 'owner': 'RENCI', + 'start_date': days_ago(1) +} + + +def task_wrapper(python_callable, **kwargs): + """ + Overrides configuration with config from airflow. + :param python_callable: + :param kwargs: + :return: + """ + # get dag config provided + dag_run = kwargs.get('dag_run') + pass_conf = kwargs.get('pass_conf', True) + if config.lakefs_config.enabled: + # get input path + input_data_path = generate_dir_name_from_task_instance(kwargs['ti'], + roger_config=config, + suffix='input') + # get output path from task id run id dag id combo + output_data_path = generate_dir_name_from_task_instance(kwargs['ti'], + roger_config=config, + suffix='output') + else: + input_data_path, output_data_path = None, None + # cast it to a path object + func_args = { + 'input_data_path': input_data_path, + 'output_data_path': output_data_path, + 'to_string': kwargs.get('to_string') + } + logger.info(f"Task function args: {func_args}") + # overrides values + config.dag_run = dag_run + if pass_conf: + return python_callable(config=config, **func_args) + return python_callable(**func_args) + +def get_executor_config(data_path='/opt/airflow/share/data'): + """ Get an executor configuration. + :param annotations: Annotations to attach to the executor. + :returns: Returns a KubernetesExecutor if K8s configured, None otherwise. + """ + env_var_prefix = config.OS_VAR_PREFIX + # based on environment set on scheduler pod, make secrets for worker pod + # this ensures passwords don't leak as pod templates. 
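The `task_wrapper` helper above always invokes the wrapped callable with the same keyword arguments it assembles in `func_args` (`input_data_path`, `output_data_path`, `to_string`, plus `config` when `pass_conf` is set). A minimal sketch of a compatible callable, for illustration only; the function name and body are hypothetical:

```python
# Hypothetical callable matching the keyword arguments task_wrapper builds
# in func_args; a real task would call DugPipeline methods here instead.
def example_task(config=None, input_data_path=None,
                 output_data_path=None, to_string=False):
    lines = [f"reading from {input_data_path}",
             f"writing to {output_data_path}"]
    # ... do the actual work ...
    return "\n".join(lines) if to_string else ""
```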
+ secrets_map = [{ + "secret_name_ref": "ELASTIC_SEARCH_PASSWORD_SECRET", + "secret_key_ref": "ELASTIC_SEARCH_PASSWORD_SECRET_KEY", + "env_var_name": f"{env_var_prefix}ELASTIC__SEARCH_PASSWORD" + },{ + "secret_name_ref": "REDIS_PASSWORD_SECRET", + "secret_key_ref": "REDIS_PASSWORD_SECRET_KEY", + "env_var_name": f"{env_var_prefix}REDISGRAPH_PASSWORD" + }] + secrets = [] + for secret in secrets_map: + secret_name = os.environ.get(secret["secret_name_ref"], False) + secret_key_name = os.environ.get(secret["secret_key_ref"], False) + if secret_name and secret_key_name: + secrets.append({ + "name": secret["env_var_name"], + "valueFrom": { + "secretKeyRef": { + "name": secret_name, + "key": secret_key_name + } + }}) + + k8s_executor_config = { + "KubernetesExecutor": { + "envs": secrets, + } + } + return k8s_executor_config + +def init_lakefs_client(config: RogerConfig) -> LakeFsWrapper: + configuration = Configuration() + configuration.username = config.lakefs_config.access_key_id + configuration.password = config.lakefs_config.secret_access_key + configuration.host = config.lakefs_config.host + the_lake = LakeFsWrapper(configuration=configuration) + return the_lake + + +def pagination_helper(page_fetcher, **kwargs): + """Helper function to iterate over paginated results""" + while True: + resp = page_fetcher(**kwargs) + yield from resp.results + if not resp.pagination.has_more: + break + kwargs['after'] = resp.pagination.next_offset + + +def avalon_commit_callback(context: DagContext, **kwargs): + client: LakeFsWrapper = init_lakefs_client(config=config) + # now files have been processed, + # this part should + # get the out path of the task + local_path = str(generate_dir_name_from_task_instance(context['ti'], + roger_config=config, + suffix='output')).rstrip('/') + '/' + task_id = context['ti'].task_id + dag_id = context['ti'].dag_id + run_id = context['ti'].run_id + # run id looks like 2023-10-18T17:35:14.890186+00:00 + # normalized to 2023_10_18T17_35_14_890186_00_00 + # since lakefs branch id must consist of letters, digits, underscores and dashes, + # and cannot start with a dash + run_id_normalized = run_id.replace('-','_').replace(':','_').replace('+','_').replace('.','_') + dag_id_normalized = dag_id.replace('-','_').replace(':','_').replace('+','_').replace('.','_') + task_id_normalized = task_id.replace('-','_').replace(':','_').replace('+','_').replace('.','_') + temp_branch_name = f'{dag_id_normalized}_{task_id_normalized}_{run_id_normalized}' + # remote path to upload the files to. + remote_path = f'{dag_id}/{task_id}/' + + # merge destination branch + branch = config.lakefs_config.branch + repo = config.lakefs_config.repo + # This part pushes to a temp branch on the repo + + # now we have the output path lets do some pushing but where ? + # right now lets stick to using one repo , + + # issue Vladmir pointed out if uploads to a single lakefs branch have not + # been finalized with commit, + # this would cause dirty commits if parallel tasks target the same branch. + + # solution: Lakefs team suggested we commit to a different temp branch per + # task, and merge that branch. + # this callback function will do that for now. + + # 1. put files into a temp branch. + # 2. make sure a commit happens. + # 3. merge that branch to master branch. 
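The `pagination_helper` defined above flattens paginated lakeFS API responses; the loop over `diff_refs` below relies on it. A self-contained sketch of the contract it assumes (the fake fetcher is purely illustrative):

```python
# Sketch of the response shape pagination_helper expects: an object exposing
# .results plus .pagination.has_more and .pagination.next_offset.
from types import SimpleNamespace

def fake_fetcher(after=0, **_):
    page = list(range(after, min(after + 2, 5)))
    return SimpleNamespace(
        results=page,
        pagination=SimpleNamespace(has_more=(after + 2) < 5,
                                   next_offset=after + 2))

# list(pagination_helper(fake_fetcher, after=0)) == [0, 1, 2, 3, 4]
```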
+ logger.info("Pushing local path %s to %s@%s in %s dir", + local_path, repo, temp_branch_name, remote_path) + put_files( + local_path=local_path, + remote_path=remote_path, + task_name=task_id, + task_args=[""], + pipeline_id=dag_id, + task_docker_image="docker-image", + s3storage=False, + lake_fs_client=client, + branch=temp_branch_name, + repo=repo, + # @TODO figure out how to pass real commit id here + commit_id=branch, + source_branch_name=branch + ) + + # see what changes are going to be pushed from this branch to main branch + for diff in pagination_helper(client._client.refs_api.diff_refs, + repository=repo, left_ref=branch, + right_ref=temp_branch_name): + logger.info("Diff: " + str(diff)) + + try: + # merging temp branch to working branch + # the current working branch wins incase of conflicts + merge = Merge(**{"strategy": "source-wins"}) + client._client.refs_api.merge_into_branch(repository=repo, + source_ref=temp_branch_name, + destination_branch=branch, + merge=merge + ) + + logger.info(f"merged branch {temp_branch_name} into {branch}") + except Exception as e: + # remove temp + logger.error(e) + # delete temp branch + finally: + client._client.branches_api.delete_branch( + repository=repo, + branch=temp_branch_name + ) + + logger.info(f"deleted temp branch {temp_branch_name}") + logger.info(f"deleting local dir {local_path}") + files_to_clean = glob.glob(local_path + '**', recursive=True) + [local_path] + + clean_up(context, **kwargs) + +def clean_up(context: DagContext, **kwargs): + input_dir = str(generate_dir_name_from_task_instance(context['ti'], + roger_config=config, + suffix='output')).rstrip('/') + '/' + output_dir = str(generate_dir_name_from_task_instance(context['ti'], + roger_config=config, + suffix='input')).rstrip('/') + '/' + files_to_clean = glob.glob(input_dir + '**', recursive=True) + [input_dir] + files_to_clean += glob.glob(output_dir + '**', recursive=True) + [output_dir] + for f in files_to_clean: + if os.path.exists(f): + shutil.rmtree(f) + +def generate_dir_name_from_task_instance(task_instance: TaskInstance, + roger_config: RogerConfig, suffix:str): + # if lakefs is not enabled just return none so methods default to using + # local dir structure. + if not roger_config.lakefs_config.enabled: + return None + root_data_dir = os.getenv("ROGER_DATA_DIR").rstrip('/') + task_id = task_instance.task_id + dag_id = task_instance.dag_id + run_id = task_instance.run_id + try_number = task_instance._try_number + return Path( + f"{root_data_dir}/{dag_id}_{task_id}_{run_id}_{try_number}_{suffix}") + +def setup_input_data(context, exec_conf): + logger.info(""" + - Figures out the task name and id, + - find its data dependencies + - clean up and create in and out dir + - put dependency data in input dir + - if for some reason data was not found raise an exception + """) + # Serves as a location where files the task will work on are placed. + # computed as ROGER_DATA_DIR + /current task instance name_input_dir + + input_dir = str(generate_dir_name_from_task_instance( + context['ti'], roger_config=config, suffix="input")) + # Clear up files from previous run etc... + + # create input dir + os.makedirs(input_dir, exist_ok=True) + + # Download files from lakefs and store them in this new input_path + client = init_lakefs_client(config=config) + repos = exec_conf['repos'] + # if no external repo is provided we assume to get the upstream task dataset. 
+ if not repos or len(repos) == 0: + # merge destination branch + branch = config.lakefs_config.branch + repo = config.lakefs_config.repo + task_instance: TaskInstance = context['ti'] + # get upstream ids + upstream_ids = task_instance.task.upstream_task_ids + dag_id = task_instance.dag_id + # calculate remote dirs using dag_id + upstreams + repos = [{ + 'repo': repo, + 'branch': branch, + 'path': f'{dag_id}/{upstream_id}' + } for upstream_id in upstream_ids] + + # input_repo = exec_conf['input_repo'] + # input_branch = exec_conf['input_branch'] + # If input repo is provided use that as source of files + for repo in repos: + if not repo.get('path'): + # get all if path is not specified + repo['path'] = '*' + logger.info(f"repos : {repos}") + for r in repos: + logger.info("downloading %s from %s@%s to %s", + r['path'], r['repo'], r['branch'], input_dir) + # create path to download to ... + if not os.path.exists(input_dir + f'/{r["repo"]}'): + os.mkdir(input_dir + f'/{r["repo"]}') + get_files( + local_path=input_dir + f'/{r["repo"]}', + remote_path=r['path'], + branch=r['branch'], + repo=r['repo'], + changes_only=False, + lake_fs_client=client + ) + + +def create_python_task(dag, name, a_callable, func_kwargs=None, external_repos = {}, pass_conf=True, no_output_files=False): + """ Create a python task. + :param func_kwargs: additional arguments for callable. + :param dag: dag to add task to. + :param name: The name of the task. + :param a_callable: The code to run in this task. + """ + + # these are actual arguments passed down to the task function + op_kwargs = { + "python_callable": a_callable, + "to_string": True, + "pass_conf": pass_conf + } + # update / override some of the args passed to the task function by default + if func_kwargs is None: + func_kwargs = {} + op_kwargs.update(func_kwargs) + + + # Python operator arguments , by default for non-lakefs config this is all we need. + python_operator_args = { + "task_id": name, + "python_callable":task_wrapper, + # "executor_config" : get_executor_config(), + "dag": dag, + "provide_context" : True + } + + # if we have lakefs... + if config.lakefs_config.enabled: + + # repo and branch for pre-execution , to download input objects + pre_exec_conf = { + 'repos': [] + } + if external_repos: + # if the task is a root task , beginning of the dag... + # and we want to pull data from a different repo. + pre_exec_conf = { + 'repos': [{ + 'repo': r['name'], + 'branch': r['branch'], + 'path': r.get('path', '*') + } for r in external_repos] + } + + pre_exec = partial(setup_input_data, exec_conf=pre_exec_conf) + # add pre_exec partial function as an argument to python executor conf + python_operator_args['pre_execute'] = pre_exec + python_operator_args['on_failure_callback'] = partial(clean_up, kwargs=op_kwargs) + # if the task has output files, we will add a commit callback + if not no_output_files: + python_operator_args['on_success_callback'] = partial(avalon_commit_callback, kwargs=op_kwargs) + + # add kwargs + python_operator_args["op_kwargs"] = op_kwargs + + return PythonOperator(**python_operator_args) + +def create_pipeline_taskgroup( + dag, + pipeline_class: type, + configparam: RogerConfig, + **kwargs): + """Emit an Airflow dag pipeline for the specified pipeline_class + + Extra kwargs are passed to the pipeline class init call. 
+ """ + name = pipeline_class.pipeline_name + input_dataset_version = pipeline_class.input_version + + with TaskGroup(group_id=f"{name}_dataset_pipeline_task_group") as tg: + with pipeline_class(config=configparam, **kwargs) as pipeline: + pipeline: DugPipeline + annotate_task = create_python_task( + dag, + f"annotate_{name}_files", + pipeline.annotate, + external_repos=[{ + 'name': getattr(pipeline_class, 'pipeline_name'), + 'branch': input_dataset_version + }], + pass_conf=False) + + index_variables_task = create_python_task( + dag, + f"index_{name}_variables", + pipeline.index_variables, + pass_conf=False, + # declare that this task will not generate files. + no_output_files=True) + index_variables_task.set_upstream(annotate_task) + + validate_index_variables_task = create_python_task( + dag, + f"validate_{name}_index_variables", + pipeline.validate_indexed_variables, + pass_conf=False, + # declare that this task will not generate files. + no_output_files=True + ) + validate_index_variables_task.set_upstream([annotate_task, index_variables_task]) + + make_kgx_task = create_python_task( + dag, + f"make_kgx_{name}", + pipeline.make_kg_tagged, + pass_conf=False) + make_kgx_task.set_upstream(annotate_task) + + crawl_task = create_python_task( + dag, + f"crawl_{name}", + pipeline.crawl_tranql, + pass_conf=False) + crawl_task.set_upstream(annotate_task) + + index_concepts_task = create_python_task( + dag, + f"index_{name}_concepts", + pipeline.index_concepts, + pass_conf=False, + # declare that this task will not generate files. + no_output_files=True) + index_concepts_task.set_upstream(crawl_task) + + validate_index_concepts_task = create_python_task( + dag, + f"validate_{name}_index_concepts", + pipeline.validate_indexed_concepts, + pass_conf=False, + # declare that this task will not generate files. 
+ no_output_files=True + ) + validate_index_concepts_task.set_upstream([crawl_task, index_concepts_task, annotate_task]) + + + complete_task = EmptyOperator(task_id=f"complete_{name}") + complete_task.set_upstream( + (make_kgx_task, + validate_index_variables_task, validate_index_concepts_task)) + + return tg diff --git a/dags/test_metadata.yaml b/dags/test_metadata.yaml new file mode 100644 index 00000000..54d508c4 --- /dev/null +++ b/dags/test_metadata.yaml @@ -0,0 +1,124 @@ +# This is a file that lists the data to be used for testing purposes +# It contains a reduced set of the metadata.yaml file +kgx: + versions: + - files: + - biolink-v1.0.json + - ctd-v1.0.json + - gtopdb-v1.0.json + - hetio-v1.0.json + - hgnc-v1.0.json + - hmdb-v1.0.json + - kegg-v1.0.json + - mychem-v1.0.json + - ontological-hierarchy-v1.0.json + - panther-v1.0.json + - foodb-v1.0.json + - pharos-v1.0.json + - intact-v1.0.json + - human-goa-v1.0.json + - uberongraph-v1.0.json + - viral-proteome-v1.0.json + version: v1.0 + name: baseline-graph + format: json + - files: + - biolink-v2.0.json + - ctd-v2.0.json + - gtopdb-v2.0.json + - hetio-v2.0.json + - hgnc-v2.0.json + - hmdb-v2.0.json + - kegg-v2.0.json + - mychem-v2.0.json + - ontological-hierarchy-v2.0.json + - panther-v2.0.json + - foodb-v2.0.json + - pharos-v2.0.json + - intact-v2.0.json + - human-goa-v2.0.json + - uberongraph-v2.0.json + - viral-proteome-v2.0.json + version: v2.0 + name: baseline-graph + format: json + - files: + - heal/sparc/curation-export-processed.json + version: v2.0 + name: sparc-kgx + format: json + - files: + - Biolink_edges_v3.0.jsonl + - Biolink_nodes_v3.0.jsonl + - CTD_edges_v3.0.jsonl + - CTD_nodes_v3.0.jsonl + - DrugCentral_edges_v3.0.jsonl + - DrugCentral_nodes_v3.0.jsonl + - GtoPdb_edges_v3.0.jsonl + - GtoPdb_nodes_v3.0.jsonl + - Hetio_edges_v3.0.jsonl + - Hetio_nodes_v3.0.jsonl + - HGNC_edges_v3.0.jsonl + - HGNC_nodes_v3.0.jsonl + - HMDB_edges_v3.0.jsonl + - HMDB_nodes_v3.0.jsonl + - HumanGOA_edges_v3.0.jsonl + - HumanGOA_nodes_v3.0.jsonl + - IntAct_edges_v3.0.jsonl + - IntAct_nodes_v3.0.jsonl + - OntologicalHierarchy_edges_v3.0.jsonl + - OntologicalHierarchy_nodes_v3.0.jsonl + - PANTHER_edges_v3.0.jsonl + - PANTHER_nodes_v3.0.jsonl + - PHAROS_edges_v3.0.jsonl + - PHAROS_nodes_v3.0.jsonl + - UberGraph_edges_v3.0.jsonl + - UberGraph_nodes_v3.0.jsonl + version: v3.0 + name: baseline-graph + format: jsonl + - version: test + files: + - hgnc_nodes.jsonl + - hgnc_edges.jsonl + name: test + - version: v3.0 + name: cde-graph + format: jsonl + files: + - cde/annotated_edges_v3.0.jsonl + - cde/annotated_nodes_v3.0.jsonl +dug_inputs: + versions: + - name: bdc + version: v1.0 + files: + s3: + - "bdc/v1.0/bdc_dbgap_data_dicts.tar.gz" + stars: + - "bdc_dbgap_data_dicts.tar.gz" + format: dbGaP + - name: nida + version: v1.0 + files: + s3: + - "nida/v1.0/nida-12studies.tar.gz" + stars: + - "nida-12studies.tar.gz" + format: nida + - name: sparc + version: v1.0 + files: + s3: + - "sparc/v1.0/sparc-dbgap-xml-formatted.tar.gz" + stars: + - "sparc-dbgap-xml-formatted.tar.gz" + format: sparc + - name: anvil + version: v1.0 + files: + s3: + - "bdc/v1.0/anvil_dbgap_data_dicts.tar.gz" + stars: + - "anvil_dbgap_data_dicts.tar.gz" + format: anvil \ No newline at end of file diff --git a/dags/utils/__init__.py b/dags/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/dags/utils/s3_utils.py b/dags/utils/s3_utils.py new file mode 100644 index 00000000..f0f7277b --- /dev/null +++ b/dags/utils/s3_utils.py @@ -0,0 +1,45 @@ +from 
contextlib import contextmanager + +import boto3 + +from roger.config import S3Config + + +class S3Utils: + + def __init__( + self, + s3_config: S3Config + ): + self.config = s3_config + + @contextmanager + def connect( + self, + ): + session = boto3.session.Session( + aws_access_key_id=self.config.access_key, + aws_secret_access_key=self.config.secret_key, + ) + + s3 = session.resource( + 's3', + endpoint_url=self.config.host, + ) + bucket = s3.Bucket(self.config.bucket) + yield bucket + + def get(self, remote_file_name: str, local_file_name: str): + with self.connect() as bucket: + bucket.download_file(remote_file_name, local_file_name) + + def put(self, local_file_name: str, remote_file_name: str): + with self.connect() as bucket: + bucket.upload_file(local_file_name, remote_file_name) + + def ls(self): + with self.connect() as bucket: + return [ + obj + for obj in bucket.objects.all() + ] diff --git a/docker-compose.yaml b/docker-compose.yaml new file mode 100644 index 00000000..7c698ed6 --- /dev/null +++ b/docker-compose.yaml @@ -0,0 +1,207 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +# Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. +# +# WARNING: This configuration is for local development. Do not use it in a production deployment. +# +# This configuration supports basic configuration using environment variables or an .env file +# The following variables are supported: +# +# AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. +# Default: apache/airflow:master-python3.8 +# AIRFLOW_UID - User ID in Airflow containers +# Default: 50000 +# AIRFLOW_GID - Group ID in Airflow containers +# Default: 50000 +# _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account. +# Default: airflow +# _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account. +# Default: airflow +# +# Feel free to modify this file to suit your needs. +--- +version: '3' +x-airflow-common: + &airflow-common + build: + dockerfile: Dockerfile + context: . 
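A brief usage sketch for the `S3Utils` wrapper in `dags/utils/s3_utils.py` above. It assumes `S3Config` can be constructed with the same field names the wrapper reads (`access_key`, `secret_key`, `host`, `bucket`) and that `dags/` is on `PYTHONPATH`; the endpoint, bucket, and object keys are placeholders:

```python
# Hypothetical usage of S3Utils; endpoint, bucket, and keys are placeholders.
from roger.config import S3Config
from utils.s3_utils import S3Utils

s3 = S3Utils(S3Config(access_key="ACCESS", secret_key="SECRET",
                      host="https://s3.example.org", bucket="roger-data"))
s3.get("bdc/v1.0/bdc_dbgap_data_dicts.tar.gz",
       "/tmp/bdc_dbgap_data_dicts.tar.gz")   # download one object
print([obj.key for obj in s3.ls()])          # list everything in the bucket
```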
+ environment: + &airflow-common-env + AIRFLOW__CORE__EXECUTOR: CeleryExecutor + AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow + AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow + AIRFLOW__CELERY__BROKER_URL: redis://:$REDIS_PASSWORD@redis:$REDIS_PORT/0 + AIRFLOW__CORE__FERNET_KEY: '' + AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' + AIRFLOW__CORE__LOAD_EXAMPLES: 'false' + + ROGER_DUG__INPUTS_DATA__SETS: "$ROGER_DUG__INPUTS_DATA__SETS" + ROGER_ELASTICSEARCH_HOST: "$ELASTIC_API_HOST" + ROGER_ELASTICSEARCH_PASSWORD: "$ELASTIC_PASSWORD" + ROGER_ELASTICSEARCH_NBOOST__HOST: "$NBOOST_API_HOST" + ROGER_REDISGRAPH_HOST: "$REDIS_HOST" + ROGER_REDISGRAPH_PASSWORD: "$REDIS_PASSWORD" + ROGER_KGX_DATASET__VERSION: "v3.0" + ROGER_DATA_DIR: "/opt/airflow/share/data" + volumes: + - ./dags:/opt/airflow/dags + - ./logs:/opt/airflow/logs + - ./plugins:/opt/airflow/plugins + - ./data:/opt/airflow/share/data + user: root + depends_on: + redis: + condition: service_healthy + postgres: + condition: service_healthy + +services: + postgres: + image: postgres:13 + environment: + POSTGRES_USER: airflow + POSTGRES_PASSWORD: airflow + POSTGRES_DB: airflow + volumes: + - postgres-db-volume:/var/lib/postgresql/data + - ${DATA_DIR}/elastic:/elastic + - ${DATA_DIR}/redis:/redis + healthcheck: + test: ["CMD", "pg_isready", "-U", "airflow"] + interval: 5s + retries: 5 + restart: always + + airflow-webserver: + <<: *airflow-common + command: webserver + ports: + - 8080:8080 + healthcheck: + test: ["CMD", "curl", "--fail", "http://localhost:8080/health"] + interval: 10s + timeout: 10s + retries: 5 + restart: always + + airflow-scheduler: + <<: *airflow-common + command: scheduler + restart: always + + airflow-worker: + <<: *airflow-common + command: celery worker + restart: always + + airflow-init: + <<: *airflow-common + command: version + environment: + <<: *airflow-common-env + _AIRFLOW_DB_UPGRADE: 'true' + _AIRFLOW_WWW_USER_CREATE: 'true' + _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} + _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} + + flower: + <<: *airflow-common + command: celery flower + ports: + - 5555:5555 + healthcheck: + test: ["CMD", "curl", "--fail", "http://localhost:5555/"] + interval: 10s + timeout: 10s + retries: 5 + restart: always + + redis: + # image: redislabs/redisgraph:2.10.9 #Alternative Image + user: root + image: 'redis/redis-stack:6.2.4-v2' + command: "redis-server --requirepass $REDIS_PASSWORD --loadmodule /opt/redis-stack/lib/redisgraph.so" + environment: + - REDIS_ARGS=--requirepass $REDIS_PASSWORD + volumes: + - $DATA_DIR/redis:/data # FIX RDB Error on local + ports: + - $REDIS_PORT:$REDIS_PORT + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 5s + timeout: 30s + retries: 50 + restart: always + + dug: + image: containers.renci.org/helxplatform/dug:latest + depends_on: + - elasticsearch + - redis + restart: always + environment: + ELASTIC_API_HOST: "$ELASTIC_API_HOST" + ELASTIC_PASSWORD: "$ELASTIC_PASSWORD" + REDIS_HOST: "$REDIS_HOST" + REDIS_PASSWORD: "$REDIS_PASSWORD" + FLASK_ENV: "development" + PYTHONUNBUFFERED: "TRUE" + entrypoint: [ "gunicorn", + "--workers=$API_WORKERS", "--name=dug", + "--bind=0.0.0.0:$API_PORT", "--timeout=$API_TIMEOUT", + "--log-level=DEBUG", "-k", "uvicorn.workers.UvicornWorker", "--reload", "dug.server:APP"] + ports: + - $API_PORT:$API_PORT + + elasticsearch: + user: root + image: docker.elastic.co/elasticsearch/elasticsearch:8.5.2 
+ environment: + - ELASTIC_PASSWORD=$ELASTIC_PASSWORD + - discovery.type=single-node + - xpack.security.enabled=true + - ingest.geoip.downloader.enabled=false + volumes: + - $DATA_DIR/elastic:/usr/share/elasticsearch/data + ports: + - '9200:9200' + - '9300:9300' + + tranql: + image: containers.renci.org/helxplatform/tranql:rti-merge + ports: + - '8001:8001' + entrypoint: [ + "gunicorn", + "--workers=4", + "--bind=0.0.0.0:8001", + "--timeout=300", + "--access-logfile=$TRANQL_ACCESS_LOG", + "--error-logfile=$TRANQL_ERROR_LOG", + "--log-level=debug", + "tranql.api:app", + ] + environment: + - REDIS_PASSWORD=$REDIS_PASSWORD + volumes: + - ./tranql-schema.yaml:/tranql/tranql/conf/schema.yaml +volumes: + postgres-db-volume: \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 0fe342a6..8b6ca0b2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,189 +1,14 @@ -alabaster==0.7.12 -alembic==1.4.2 -antlr4-python3-runtime==4.8 -apache-airflow==1.10.12 -apispec==1.3.3 -appnope==0.1.0 -argcomplete==1.12.0 -argon2-cffi==20.1.0 -async-generator==1.10 -attrs==19.3.0 -Babel==2.8.0 -backcall==0.2.0 -biolink-model==1.2.5 -biolinkml==1.5.8 -bleach==3.2.1 -bmt==0.1.1 -cached-property==1.5.1 -cachetools==4.1.1 -cattrs==1.0.0 -certifi==2020.6.20 -cffi==1.14.3 -CFGraph==0.2.1 -chardet==3.0.4 -click==7.1.2 -colorama==0.4.3 -colorlog==4.0.2 -configparser==3.5.3 -croniter==0.3.34 -decorator==4.4.2 -defusedxml==0.6.0 -dill==0.3.2 -dnspython==1.16.0 -docker==4.3.1 -docutils==0.16 -email-validator==1.1.1 -entrypoints==0.3 -env==0.1.0 -Flask==1.1.2 -Flask-Admin==1.5.4 -Flask-AppBuilder==2.3.4 -Flask-Babel==1.0.0 -Flask-Caching==1.3.3 -Flask-JWT-Extended==3.24.1 -Flask-Login==0.4.1 -Flask-OpenID==1.2.5 -Flask-SQLAlchemy==2.4.4 -flask-swagger==0.2.14 -Flask-WTF==0.14.3 -funcsigs==1.0.2 -future==0.18.2 -graphviz==0.14.1 -gunicorn==20.0.4 -idna==2.10 -imagesize==1.2.0 -importlib-metadata==1.7.0 -iniconfig==1.1.1 -ipykernel==5.3.4 -ipython==7.18.1 -ipython-genutils==0.2.0 -ipywidgets==7.5.1 -iso8601==0.1.12 -isodate==0.6.0 -itsdangerous==1.1.0 -jedi==0.17.2 -Jinja2==2.11.2 -json-merge-patch==0.2 -jsonasobj==1.2.1 -jsonlines==1.2.0 -jsonschema==3.2.0 -jupyter==1.0.0 -jupyter-client==6.1.7 -jupyter-console==6.2.0 -jupyter-core==4.6.3 -jupyterlab-pygments==0.1.2 -kgx==0.1.0 -lazy-object-proxy==1.5.1 -lockfile==0.12.2 -Mako==1.1.3 -Markdown==2.6.11 -MarkupSafe==1.1.1 -marshmallow==2.21.0 -marshmallow-enum==1.5.1 -marshmallow-sqlalchemy==0.23.1 -mistune==0.8.4 -mypy==0.790 -mypy-extensions==0.4.3 -natsort==7.0.1 -nbclient==0.5.1 -nbconvert==6.0.7 -nbformat==5.0.8 -neo4jrestclient==2.1.1 -nest-asyncio==1.4.1 -networkx==2.5 -notebook==6.1.4 -numpy==1.19.1 -ordered-set==4.0.2 -packaging==20.4 -pandas==1.1.0 -pandocfilters==1.4.2 -parso==0.7.1 -pathlib==1.0.1 -pathtools==0.1.2 -pbr==5.5.0 -pendulum==1.4.4 -pexpect==4.8.0 -pickleshare==0.7.5 -pluggy==0.13.1 -prefixcommons==0.1.9 -prison==0.1.3 -prologterms==0.0.6 -prometheus-client==0.8.0 -prompt-toolkit==3.0.8 -psutil==5.7.2 -PTable==0.9.2 -ptyprocess==0.6.0 -py==1.9.0 -pycparser==2.20 -Pygments==2.6.1 -PyJSG==0.10.0 -PyJWT==1.7.1 -pyparsing==2.4.7 -pyrsistent==0.16.0 -PyShEx==0.7.14 -PyShExC==0.8.2 -pystache==0.5.4 -pytest==6.1.1 -python-daemon==2.2.4 -python-dateutil==2.8.1 -python-editor==1.0.4 -python-nvd3==0.15.0 -python-slugify==4.0.1 -python3-openid==3.2.0 -pytz==2020.1 -pytzdata==2020.1 -PyYAML==5.3.1 -pyzmq==19.0.2 -qtconsole==4.7.7 -QtPy==1.9.0 -rdflib==5.0.0 -rdflib-jsonld==0.5.0 -redis==3.5.3 -redisgraph==2.1.5 
-redisgraph-bulk-loader==0.9.3 -requests==2.24.0 -Send2Trash==1.5.0 -setproctitle==1.1.10 -ShExJSG==0.7.0 -six==1.15.0 -snowballstemmer==2.0.0 -sparql-slurper==0.3.4 -SPARQLWrapper==1.8.5 -Sphinx==3.2.1 -sphinx-click==2.5.0 -sphinx-rtd-theme==0.5.0 -sphinxcontrib-applehelp==1.0.2 -sphinxcontrib-devhelp==1.0.2 -sphinxcontrib-htmlhelp==1.0.3 -sphinxcontrib-jsmath==1.0.1 -sphinxcontrib-qthelp==1.0.3 -sphinxcontrib-serializinghtml==1.1.4 -SQLAlchemy==1.3.18 -SQLAlchemy-JSONField==0.9.0 -SQLAlchemy-Utils==0.36.8 -stringcase==1.2.0 -tabulate==0.8.7 -tenacity==4.12.0 -terminado==0.9.1 -terminaltables==3.1.0 -testpath==0.4.4 -text-unidecode==1.3 -thrift==0.13.0 -toml==0.10.1 -tornado==6.0.4 -traitlets==5.0.5 -typed-ast==1.4.1 -typing-extensions==3.7.4.2 -tzlocal==1.5.1 -unicodecsv==0.14.1 -urllib3==1.25.10 -validators==0.18.1 -watchdog==0.10.3 -wcwidth==0.2.5 -webencodings==0.5.1 -websocket-client==0.57.0 -Werkzeug==0.16.1 -widgetsnbextension==3.5.1 -WTForms==2.3.3 -zipp==3.1.0 -zope.deprecation==4.4.0 +elasticsearch==8.5.2 +flatten-dict +jsonpickle +git+https://github.com/falkordb/falkordb-bulk-loader.git@v1.0.6 +setuptools>=66 +pytest +PyYAML +git+https://github.com/helxplatform/dug@2.13.11 +orjson==3.9.15 +git+https://github.com/helxplatform/kg_utils.git@v0.0.10 +git+https://github.com/helxplatform/python-stringcase@1.2.1 +bmt==1.4.4 +git+https://github.com/helxplatform/avalon.git@v1.1.0 +h11>=0.16.0 diff --git a/roger-cli-steps.md b/roger-cli-steps.md new file mode 100644 index 00000000..8e132746 --- /dev/null +++ b/roger-cli-steps.md @@ -0,0 +1,27 @@ +# Deployment with Roger CLI + +## QUICK Local Set Up + +This is list steps to produce a local deployment of Roger. This set up does NOT use airflow and instead only uses the Roger CLI via **Makefile** commands. + +### Prerequsite Steps + +- Set up Roger dependencies by ensuring that the `.env` has all the correct information. 
+- Run the following docker compose commands + - `docker compose up tranql -d`: starts up tranql which is the API handlerfor redis graph in the `graph` stage + - `docker compose up redis -d`: starts up redis which will be used via redis graph for the `graph` stage + - `docker compose up dug -d`: starts up dug API to work as the API handler for elastic search in the `index` stage + - `docker compose up elasticsearch -d`: starts up elastic search for the `index` stage + +### Roger CLI Steps + +1) `python3 -m venv ~/.environments/roger` +2) `source ~/.environments/roger/bin/activate` +3) `pip install -r requirements.txt` +4) `export PYTHONPATH=$PWD/dags` +5) Change the elasticsearch and redisgraph `host` values to localhost in `dags/roger/config/config.yaml` +6) Get the S3 Bucket credentials (access_key, bucket, host, secret_key) and export them as environment variables with ROGER_S3_ in the front of the value like: `ROGER_S3_ACCESS__KEY=XXXXKEYXXXX` +7) `cd bin/` and here either run `make all` OR separate the commands into three steps: + 1) `make annotate`: executes the CLI related commands found in `bin/dug_annotate/Makefile` + 2) `make graph`: executes the CLI related commands found in `bin/roger_graph_build/Makefile` + 3) `make index`: executes the CLI related commands found in `bin/dug_index/Makefile` diff --git a/roger/config.yaml b/roger/config.yaml deleted file mode 100644 index f1377a3c..00000000 --- a/roger/config.yaml +++ /dev/null @@ -1,56 +0,0 @@ -redisgraph: - username: "" - password: "" - host: localhost - graph: test - ports: - http: 6379 - -logging: - level: DEBUG - format: '[%(name)s][%(filename)s][%(funcName)20s] %(levelname)s: %(message)s' - -data_root: roger/data -base_data_uri: https://stars.renci.org/var/kgx_data - -#https://github.com/RedisGraph/redisgraph-bulk-loader/blob/master/redisgraph_bulk_loader/bulk_insert.py#L43 -bulk_loader: - separator: "|" - enforce_schema: False - skip_invalid_nodes: False - skip_invalid_edges: False - quote: 0 - max_token_count: 1024 - max_buffer_size: 2048 - max_token_size: 500 - index: [] - full_text_index: [] - -validation: - queries: - count_nodes: - name: "Count Nodes" - query: "MATCH (a) RETURN COUNT(a)" - count_edges: - name: "Count Edges" - query: "MATCH (a)-[e]-(b) RETURN COUNT(e)" - connectivity: - name: TOPMED Connectivity - query: "MATCH (a { id : '$var' })--(b) RETURN a.category, b.id" - args: - - var: TOPMED.TAG:8 - - var: TOPMED.VAR:phv00000484.v1.p10 - - var: TOPMED.VAR:phv00000487.v1.p10 - - var: TOPMED.VAR:phv00000496.v1.p10 - - var: TOPMED.VAR:phv00000517.v1.p10 - - var: TOPMED.VAR:phv00000518.v1.p10 - - var: TOPMED.VAR:phv00000528.v1.p10 - - var: TOPMED.VAR:phv00000529.v1.p10 - - var: TOPMED.VAR:phv00000530.v1.p10 - - var: TOPMED.VAR:phv00000531.v1.p10 - count_connected_nodes: - name: Count Connected Nodes - query: "MATCH (a)-[e]-(b) RETURN count(a), count(b)" - query_by_type: - name: Query by Type - query: "MATCH (a:gene)-[e]-(b) WHERE 'chemical_substance' IN b.category RETURN count(distinct(a)), count(distinct(b))" diff --git a/roger/core.py b/roger/core.py deleted file mode 100644 index 3d354071..00000000 --- a/roger/core.py +++ /dev/null @@ -1,688 +0,0 @@ -import argparse -import glob -import json -import os -import redis -import requests -import shutil -import time -import yaml -import sys -import traceback -from biolink import model -from collections import defaultdict -from enum import Enum -from io import StringIO -from kgx.cli import redisgraph_upload -from roger.roger_util import get_logger, get_config -from 
redisgraph_bulk_loader.bulk_insert import bulk_insert -from roger.roger_db import RedisGraph -from string import Template - -log = get_logger () -config = get_config () -data_root = config['data_root'] - -class SchemaType(Enum): - """ High level semantic metatdata concepts. - Categories are classes in an ontological model like Biolink. - Predicates are links between nodes. """ - CATEGORY = "category" - PREDICATE = "predicate" - -class FileFormat(Enum): - """ File formats this module knows about. """ - JSON = "json" - YAML = "yaml" - -class Util: - - @staticmethod - def current_time_in_millis(): - """ - Get current time in milliseconds. - - Returns - ------- - int - Time in milliseconds - - """ - return int(round(time.time() * 1000)) - - """ A just do it approach to getting data. """ - @staticmethod - def read_file(path): - """ Read a file. - :param path: Path to a file. - """ - text = None - with open(path, "r") as stream: - text = stream.read () - return text - - @staticmethod - def read_url(url): - """ Read data from a URL. - :param url: The URL to read. """ - return requests.get (url).text - - @staticmethod - def read_data(path): - """ Read data from a URL or File. HTTP(S) is the only supported protocol. - :param path: A URL or file path. """ - text = None - if Util.is_web(path): - text = Util.read_url (path) - else: - text = Util.read_file (path) - return text - - @staticmethod - def read_object(path, key=None): - """ Read on object from a path. - :param path: A URL or file path. Supports YAML and JSON depending on extension. - :param key: A configuration key. This is prepended to the path if present. - :raises ValueError: If the key is not in the configuration. """ - if key is not None: - prefix = config[key] - path = f"{prefix}/{path}" if Util.is_web(prefix) \ - else os.path.join (prefix, path) - obj = None - if path.endswith (".yaml") or path.endswith (".yml"): - obj = yaml.safe_load (Util.read_data (path)) - elif path.endswith (".json"): - obj = json.loads (Util.read_data (path)) - return obj - - @staticmethod - def is_web (uri): - """ The URI is a web URI (starts with http or https). - :param uri: A URI """ - return uri.startswith("http://") or uri.startswith ("https://") - - @staticmethod - def write_object (obj, path, key=None): - """ Write an object to a path. YAML and JSON supported based on extension. - :param obj: The object to write. - :param path: The path to write to. - :param key: The configuration key to prepend to the path. - """ - """ Prepend a prefix from the configuration file if a key is given. """ - if key is not None: - prefix = config[key] - path = f"{prefix}/{path}" if Util.is_web(prefix) \ - else os.path.join (prefix, path) - """ Ensure the directory to be written to exists. """ - dirname = os.path.dirname (path) - if not os.path.exists (dirname): - os.makedirs (dirname, exist_ok=True) - """ Write the file in the specified format. """ - if path.endswith (".yaml") or path.endswith (".yml"): - with open(path, 'w') as outfile: - yaml.dump (obj, stream) - elif path.endswith (".json"): - with open (path, "w") as stream: - json.dump (obj, stream, indent=2) - else: - """ Raise an exception if invalid. """ - raise ValueError (f"Unrecognized extension: {path}") - - @staticmethod - def kgx_path (name): - """ Form a KGX object path. - :path name: Name of the KGX object. """ - return os.path.join (data_root, "kgx", name) - - @staticmethod - def kgx_objects (): - """ A list of KGX objects. 
""" - kgx_pattern = Util.kgx_path("**.json") - return sorted(glob.glob (kgx_pattern)) - - @staticmethod - def merge_path (name): - """ Form a merged KGX object path. - :path name: Name of the merged KGX object. """ - return os.path.join (data_root, "merge", name) - - @staticmethod - def merged_objects (): - """ A list of merged KGX objects. """ - merged_pattern = Util.merge_path("**.json") - return sorted(glob.glob (merged_pattern)) - - @staticmethod - def schema_path (name): - """ Path to a schema object. - :param name: Name of the object to get a path for. """ - return os.path.join (data_root, "schema", name) - - @staticmethod - def bulk_path (name): - """ Path to a bulk load object. - :param name: Name of the object. """ - return os.path.join (data_root, "bulk", name) - - @staticmethod - def read_schema (schema_type: SchemaType): - """ Read a schema object. - :param schema_type: Schema type of the object to read. """ - path = Util.schema_path (f"{schema_type.value}-schema.json") - return Util.read_object (path) - - @staticmethod - def get_uri (path, key): - """ Build a URI. - :param path: The path of an object. - :param key: The key of a configuration value to prepend to the object. """ - return f"{config[key]}/{path}" - - @staticmethod - def get_relative_path (path): - return os.path.join (os.path.dirname (__file__), path) - - @staticmethod - def read_relative_object (path): - return Util.read_object (Util.get_relative_path(path)) - - @staticmethod - def trunc(text, limit): - return ('..' + text[-limit-2:]) if len(text) > limit else text - - @staticmethod - def is_up_to_date (source, targets): - target_time_list = [ os.stat (f).st_mtime for f in targets if os.path.exists(f) ] - if len(target_time_list) == 0: - log.debug (f"no targets found") - return False - source = [ os.stat (f).st_mtime for f in source if os.path.exists (f) ] - if len(source) == 0: - log.debug ("no source found. up to date") - return True - return max(source) < min(target_time_list) - -class KGXModel: - """ Abstractions for transforming Knowledge Graph Exchange formatted data. """ - def __init__(self, biolink): - self.biolink = biolink - - def get (self, dataset_version = "v0.1"): - """ Read metadata for edge and node files, then join them into whole KGX objects - containing both nodes and edges. - :param dataset_version: Data version to operate on. - """ - metadata = Util.read_relative_object ("metadata.yaml") - for item in metadata['versions']: - if item['version'] == dataset_version: - for edge_url in item['edgeFiles']: - start = Util.current_time_in_millis () - edge_url = Util.get_uri (edge_url, "base_data_uri") - node_url = edge_url.replace ("-edge-", "-node-") - subgraph_basename = os.path.basename (edge_url.replace ("-edge", "")) - subgraph_path = Util.kgx_path (subgraph_basename) - if os.path.exists (subgraph_path): - log.info (f"cached kgx: {subgraph_path}") - continue - subgraph = { - "edges" : Util.read_object (edge_url), - "nodes" : Util.read_object (node_url) - } - Util.write_object (subgraph, subgraph_path) - total_time = Util.current_time_in_millis () - start - - edges = len(subgraph['edges']) - nodes = len(subgraph['nodes']) - log.debug ("wrote {:>45}: edges:{:>7} nodes: {:>7} time:{:>8}".format ( - Util.trunc(subgraph_path, 45), edges, nodes, total_time)) - - def create_schema (self): - """ - Determine the schema of each type of object. We have to do this to make it possible - to write tabular data. Need to know all possible columns in advance and correct missing - fields. 
- """ - if self.schema_up_to_date(): - log.info (f"schema is up to date.") - return - - predicate_schemas = defaultdict(lambda:None) - category_schemas = defaultdict(lambda:None) - for subgraph in Util.kgx_objects (): - """ Read a kgx data file. """ - log.debug (f"analyzing schema of {subgraph}.") - basename = os.path.basename (subgraph).replace (".json", "") - graph = Util.read_object (subgraph) - """ Infer predicate schemas. """ - for edge in graph['edges']: - predicate = edge['edge_label'] - if not predicate in predicate_schemas: - predicate_schemas[predicate] = edge - for k in edge.keys (): - edge[k] = '' - else: - for k in edge.keys (): - if not k in predicate_schemas[predicate]: - predicate_schemas[predicate][k] = '' - """ Infer node schemas. """ - for node in graph['nodes']: - node_type = self.biolink.get_leaf_class (node['category']) - if not node_type in category_schemas: - category_schemas[node_type] = node - for k in node.keys (): - node[k] = '' - else: - for k in node.keys (): - if not k in category_schemas[node_type]: - category_schemas[node_type][k] = '' - """ Write node and predicate schemas. """ - self.write_schema (predicate_schemas, SchemaType.PREDICATE) - self.write_schema (category_schemas, SchemaType.CATEGORY) - - def schema_up_to_date (self): - return Util.is_up_to_date ( - source=Util.kgx_objects (), - targets=[ - Util.schema_path (f"{SchemaType.PREDICATE.value}-schema.json"), - Util.schema_path (f"{SchemaType.PREDICATE.value}-schema.json") - ]) - - def write_schema (self, schema, schema_type: SchemaType): - """ Output the schema file. - :param schema: Schema to get keys from. - :param schema_type: Type of schema to write. """ - file_name = Util.schema_path (f"{schema_type.value}-schema.json") - log.info (f"writing schema: {file_name}") - dictionary = { k : self.format_keys(v.keys(), schema_type) for k, v in schema.items () } - Util.write_object (dictionary, file_name) - - def merge_nodes (self, L, R): - for k in L.keys (): - R_v = R.get (k, None) - if R_v == '' or R_v == None: - L[k] = R_v - - def diff_lists (self, L, R): - return list(list(set(L)-set(R)) + list(set(R)-set(L))) - - def merge (self): - """ Merge nodes. Would be good to have something less computationally intensive. 
""" - for path in Util.kgx_objects (): - new_path = path.replace ('/kgx/', '/merge/') - - source_stats = os.stat (path) - if os.path.exists (new_path): - dest_stats = os.stat (new_path) - if dest_stats.st_mtime > source_stats.st_mtime: - log.info (f"merge {new_path} is up to date.") - continue - - log.info (f"merging {path}") - graph = Util.read_object (path) - graph_nodes = graph.get ('nodes', []) - graph_map = { n['id'] : n for n in graph_nodes } - graph_keys = graph_map.keys () - total_merge_time = 0 - for path_2 in Util.kgx_objects (): - if path_2 == path: - continue - start = Util.current_time_in_millis () - other_graph = Util.read_object (path_2) - load_time = Util.current_time_in_millis () - start - - start = Util.current_time_in_millis () - other_nodes = other_graph.get('nodes', []) - other_map = { n['id'] : n for n in other_nodes } - other_keys = set(other_map.keys()) - intersection = [ v for v in graph_keys if v in other_keys ] - difference = list(set(other_keys) - set(graph_keys)) - scope_time = Util.current_time_in_millis () - start - - start = Util.current_time_in_millis () - for i in intersection: - self.merge_nodes (graph_map[i], other_map[i]) - other_graph['nodes'] = [ other_map[i] for i in difference ] - merge_time = Util.current_time_in_millis () - start - - start = Util.current_time_in_millis () - Util.write_object (other_graph, path_2.replace ('kgx', 'merge')) - write_time = Util.current_time_in_millis () - start - log.debug ("merged {:>45} load:{:>5} scope:{:>7} merge:{:>3}".format( - Util.trunc(path_2, 45), load_time, scope_time, merge_time)) - total_merge_time += load_time + scope_time + merge_time + write_time - - start = Util.current_time_in_millis () - Util.write_object (graph, new_path) - rewrite_time = Util.current_time_in_millis () - start - log.info (f"{path} rewrite: {rewrite_time}. total merge time: {total_merge_time}") - - def format_keys (self, keys, schema_type : SchemaType): - """ Format schema keys. Make source and destination first in edges. Make - id first in nodes. Remove keys for fields we can't yet represent. - :param keys: List of keys. - :param schema_type: Type of schema to conform to. - """ - """ Sort keys. """ - k_list = sorted(keys) - if schema_type == SchemaType.PREDICATE: - """ Rename subject and object to src and dest """ - k_list.remove ('subject') - k_list.remove ('object') - k_list.insert (0, 'src') - k_list.insert (1, 'dest') - elif schema_type == SchemaType.CATEGORY: - """ Make id the first field. Remove smiles. It causes ast parse errors. - TODO: update bulk loader to ignore AST on selected fields. - """ - k_list.remove ('id') - if 'simple_smiles' in k_list: - k_list.remove ('simple_smiles') - k_list.insert (0, 'id') - return k_list - - def load (self): - """ Use KGX to load a data set into Redisgraph """ - input_format = "json" - uri = f"redis://{config['redisgraph']['host']}:{config['redisgraph']['ports']['http']}/" - username = config['redisgraph']['username'] - password = config['redisgraph']['password'] - log.info (f"connecting to redisgraph: {uri}") - for subgraph in glob.glob (f"{kgx_repo}/**.json"): - redisgraph_upload(inputs=[ subgraph ], - input_format=input_format, - input_compression=None, - uri=uri, - username=username, - password=password, - node_filters=[], - edge_filters=[]) - -class BiolinkModel: - """ Programmatic model of Biolink. """ - def to_camel_case(self, snake_str): - """ Convert a snake case string to camel case. 
""" - components = snake_str.split('_') - return ''.join(x.title() for x in components) - - def get_class(self, name): - """ Get a Python class from a string name. """ - return getattr(sys.modules["biolink.model"], name) - - def is_derived (self, a_class_name, classes): - """ Return true if the class derives from any of the provided classes. """ - for c in classes: - if isinstance (self.get_class(self.to_camel_case(a_class_name)), c): - return True - return False - - def get_leaf_class (self, names): - """ Return the leaf classes in the provided list of names. """ - classes = [ self.get_class(self.to_camel_case(n)) for n in names ] - leaves = [ n for n in names if not self.is_derived (n, classes) ] - return leaves [0] - -class BulkLoad: - """ Tools for creating a Redisgraph bulk load dataset. """ - def __init__(self, biolink): - self.biolink = biolink - - def tables_up_to_date (self): - return Util.is_up_to_date ( - source=[ - Util.schema_path (f"{SchemaType.PREDICATE.value}-schema.json"), - Util.schema_path (f"{SchemaType.PREDICATE.value}-schema.json") - ] + Util.merged_objects (), - targets=glob.glob (Util.bulk_path ("nodes/**.csv")) + \ - glob.glob (Util.bulk_path ("edges/**.csv"))) - - def create (self): - """ Check source times. """ - if self.tables_up_to_date (): - log.info ("up to date.") - return - - """ Format the data for bulk load. """ - predicates_schema = Util.read_schema (SchemaType.PREDICATE) - categories_schema = Util.read_schema (SchemaType.CATEGORY) - bulk_path = Util.bulk_path("") - if os.path.exists(bulk_path): - shutil.rmtree(bulk_path) - - state = defaultdict(lambda:None) - for subgraph in Util.merged_objects (): - log.info (f"processing {subgraph}") - graph = Util.read_object (subgraph) - - """ Write node data for bulk load. """ - categories = defaultdict(lambda: []) - for node in graph['nodes']: - index = self.biolink.get_leaf_class (node['category']) - categories[index].append (node) - self.write_bulk (Util.bulk_path("nodes"), categories, categories_schema, - state=state, f=subgraph) - - """ Write predicate data for bulk load. """ - predicates = defaultdict(lambda: []) - for edge in graph['edges']: - predicates[edge['edge_label']].append (edge) - edge['src'] = edge.pop ('subject') - edge['dest'] = edge.pop ('object') - self.write_bulk (Util.bulk_path("edges"), predicates, predicates_schema) - - def cleanup (self, v): - """ Filter problematic text. - :param v: A value to filter and clean. - """ - if isinstance(v, list): - v = [ self.cleanup(val) for val in v ] - elif isinstance (v, str): - """ Some values contain the CSV separator character. 'fix' that. """ - if len(v) > 1 and v[0] == '[' and v[-1] == ']': - v = v.replace ("[", "@").replace ("]", "@") #f" {v}" - v = v.replace ("|","^") - return v - - def write_bulk (self, bulk_path, obj_map, schema, state={}, f=None): - """ Write a bulk load group of objects. - :param bulk_path: Path to the bulk loader object to write. - :param obj_map: A map of biolink type to list of objects. - :param schema: The schema (nodes or predicates) containing identifiers. - :param state: Track state of already written objects to avoid duplicates. 
- """ - os.makedirs (bulk_path, exist_ok=True) - for key, objects in obj_map.items (): - out_file = f"{bulk_path}/{key}.csv" - if len(objects) == 0: - continue - new_file = not os.path.exists (out_file) - all_keys = schema[key] - with open (out_file, "a") as stream: - if new_file: - log.info (f" --creating {out_file}") - stream.write ("|".join (all_keys)) - stream.write ("\n") - """ Make all objects conform to the schema. """ - for obj in objects: - for akey in all_keys: - if not akey in obj: - obj[akey] = "" - """ Write fields, skipping duplicate objects. """ - for obj in objects: - oid = str(obj['id']) - if oid in state: - continue - state[oid] = oid - values = [ self.cleanup(obj[k]) for k in all_keys if not 'smiles' in k ] - clean = list(map(str, values)) - s = "|".join (clean) - stream.write (s) - stream.write ("\n") - - def insert (self): - redisgraph = config.get('redisgraph', {}) - bulk_loader = config.get('bulk_loader', {}) - nodes = sorted(glob.glob (Util.bulk_path ("nodes/**.csv"))) - edges = sorted(glob.glob (Util.bulk_path ("edges/**.csv"))) - graph = redisgraph['graph'] - log.info (f"bulk loading \n nodes: {nodes} \n edges: {edges}") - print (f"bulk loading \n nodes: {nodes} \n edges: {edges}") - - try: - log.info (f"deleting graph {graph} in preparation for bulk load.") - db = self.get_redisgraph (redisgraph) - db.redis_graph.delete () - except redis.exceptions.ResponseError: - log.info ("no graph to delete") - - log.info (f"bulk loading graph: {graph}") - args = [] - if len(nodes) > 0: - args.extend (("-n " + " -n ".join (nodes)).split ()) - if len(edges) > 0: - args.extend (("-r " + " -r ".join (edges)).split ()) - args.extend ([ "--separator=|" ]) - args.extend ([ redisgraph['graph'] ]) - """ standalone_mode=False tells click not to sys.exit() """ - bulk_insert (args, standalone_mode=False) - - def get_redisgraph (self, redisgraph): - return RedisGraph (host=redisgraph['host'], - port=redisgraph['ports']['http'], - graph=redisgraph['graph']) - - def validate (self): - redisgraph = config.get('redisgraph', {}) - print (f"config:{json.dumps(redisgraph, indent=2)}") - db = self.get_redisgraph (redisgraph) - validation_queries = config.get('validation', {}).get('queries', []) - for key, query in validation_queries.items (): - text = query['query'] - name = query['name'] - args = query.get('args', [{}]) - for arg in args: - start = Util.current_time_in_millis () - instance = Template (text).safe_substitute (arg) - db.query (instance) - duration = Util.current_time_in_millis () - start - log.info (f"Query {key}:{name} ran in {duration}ms: {instance}") - -class Roger: - """ Consolidate Roger functionality for a cleaner interface. """ - - def __init__(self, to_string=False): - """ Initialize. - :param to_string: Log messages to a string, available as self.log_stream.getvalue() - after execution completes. - """ - import logging - if to_string: - """ Add a stream handler to enable to_string. """ - self.log_stream = StringIO() - self.string_handler = logging.StreamHandler (self.log_stream) - log.addHandler (self.string_handler) - self.biolink = BiolinkModel () - self.kgx = KGXModel (self.biolink) - self.bulk = BulkLoad (self.biolink) - - def __enter__(self): - """ Implement Python's Context Manager interface. """ - return self - - def __exit__(self, exception_type, exception_value, traceback): - """ Implement Python's Context Manager interface. We use this finalizer - to detach the stream handler appended in the constructor. 
- :param exception_type: Type of exception, if one occurred. - :param exception_value: The exception, if one occurred. - :param traceback: The stack trace explaining the exception. - """ - if exception_type or exception_value or traceback: - log.error ("{} {} {}".format (exception_type, exception_value, traceback)) - log.removeHandler (self.string_handler) - -class RogerUtil: - """ An interface abstracting Roger's inner workings to make it easier to - incorporate into external tools like workflow engines. """ - @staticmethod - def get_kgx (to_string=False): - output = None - with Roger (to_string) as roger: - roger.kgx.get () - output = roger.log_stream.getvalue () if to_string else None - return output - - @staticmethod - def create_schema (to_string=False): - output = None - with Roger (to_string) as roger: - roger.kgx.create_schema () - output = roger.log_stream.getvalue () if to_string else None - return output - - @staticmethod - def merge_nodes (to_string=False): - output = None - with Roger (to_string) as roger: - roger.kgx.merge () - output = roger.log_stream.getvalue () if to_string else None - return output - - @staticmethod - def create_bulk_load (to_string=False): - output = None - with Roger (to_string) as roger: - roger.bulk.create () - output = roger.log_stream.getvalue () if to_string else None - return output - - @staticmethod - def bulk_load (to_string=False): - output = None - with Roger (to_string) as roger: - roger.bulk.insert () - output = roger.log_stream.getvalue () if to_string else None - return output - - @staticmethod - def validate (to_string=False): - output = None - with Roger (to_string) as roger: - roger.bulk.validate () - output = roger.log_stream.getvalue () if to_string else None - return output - -if __name__ == "__main__": - """ Roger CLI. 
""" - parser = argparse.ArgumentParser(description='Roger') - parser.add_argument('-v', '--dataset-version', help="Dataset version.", default="v0.1") - parser.add_argument('-d', '--data-root', help="Root of data hierarchy", default=None) - parser.add_argument('-g', '--get-kgx', help="Get KGX objects", action='store_true') - parser.add_argument('-l', '--load-kgx', help="Load via KGX", action='store_true') - parser.add_argument('-s', '--create-schema', help="Infer schema", action='store_true') - parser.add_argument('-m', '--merge-kgx', help="Merge KGX nodes", action='store_true') - parser.add_argument('-b', '--create-bulk', help="Create bulk load", action='store_true') - parser.add_argument('-i', '--insert', help="Do the bulk insert", action='store_true') - parser.add_argument('-a', '--validate', help="Validate the insert", action='store_true') - args = parser.parse_args () - - biolink = BiolinkModel () - kgx = KGXModel (biolink) - bulk = BulkLoad (biolink) - if args.data_root is not None: - data_root = get_config()['data_root'] = args.data_root - log.info (f"data root:{data_root}") - if args.get_kgx: - kgx.get (dataset_version=args.dataset_version) - if args.load_kgx: - kgx.load () - if args.merge_kgx: - kgx.merge () - if args.create_schema: - kgx.create_schema () - if args.create_bulk: - bulk.create () - if args.insert: - bulk.insert () - if args.validate: - bulk.validate () - - sys.exit (0) diff --git a/roger/metadata.yaml b/roger/metadata.yaml deleted file mode 100644 index 1b56206e..00000000 --- a/roger/metadata.yaml +++ /dev/null @@ -1,37 +0,0 @@ -versions: -- edgeFiles: - # - biolink_kgx-edge-v0.1.json - - chembio_kgx-edge-v0.1.json - - chemical_normalization-edge-v0.1.json - - cord19-phenotypes-edge-v0.1.json -# - cord19-scibite-edge-v0.1.json -# - cord19-scigraph-edge-v0.1.json - - ctd-edge-v0.1.json - - foodb-edge-v0.1.json -# - kegg-edge-v0.1.json - - mychem-edge-v0.1.json -# - panther-edge-v0.1.json - - pharos-edge-v0.1.json - - topmed-edge-v0.1.json - nodeFiles: - - biolink_kgx-node-v0.1.json - - chembio_kgx-node-v0.1.json - - chemical_normalization-node-v0.1.json - - cord19-phenotypes-node-v0.1.json - - cord19-scibite-node-v0.1.json - - cord19-scigraph-node-v0.1.json - - ctd-node-v0.1.json - - foodb-node-v0.1.json - # - kegg-node-v0.1.json - - mychem-node-v0.1.json - - panther-node-v0.1.json - - pharos-node-v0.1.json - - topmed-node-v0.1.json - version: v0.1 -- version: test - edgeFiles: - - cord19-phenotypes-edge-v0.1.json - - chembio_kgx-edge-v0.1.json - nodeFiles: - - cord19-phenotypes-node-v0.1.json - - chembio_kgx-node-v0.1.json diff --git a/roger/roger_util.py b/roger/roger_util.py deleted file mode 100644 index 35c5f3f7..00000000 --- a/roger/roger_util.py +++ /dev/null @@ -1,58 +0,0 @@ -import logging -import requests -import sys -import yaml -from os import path -from typing import Dict, Any, Optional - -config: Optional[Dict[str, Any]] = None -logger: Optional[logging.Logger] = None - -CONFIG_FILENAME = path.join(path.dirname(path.abspath(__file__)), 'config.yaml') - -def get_config(filename: str = CONFIG_FILENAME) -> dict: - """ - Get config as a dictionary - - Parameters - ---------- - filename: str - The filename with all the configuration - - Returns - ------- - dict - A dictionary containing all the entries from the config YAML - - """ - global config - if config is None: - config = yaml.load(open(filename), Loader=yaml.FullLoader) - return config - -def get_logger(name: str = 'roger') -> logging.Logger: - """ - Get an instance of logger. 
- - Parameters - ---------- - name: str - The name of logger - - Returns - ------- - logging.Logger - An instance of logging.Logger - - """ - global logger - if logger is None: - config = get_config() - logger = logging.getLogger(name) - handler = logging.StreamHandler(sys.stdout) - formatter = logging.Formatter(config['logging']['format']) - handler.setFormatter(formatter) - logger.addHandler(handler) - logger.setLevel(config['logging']['level']) - logger.propagate = False - return logger diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py new file mode 100644 index 00000000..32e25657 --- /dev/null +++ b/tests/integration/__init__.py @@ -0,0 +1,3 @@ +from pathlib import Path + +TEST_DATA_DIR = (Path(__file__).parent / 'data').resolve() \ No newline at end of file diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py new file mode 100644 index 00000000..d0a68d1c --- /dev/null +++ b/tests/integration/conftest.py @@ -0,0 +1,76 @@ +import os + +from roger.core.enums import SchemaType +import json + +class BiolinkMock: + def __init__(self): + self.leafs = [ + 'chemical_substance', + 'molecular_activity', + 'gene', + 'biological_process', + 'disease', + 'phenotypic_feature' + ] + + def get_leaf_class(self, class_names): + for y in self.leafs: + if y in class_names: + return y + return class_names[0] + + def find_biolink_leaves(self, biolink_concepts): + return set([concept for concept in biolink_concepts + if concept in ['named_thing', 'some_other_type']]) + +category = None +predicates = None +file_content_assertions = {} +kgx_files = [] +merged_files = [] +merge_file_test_dir = '' +schema = { + SchemaType.PREDICATE: {}, + SchemaType.CATEGORY: {} +} + +def kgx_objects(): + return [os.path.join(*os.path.split(__file__)[:-1], 'data', file) + for file in kgx_files] + +def merged_objects(): + return [os.path.join(*os.path.split(__file__)[:-1], 'data', file) + for file in merged_files] + +def bulk_path(*args, **kwargs): + return os.path.join(*os.path.split(__file__)[:-1], 'data', 'bulk') + +def is_up_to_date(*args, **kwargs): + return False + +def schema_path(name, *args, **kwargs): + return name + +def read_schema(schema_type: SchemaType, *args, **kwargs): + return conftest.schema[schema_type] + +def read_object(path, *args, **kwargs): + import json + with open(path) as f: + return json.load(f) + +def write_object(dictionary, file_name): + print(dictionary, file_name) + print(file_content_assertions) + assert file_content_assertions[file_name] == dictionary + +def merge_path(file_name): + return os.path.join(*os.path.split(__file__)[:-1], 'data', 'merge', + merge_file_test_dir, file_name) + +def json_line_iter(jsonl_file_path): + f = open(file=jsonl_file_path, mode='r') + for line in f: + yield json.loads(line) + f.close() diff --git a/tests/integration/data/merge/conflicting_prop_types__edges__schema__kgx/edges.jsonl b/tests/integration/data/merge/conflicting_prop_types__edges__schema__kgx/edges.jsonl new file mode 100644 index 00000000..e0477a06 --- /dev/null +++ b/tests/integration/data/merge/conflicting_prop_types__edges__schema__kgx/edges.jsonl @@ -0,0 +1,2 @@ +{"id": "edge_1", "edge_label": "edge_type_1", "list_vs_str": [], "list_vs_int": [], "list_vs_bool": [], "list_vs_float": [], "str_vs_float": "", "str_vs_bool": "", "str_vs_int": "", "int_vs_bool": 0, 
"int_vs_float": 0, "float_vs_bool": 0, "predicate": "related_to"} +{"id": "edge_2", "edge_label": "edge_type_1", "list_vs_str": "", "list_vs_int": 0, "list_vs_bool": true, "list_vs_float": 0.0, "str_vs_float": 0.0, "str_vs_bool": false, "str_vs_int": 0, "int_vs_bool": true, "int_vs_float": 0.0, "float_vs_bool": true, "predicate": "related_to" } \ No newline at end of file diff --git a/tests/integration/data/merge/conflicting_prop_types__edges__schema__kgx/expected.json b/tests/integration/data/merge/conflicting_prop_types__edges__schema__kgx/expected.json new file mode 100644 index 00000000..49c8e2b3 --- /dev/null +++ b/tests/integration/data/merge/conflicting_prop_types__edges__schema__kgx/expected.json @@ -0,0 +1,20 @@ +{ + "predicate-schema.json": { + "related_to": { + "id": "str", + "edge_label": "str", + "list_vs_str": "list", + "list_vs_int": "list", + "list_vs_bool": "list", + "list_vs_float": "list", + "str_vs_float": "str", + "str_vs_bool": "str", + "str_vs_int": "str", + "int_vs_bool": "str", + "int_vs_float": "str", + "float_vs_bool": "str", + "predicate": "str" + } + }, + "category-schema.json": {} +} \ No newline at end of file diff --git a/tests/integration/data/merge/conflicting_prop_types__edges__schema__kgx/nodes.jsonl b/tests/integration/data/merge/conflicting_prop_types__edges__schema__kgx/nodes.jsonl new file mode 100644 index 00000000..e69de29b diff --git a/tests/integration/data/merge/conflicting_prop_types__nodes__schema__kgx/edges.jsonl b/tests/integration/data/merge/conflicting_prop_types__nodes__schema__kgx/edges.jsonl new file mode 100644 index 00000000..e69de29b diff --git a/tests/integration/data/merge/conflicting_prop_types__nodes__schema__kgx/expected.json b/tests/integration/data/merge/conflicting_prop_types__nodes__schema__kgx/expected.json new file mode 100644 index 00000000..7d7979b5 --- /dev/null +++ b/tests/integration/data/merge/conflicting_prop_types__nodes__schema__kgx/expected.json @@ -0,0 +1,19 @@ +{ + "category-schema.json": { + "named_thing": { + "id": "str", + "category": "list", + "list_vs_str": "list", + "list_vs_int": "list", + "list_vs_bool": "list", + "list_vs_float": "list", + "str_vs_float": "str", + "str_vs_bool": "str", + "str_vs_int": "str", + "int_vs_bool": "str", + "int_vs_float": "str", + "float_vs_bool": "str" + } + }, + "predicate-schema.json": {} +} \ No newline at end of file diff --git a/tests/integration/data/merge/conflicting_prop_types__nodes__schema__kgx/nodes.jsonl b/tests/integration/data/merge/conflicting_prop_types__nodes__schema__kgx/nodes.jsonl new file mode 100644 index 00000000..ed5aae5d --- /dev/null +++ b/tests/integration/data/merge/conflicting_prop_types__nodes__schema__kgx/nodes.jsonl @@ -0,0 +1,2 @@ +{"id": "node_1", "category": ["named_thing"], "list_vs_str": [], "list_vs_int": [], "list_vs_bool": [], "list_vs_float": [], "str_vs_float": "", "str_vs_bool": "", "str_vs_int": "", "int_vs_bool": 0, "int_vs_float": 0, "float_vs_bool": 0} +{"id": "node_1", "category": ["named_thing"], "list_vs_str": "", "list_vs_int": 0, "list_vs_bool": true, "list_vs_float": 0.0, "str_vs_float": 0.0, "str_vs_bool": false, "str_vs_int": 0, "int_vs_bool": true, "int_vs_float": 0.0, "float_vs_bool": true} \ No newline at end of file diff --git a/tests/integration/data/merge/non_conflicting_prop_types__schema__kgx/edges.jsonl b/tests/integration/data/merge/non_conflicting_prop_types__schema__kgx/edges.jsonl new file mode 100644 index 00000000..63ab7769 --- /dev/null +++ 
b/tests/integration/data/merge/non_conflicting_prop_types__schema__kgx/edges.jsonl @@ -0,0 +1,4 @@ +{"edge_label": "edge_label_1", "id": "x", "bool_attr": false, "int_attr": 0, "float_attr": 0.0 , "predicate": "edge_label_1"} +{"edge_label": "edge_label_1", "id": "x3", "str_attr": "str", "list_attr": [], "predicate": "edge_label_1"} +{"edge_label": "edge_label_2", "id": "x4", "str_attr": "str", "predicate": "edge_label_2"} +{"edge_label": "edge_label_2", "id": "x3", "bool_attr": true, "float_attr": 2.33, "int_attr": 3092, "str_att": "name", "predicate": "edge_label_2"} \ No newline at end of file diff --git a/tests/integration/data/merge/non_conflicting_prop_types__schema__kgx/expected.json b/tests/integration/data/merge/non_conflicting_prop_types__schema__kgx/expected.json new file mode 100644 index 00000000..6fcf9cd5 --- /dev/null +++ b/tests/integration/data/merge/non_conflicting_prop_types__schema__kgx/expected.json @@ -0,0 +1,43 @@ +{ + "category-schema.json": { + "named_thing": { + "str_attr": "str", + "list_attr": "list", + "bool_attr": "bool", + "int_attr": "int", + "float_attr": "float", + "id": "str", + "category": "list" + }, + "some_other_type": { + "id": "str", + "category": "list", + "attr_1": "str", + "attr_2": "list", + "attr_3": "bool", + "attr_4": "int" + } + }, + "predicate-schema.json": { + "edge_label_1": { + "id": "str", + "edge_label": "str", + "str_attr": "str", + "list_attr": "list", + "bool_attr": "bool", + "int_attr": "int", + "float_attr": "float", + "predicate": "str" + }, + "edge_label_2": { + "id": "str", + "str_attr": "str", + "edge_label": "str", + "bool_attr": "bool", + "float_attr": "float", + "int_attr": "int", + "str_att": "str", + "predicate": "str" + } + } +} \ No newline at end of file diff --git a/tests/integration/data/merge/non_conflicting_prop_types__schema__kgx/nodes.jsonl b/tests/integration/data/merge/non_conflicting_prop_types__schema__kgx/nodes.jsonl new file mode 100644 index 00000000..1670a2be --- /dev/null +++ b/tests/integration/data/merge/non_conflicting_prop_types__schema__kgx/nodes.jsonl @@ -0,0 +1,3 @@ +{"id": "ID1", "category": ["named_thing"], "list_attr": [], "bool_attr": false, "int_attr": 0} +{"id": "ID2", "category": ["named_thing"], "str_attr": "", "float_attr": 0.0} +{"id": "Id3", "category": ["some_other_type"], "attr_1": "", "attr_2": [], "attr_3": true, "attr_4": 1} \ No newline at end of file diff --git a/tests/integration/test_KGX_Model.py b/tests/integration/test_KGX_Model.py new file mode 100644 index 00000000..4fc5716f --- /dev/null +++ b/tests/integration/test_KGX_Model.py @@ -0,0 +1,42 @@ +import json +import pytest +from unittest.mock import patch + +from roger.models.kgx import KGXModel +from . 
import conftest + + +@pytest.fixture +def kgx_model(): + biolink = conftest.BiolinkMock() + kgx_model = KGXModel(biolink=biolink, config={}) + return kgx_model + +def setup_mock_and_run_create_schema(test_files_dir, kgx_model: KGXModel): + + with patch('roger.models.kgx.storage', conftest): + conftest.merge_file_test_dir = test_files_dir + with open(conftest.merge_path("expected.json")) as f: + expected = json.load(f) + conftest.file_content_assertions = expected + kgx_model.create_schema() + +def test_create_schema_plain(kgx_model: KGXModel): + file_name = 'non_conflicting_prop_types__schema__kgx' + setup_mock_and_run_create_schema(file_name, kgx_model=kgx_model) + +def test_create_schema_conflicting_nodes(kgx_model: KGXModel): + file_name = 'conflicting_prop_types__nodes__schema__kgx' + setup_mock_and_run_create_schema(file_name, kgx_model=kgx_model) + +def test_create_schema_conflicting_edges(kgx_model: KGXModel): + file_name = 'conflicting_prop_types__edges__schema__kgx' + setup_mock_and_run_create_schema(file_name, kgx_model=kgx_model) + +def test_merge(kgx_model: KGXModel): + with patch('roger.models.kgx.storage', conftest): + conftest.kgx_files = [ + 'data_1.merge.kgx.json', + 'data_2.merge.kgx.json' + ] + #TODO add tests for merge nodes diff --git a/tests/integration/test_bulk_loader.py b/tests/integration/test_bulk_loader.py new file mode 100644 index 00000000..ae99573f --- /dev/null +++ b/tests/integration/test_bulk_loader.py @@ -0,0 +1,111 @@ +import pytest +from unittest.mock import patch + +from roger.core import BulkLoad +from . import conftest + + +@pytest.fixture +def bulk_loader(): + biolink = conftest.BiolinkMock() + return BulkLoad(biolink=biolink, config={'separator': 30}) + + +def test_create_redis_schema(): + test_schema = { + 'concept': { + 'attribute0': 'list', + 'attribute1': 'str', + 'attribute2': 'int', + 'attribute3': 'bool' + } + } + redis_schema = BulkLoad.create_redis_schema_header(test_schema['concept'], is_relation=False) + assert 'attribute0:ARRAY' in redis_schema + assert 'attribute1:STRING' in redis_schema + assert 'attribute2:INT' in redis_schema + assert 'attribute3:BOOL' in redis_schema + + redis_schema = BulkLoad.create_redis_schema_header(test_schema['concept'], is_relation=True) + assert 'attribute0:ARRAY' in redis_schema + assert 'attribute1:STRING' in redis_schema + assert 'attribute2:INT' in redis_schema + assert 'attribute3:BOOL' in redis_schema + + # should add these columns to relationships + assert 'internal_start_id:START_ID' in redis_schema + assert 'internal_end_id:END_ID' in redis_schema + + +def test_group_by_set_attr(): + items = [ + { # we need to make sure that empty values are the only ones ignored + # not values that evaluate to false. 
+ 'id': 0, + 'attr_1': '', + 'attr_2': 2, + 'attr_3': [], + 'attr_4': False, + 'attr_5': None + }, + { + 'id': 1, + 'attr_1': 'a', + 'attr_2': 'b', + 'attr_3': 'c', + 'attr_4': '' + } + ] + # first group is attr_2, attr_4, 'id' + group_1 = frozenset(['attr_2', 'attr_4', 'id']) + # second group is attr_1, attr_2, attr_3 , 'id' + group_2 = frozenset(['attr_1', 'attr_2', 'attr_3', 'id']) + grouping, invalid_keys = BulkLoad.group_items_by_attributes_set(objects=items, + processed_object_ids=set()) + assert group_1 in grouping + assert group_2 in grouping + + assert items[0] in grouping[group_1] + assert items[1] in grouping[group_2] + + +def test_write_bulk_nodes(bulk_loader: BulkLoad): + nodes_schema = { + "named_thing": { + "id": "str", + "str": "str", + "list_attr": "list", + "bool_attr": "bool", + "float_attr": "float", + "int_attr": "int" + } + } + node_objects = { + "named_thing": [ + { + "id": "ID:1", + "str": "name", + "list_attr": ["x"], + "bool_attr": False, + "float_attr": 0.1, + "int_attr": 0 + } + ] + } + with patch('roger.core.bulkload.storage', conftest): + bulk_path = conftest.bulk_path() + state = {} + bulk_loader.write_bulk(bulk_path=bulk_path, + obj_map=node_objects, + schema=nodes_schema, + state=state, + is_relation=False) + assert len(state['file_paths']) > 0 + # @TODO add assertions. + # with open(os.path.join(bulk_path,'named_thing_csv-0-1')) + + + + + + diff --git a/tests/integration/test_dug_utils.py b/tests/integration/test_dug_utils.py new file mode 100644 index 00000000..4e31f820 --- /dev/null +++ b/tests/integration/test_dug_utils.py @@ -0,0 +1,62 @@ +import tempfile + +from pathlib import Path + +import pytest + +from dug_helpers.dug_utils import FileFetcher, get_topmed_files, get_dbgap_files +from roger.config import config + + +def test_fetch_network_file(): + filename = "README.md" + with tempfile.TemporaryDirectory() as tmp_dir: + fetch1 = FileFetcher( + "https://github.com", + "/helxplatform/roger/blob/main/", + tmp_dir, + ) + expected_path = Path(tmp_dir) / filename + assert not expected_path.exists() + fetch1(filename) + assert expected_path.exists() + + with tempfile.TemporaryDirectory() as tmp_dir: + fetch2 = FileFetcher( + "https://github.com", + Path("/helxplatform/roger/blob/main/"), + Path(tmp_dir), + ) + + expected_path = Path(tmp_dir) / filename + assert not expected_path.exists() + fetch2(filename) + assert expected_path.exists() + + +def test_fetcher_errors(): + + filename = "DOES NOT EXIST.md" + + with tempfile.TemporaryDirectory() as tmp_dir: + fetch = FileFetcher( + "https://github.com", + Path("/helxplatform/roger/blob/main/"), + Path(tmp_dir), + ) + with pytest.raises(RuntimeError): + fetch(filename) + + +@pytest.mark.skip() +def test_get_topmed_files(): + file_names = get_topmed_files(config=config) + for file_name in file_names: + assert Path(file_name).exists() + + +@pytest.mark.skip() +def test_get_dbgap_files(): + file_names = get_dbgap_files(config=config) + for file_name in file_names: + assert Path(file_name).exists() \ No newline at end of file diff --git a/tests/integration/test_type_conversion_util.py b/tests/integration/test_type_conversion_util.py new file mode 100644 index 00000000..ab4e122f --- /dev/null +++ b/tests/integration/test_type_conversion_util.py @@ -0,0 +1,49 @@ +from roger.components.data_conversion_utils import TypeConversionUtil + + +def test_type_comparision(): + datatype_1 = list.__name__ + datatype_2 = str.__name__ + datatype_3 = bool.__name__ + datatype_4 = float.__name__ + datatype_5 = int.__name__ + # list 
should always come first + assert datatype_1 == TypeConversionUtil.compare_types(datatype_1, datatype_2) + assert datatype_1 == TypeConversionUtil.compare_types(datatype_1, datatype_3) + assert datatype_1 == TypeConversionUtil.compare_types(datatype_1, datatype_4) + assert datatype_1 == TypeConversionUtil.compare_types(datatype_1, datatype_5) + + # then string + assert datatype_2 == TypeConversionUtil.compare_types(datatype_2, datatype_3) + assert datatype_2 == TypeConversionUtil.compare_types(datatype_2, datatype_4) + assert datatype_2 == TypeConversionUtil.compare_types(datatype_2, datatype_5) + + # the rest should always be casted up to string + assert datatype_2 == TypeConversionUtil.compare_types(datatype_3, datatype_4) + assert datatype_2 == TypeConversionUtil.compare_types(datatype_4, datatype_5) + assert datatype_2 == TypeConversionUtil.compare_types(datatype_5, datatype_3) + + # should raise error when sent 'Unknown' data types + bogus_dt = "bogus" + try: + TypeConversionUtil.compare_types(bogus_dt, datatype_1) + except AssertionError as error: + exception_raised = True + assert exception_raised + try: + TypeConversionUtil.compare_types(datatype_1, bogus_dt) + except AssertionError as error: + exception_raised = True + assert exception_raised + + +def test_casting_values(): + castable = [ + ["True", bool.__name__, True], + [1 , bool.__name__, True], + [1.0, bool.__name__, True], + [[], bool.__name__, False] + ] + for items in castable: + assert items[-1] == TypeConversionUtil.cast(*items[:-1]) # cast (value, type) + diff --git a/tests/test_redis_query.cypher b/tests/test_redis_query.cypher new file mode 100644 index 00000000..509df1fa --- /dev/null +++ b/tests/test_redis_query.cypher @@ -0,0 +1,5 @@ +MATCH (c{id:'HP:0032316'}) return c + +MATCH (disease:`Disease` {`id`: 'MONDO:0004979'}) WITH disease MATCH (disease)-[e1_disease_phenotypic_feature]-(phenotypic_feature:`PhenotypicFeature` {}) +WITH disease AS disease, phenotypic_feature AS phenotypic_feature, collect(e1_disease_phenotypic_feature) AS e1_disease_phenotypic_feature +RETURN disease,phenotypic_feature,e1_disease_phenotypic_feature,labels(disease) AS type__disease,labels(phenotypic_feature) AS type__phenotypic_feature,[edge in e1_disease_phenotypic_feature | type(edge)] AS type__e1_disease_phenotypic_feature,[edge in e1_disease_phenotypic_feature | [startNode(edge).id, endNode(edge).id]] AS id_pairs__e1_disease_phenotypic_feature \ No newline at end of file diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py new file mode 100644 index 00000000..f32cb512 --- /dev/null +++ b/tests/unit/test_config.py @@ -0,0 +1,75 @@ +import os + +from roger.config import RogerConfig, RedisConfig + + +def test_merge(): + dict_a = { + 'redis': { + 'host': 'redis', + 'port': 6379, + 'user': 'admin', + 'password': 'pass1' + } + } + dict_b = { + 'redis': { + 'port': 6389, + 'password': 'pass2' + }, + 'elasticsearch': { + 'host': 'elastic', + 'port': 9200 + } + } + + assert RogerConfig.merge_dicts(dict_a, dict_b) == { + 'redis': { + 'host': 'redis', + 'port': 6389, + 'user': 'admin', + 'password': 'pass2' + }, + 'elasticsearch': { + 'host': 'elastic', + 'port': 9200 + } + } + + +def test_get_overrides(): + prefix = "TEST_VALUES_" + assert RogerConfig.get_override_data(prefix) == {} + + os.environ[f"{prefix}REDIS_HOST"] = 'http://redis.svc' + os.environ[f"{prefix}REDIS_PORT"] = '6379' + os.environ[f"{prefix}REDIS_USER"] = 
'redis-admin' + os.environ[f"{prefix}REDIS_PASSWORD"] = 'admin-pass' + os.environ[f"{prefix}ELASTIC__SEARCH_HOST"] = 'http://elastic.svc' + + actual = RogerConfig.get_override_data(prefix) + expected = { + 'redis': { + 'host': 'http://redis.svc', + 'port': '6379', + 'user': 'redis-admin', + 'password': 'admin-pass', + }, + 'elastic_search': { + 'host': 'http://elastic.svc', + } + } + assert actual == expected + + +def test_redis_conf(): + redis_conf = RedisConfig(**{}) + assert redis_conf.username == "" + assert redis_conf.password == "" + assert redis_conf.host == "redis" + assert redis_conf.graph == "test" + assert redis_conf.port == 6379 + + redis_conf = RedisConfig(**{"port": "6379"}) + assert redis_conf.port == 6379 + diff --git a/tranql-schema.yaml b/tranql-schema.yaml new file mode 100644 index 00000000..79d9d575 --- /dev/null +++ b/tranql-schema.yaml @@ -0,0 +1,12 @@ +schema: + redis: + doc: | + Roger is a knowledge graph built by aggregeting several kgx formatted knowledge graphs from several sources. + url: "redis:" + redis: true + redis_connection_params: + # Host here is the service name in the docker composed container. + host: redis + port: 6379 + # SET USERNAME and PASSWORD + # via REDIS_USERNAME , REDIS_PASSWORD Env vars (i.e capitialize service name) diff --git a/tranql_translate.py b/tranql_translate.py deleted file mode 100644 index e9fe67e6..00000000 --- a/tranql_translate.py +++ /dev/null @@ -1,71 +0,0 @@ -# -*- coding: utf-8 -*- -# - -""" -An Airflow workflow for the Roger Translator KGX data pipeline. -""" - -import os -import subprocess -from airflow.operators.bash_operator import BashOperator -from airflow.contrib.example_dags.libs.helper import print_stuff -from airflow.models import DAG -from airflow.operators.python_operator import PythonOperator -from airflow.utils.dates import days_ago -from roger.core import RogerUtil - -default_args = { - 'owner': 'RENCI', - 'start_date': days_ago(1) -} - -""" Build the workflow's tasks and DAG. """ -with DAG( - dag_id='tranql_translate', - default_args=default_args, - schedule_interval=None -) as dag: - - """ Configure use of KubernetesExecutor. """ - at_k8s=False - - def get_executor_config (annotations=None): - """ Get an executor configuration. - :param annotations: Annotations to attach to the executor. - :returns: Returns a KubernetesExecutor if K8s is configured and None otherwise. - """ - k8s_executor_config = { - "KubernetesExecutor": { - "annotations": annotations - } - } - return k8s_executor_config if at_k8s else None - - def create_python_task (name, a_callable): - """ Create a python task. - :param name: The name of the task. - :param a_callable: The code to run in this task. - """ - return PythonOperator( - task_id=name, - python_callable=a_callable, - op_kwargs={ 'to_string' : True }, - executor_config=get_executor_config (annotations={ - "task_name" : name - }) - ) - - """ Build the workflow tasks. """ - intro = BashOperator(task_id='Intro', bash_command='echo running tranql translator') - get_kgx = create_python_task ("GetSource", RogerUtil.get_kgx) - create_schema = create_python_task ("CreateSchema", RogerUtil.create_schema) - merge_nodes = create_python_task ("MergeNodes", RogerUtil.merge_nodes) - create_bulk_load = create_python_task ("CreateBulkLoad", RogerUtil.create_bulk_load) - bulk_load = create_python_task ("BulkLoad", RogerUtil.bulk_load) - validate = create_python_task ("Validate", RogerUtil.validate) - finish = BashOperator (task_id='Finish', bash_command='echo finish') - - """ Build the DAG. 
""" - intro >> get_kgx >> [ create_schema, merge_nodes ] >> create_bulk_load >> \ - bulk_load >> validate >> finish -