Merged
31 changes: 11 additions & 20 deletions .env
@@ -1,23 +1,14 @@
AIRFLOW_UID=502
AIRFLOW_GID=0

API_WORKERS=4
API_PORT=5551
API_TIMEOUT=10
KGX_DATA_SETS="bdc-studies-kgx:v1.0"
INPUT_DATA_SETS="heal-mds-studies:v1.0"

DATA_DIR=./local_storage
LAKEFS_ACCESS_KEY=""
LAKEFS_SECRET_KEY=""
LAKEFS_REPO=""
LAKEFS_BRANCH=""
LAKEFS_URL="https://lakefs.apps.renci.org"

DUG_LOG_LEVEL=INFO

ELASTICSEARCH_PASSWORD=12345
ELASTICSEARCH_HOST=elasticsearch
ELASTICSEARCH_USERNAME=elastic

NBOOST_API_HOST=nboost

REDIS_PASSWORD=weak
REDIS_HOST=merge-redis-master
REDIS_PORT=6379
TRANQL_ACCESS_LOG=access.log
TRANQL_ERROR_LOG=error.log
ROGER_DUG__INPUTS_DATA__SETS=topmed:v1.0
BIOMEGATRON_URL="https://med-nemo.apps.renci.org/annotate"
SAPBERT_URL="https://sap-qdrant.apps.renci.org/annotate"
NODE_NORM_URL="https://nodenormalization-sri.renci.org/get_normalized_nodes?conflate=false&description=true&curie="
NAME_RES_URL="https://name-resolution-sri.renci.org/reverse_lookup"
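The new service endpoints above are plain environment variables. A minimal sketch (not part of the PR) of how a consumer might read one, assuming standard `os.environ` lookups and using the trailing `&curie=` in `NODE_NORM_URL` to append an identifier:

```python
import os

# Assumption: these values are read with plain environment lookups at startup.
node_norm_url = os.environ.get(
    "NODE_NORM_URL",
    "https://nodenormalization-sri.renci.org/get_normalized_nodes?conflate=false&description=true&curie=",
)

# The URL already ends in "...&curie=", so a CURIE is appended directly
# (hypothetical usage; MONDO:0005148 is just an example identifier).
request_url = node_norm_url + "MONDO:0005148"
print(request_url)
```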
2 changes: 1 addition & 1 deletion .github/workflows/trivy-pr-scan.yml
@@ -61,7 +61,7 @@ jobs:
      # We still fail the job if results are found, so below will always run
      # unless manually canceled.
      - name: Upload Trivy scan results to GitHub Security tab
        uses: github/codeql-action/upload-sarif@v2
        uses: github/codeql-action/upload-sarif@v3
        if: '!cancelled()'
        with:
          sarif_file: 'trivy-results.sarif'
103 changes: 72 additions & 31 deletions Dockerfile
@@ -1,33 +1,74 @@
FROM bitnami/airflow:2.10.5-debian-12-r7

USER root
RUN apt-get update && apt-get install -y git nano vim gcc rustc cargo
#RUN useradd -u 1001 -ms /bin/bash airflow && chown -R airflow /home/airflow
COPY requirements.txt requirements.txt
RUN source /opt/bitnami/airflow/venv/bin/activate && CARGO_HOME=/tmp/.cargo && \
    pip install setuptools wheel && \
    pip install -r requirements.txt

RUN rm -f requirements.txt

## Vul patches
## Python lib patches on airflow python env
RUN source /opt/bitnami/airflow/venv/bin/activate pip install --upgrade \
    flask-appbuilder==4.5.3 \
    cryptography==44.0.1 \
    werkzeug==3.0.6 \
    urllib3==2.2.2
RUN source /opt/bitnami/airflow/venv/bin/activate pip uninstall -y \
    apache-airflow-providers-mysql==6.2.0

# Uninstall these from non airflow python env
RUN pip install --upgrade \
    flask-appbuilder==4.5.3 \
    cryptography==44.0.1 \
    werkzeug==3.0.6 \
    urllib3==2.2.2
RUN apt-get autoremove -y vim
RUN apt-get autoremove -y binutils
RUN apt-get autoremove -y linux-libc-dev
# Use a Debian-based image for better compatibility
FROM python:3.11.14-slim-trixie

# Set Airflow version and home directory
ARG AIRFLOW_VERSION=3.1.1
ARG AIRFLOW_HOME=/opt/airflow

# Environment variables
ENV AIRFLOW_HOME=${AIRFLOW_HOME}
ENV AIRFLOW__CORE__LOAD_EXAMPLES=False
ENV AIRFLOW__CORE__EXECUTOR=LocalExecutor
ENV AIRFLOW__DATABASE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres:5432/airflow
ENV PYTHONUNBUFFERED=1

# Create airflow user and directories
RUN useradd --uid 50000 --home-dir ${AIRFLOW_HOME} --create-home airflow && \
    mkdir -p ${AIRFLOW_HOME}/dags ${AIRFLOW_HOME}/logs ${AIRFLOW_HOME}/plugins ${AIRFLOW_HOME}/config

# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    libpq-dev \
    libffi-dev \
    libssl-dev \
    curl \
    tini \
    tzdata \
    git \
    && rm -rf /var/lib/apt/lists/*

# Upgrade pip tools
RUN pip install --no-cache-dir --upgrade pip setuptools wheel

# Install Airflow (with PostgreSQL, Celery, Redis support)
RUN pip install --no-cache-dir \
    "apache-airflow[postgres,celery,redis,fab]==${AIRFLOW_VERSION}" \
    "apache-airflow-providers-cncf-kubernetes" \
    --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-${AIRFLOW_VERSION}/constraints-3.11.txt"

# Optional: install extra packages
RUN pip install --no-cache-dir psycopg2-binary redis

COPY ./requirements.txt /tmp/requirements.txt

RUN pip install -r /tmp/requirements.txt

RUN rm /tmp/requirements.txt



RUN apt-get purge -y --auto-remove \
    build-essential \
    libpq-dev \
    libffi-dev \
    libssl-dev \
    curl \
    git && \
    apt-get clean

# Set ownership
RUN chown -R airflow:airflow ${AIRFLOW_HOME}

# Switch to airflow user
USER airflow
WORKDIR ${AIRFLOW_HOME}

# Expose Airflow webserver port
EXPOSE 8080

# Use tini for signal handling
ENTRYPOINT ["/usr/bin/tini", "--"]

# Default command
CMD ["airflow", "webserver"]
9 changes: 6 additions & 3 deletions dags/annotate_and_index.py
@@ -26,7 +26,7 @@
"commitid_from": None,
"commitid_to": None
},
schedule_interval=None
# schedule_interval=None
) as dag:
init = EmptyOperator(task_id="init", dag=dag)
finish = EmptyOperator(task_id="finish", dag=dag)
@@ -65,7 +65,7 @@
"commitid_from": None,
"commitid_to": None
},
schedule_interval=None
# schedule_interval=None
) as dag:

init = EmptyOperator(task_id="init", dag=dag)
@@ -80,4 +80,7 @@ def print_context(ds=None, **kwargs):

    init >> create_python_task(dag, "get_from_lakefs", print_context) >> finish

#run_this = PythonOperator(task_id="print_the_context", python_callable=print_context)
#run_this = PythonOperator(task_id="print_the_context", python_callable=print_context)

if __name__ == "__main__":
    dag.test()
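The `dag.test()` guard added above is Airflow's built-in debugging hook (available since 2.5): running the module directly executes one DagRun in-process, with no scheduler. A self-contained sketch of the same pattern, with a hypothetical DAG id and the same imports the DAG file uses:

```python
from airflow import DAG
from airflow.operators.empty import EmptyOperator

with DAG(dag_id="debug_example", schedule=None) as dag:
    # A no-op task, just to give the test run something to execute.
    EmptyOperator(task_id="noop")

if __name__ == "__main__":
    # `python this_file.py` runs a single local DagRun in-process.
    dag.test()
```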
2 changes: 1 addition & 1 deletion dags/knowledge_graph_build.py
@@ -15,7 +15,7 @@
with DAG(
    dag_id='knowledge_graph_build',
    default_args=default_args,
    schedule_interval=None
    # schedule_interval=None
) as dag:

    """ Build the workflow tasks. """
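Commenting out `schedule_interval` here and in the DAGs above tracks its removal in Airflow 3, where the argument is `schedule`. A sketch of the current idiom; the migration target is an assumption, since the PR only comments the argument out:

```python
from airflow import DAG

# Airflow 3 idiom: `schedule_interval` is gone, and `schedule=None`
# means the DAG runs only when triggered manually.
with DAG(dag_id="example_manual_dag", schedule=None) as dag:
    ...
```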
8 changes: 4 additions & 4 deletions dags/roger/config/__init__.py
@@ -3,7 +3,7 @@
import warnings
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, Optional, List
from typing import Dict, Optional, List, Union

import yaml
from dug.config import Config as DugConfig
@@ -33,7 +33,7 @@ class LakefsConfig(DictLike):
    secret_access_key: str
    branch: str
    repo: str
    enabled: bool = False
    enabled: Union[bool, str] = False

    def __post_init__(self):
        if isinstance(self.enabled, str):
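The hunk is truncated after the `isinstance` check, so the coercion body is not shown. A hypothetical completion, consistent with the `Union[bool, str]` annotation (an assumption, not the PR's code):

```python
from dataclasses import dataclass
from typing import Union

@dataclass
class LakefsConfigSketch:
    enabled: Union[bool, str] = False

    def __post_init__(self):
        # Hypothetical body -- the diff cuts off after the isinstance check.
        if isinstance(self.enabled, str):
            # Env/YAML overrides arrive as strings; normalize to a real bool.
            self.enabled = self.enabled.lower() == "true"

assert LakefsConfigSketch(enabled="True").enabled is True
assert LakefsConfigSketch(enabled=False).enabled is False
```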
@@ -135,8 +135,8 @@ class AnnotationConfig(DictLike):
    ])

    def __post_init__(self):
        self.annotator_args["sapbert"]["bagel"]["enabled"] = self.annotator_args["sapbert"]["bagel"][
            "enabled"].lower() == "true"
        self.annotator_args["sapbert"]["bagel"]["enabled"] = str(self.annotator_args["sapbert"]["bagel"][
            "enabled"]).lower() == "true"


@dataclass
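The `str(...)` wrapper makes the bagel-flag coercion total: the old expression called `.lower()` directly, which raises `AttributeError` when the value is already a bool (as it would be if parsed from YAML, presumably the failure being fixed). A small sketch of the behavior:

```python
def to_bool(value) -> bool:
    # Mirrors the fixed expression: stringify first, then compare.
    return str(value).lower() == "true"

assert to_bool("True") is True    # string, e.g. from an env override
assert to_bool(True) is True      # bool from parsed YAML: str(True) == "True"
assert to_bool("false") is False
```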
3 changes: 3 additions & 0 deletions dags/roger/core/bulkload.py
@@ -53,6 +53,9 @@ def create_nodes_csv_file(self, input_data_path=None, output_data_path=None):
        merged_nodes_file = storage.merged_objects('nodes', input_data_path)
        counter = 1
        for node in storage.json_line_iter(merged_nodes_file):
            if node.get('description'):
                node['description'] = node['description'].replace('\n',
                                                                  ' ')
            if not node.get('category'):
                category_error_nodes.add(node['id'])
                node['category'] = [BiolinkModel.root_type]
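The new `description` cleanup keeps every node record on a single CSV line; a line-oriented bulk loader (presumably RedisGraph-style, given this module) would otherwise split a multi-line description across rows. A minimal sketch of the transformation:

```python
node = {"id": "X:1", "description": "first line\nsecond line"}

# Same transformation as the hunk above: collapse newlines to spaces.
if node.get("description"):
    node["description"] = node["description"].replace("\n", " ")

assert node["description"] == "first line second line"
```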