diff --git a/Dockerfile b/Dockerfile
index 7323534b..f500d484 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -46,7 +46,8 @@
 RUN pip install -r /tmp/requirements.txt
 RUN rm /tmp/requirements.txt
-
+COPY . /opt/roger
+RUN pip install -e /opt/roger
 
 RUN apt-get purge -y --auto-remove \
     build-essential \
@@ -57,6 +58,8 @@ RUN apt-get purge -y --auto-remove \
     git && \
     apt-get clean
 
+RUN if [ -n "$ROGER_SOURCE" ]; then pip install -e $ROGER_SOURCE; fi
+
 # Set ownership
 RUN chown -R airflow:airflow ${AIRFLOW_HOME}
diff --git a/cli.py b/cli.py
index be77525a..e3efadb7 100644
--- a/cli.py
+++ b/cli.py
@@ -1,7 +1,7 @@
 import roger.core.base as RogerUtil
 from roger.config import config
 from roger.logger import get_logger
-from dug_helpers.dug_utils import DugUtil, get_topmed_files, get_dbgap_files, get_sparc_files, get_anvil_files, get_nida_files
+from roger.dug_helpers.dug_utils import DugUtil, get_topmed_files, get_dbgap_files, get_sparc_files, get_anvil_files, get_nida_files
 import sys
 import argparse
 import os
diff --git a/dags/annotate_and_index.py b/dags/annotate_and_index.py
index e4cdfd98..f2abd6ff 100644
--- a/dags/annotate_and_index.py
+++ b/dags/annotate_and_index.py
@@ -9,8 +9,8 @@
 import os
 
 from airflow.models import DAG
-from airflow.operators.empty import EmptyOperator
-from airflow.operators.python import PythonOperator
+from airflow.providers.standard.operators.empty import EmptyOperator
+from airflow.providers.standard.operators.python import PythonOperator
 from roger.tasks import default_args, create_pipeline_taskgroup, logger, create_python_task
 
 env_enabled_datasets = os.getenv(
@@ -66,10 +66,10 @@
         "commitid_to": None
     },
     # schedule_interval=None
-) as dag:
+) as test_dag:
 
-    init = EmptyOperator(task_id="init", dag=dag)
-    finish = EmptyOperator(task_id="finish", dag=dag)
+    init = EmptyOperator(task_id="init", dag=test_dag)
+    finish = EmptyOperator(task_id="finish", dag=test_dag)
 
     def print_context(ds=None, **kwargs):
         print(">>>All kwargs")
@@ -78,9 +78,11 @@ def print_context(ds=None, **kwargs):
         print(ds)
 
-    init >> create_python_task(dag, "get_from_lakefs", print_context) >> finish
+    (init >>
+     create_python_task(test_dag, "get_from_lakefs", print_context) >>
+     finish)
 
     #run_this = PythonOperator(task_id="print_the_context", python_callable=print_context)
 
 if __name__ == "__main__":
-    dag.test()
\ No newline at end of file
+    dag.test()
diff --git a/dags/knowledge_graph_build.py b/dags/knowledge_graph_build.py
index d94d2262..9fb62a8e 100644
--- a/dags/knowledge_graph_build.py
+++ b/dags/knowledge_graph_build.py
@@ -6,7 +6,7 @@
 """
 
 from airflow.models import DAG
-from airflow.operators.empty import EmptyOperator
+from airflow.providers.standard.operators.empty import EmptyOperator
 import roger
 from roger.tasks import default_args, create_python_task
 from roger.config import config
diff --git a/docker-compose.yaml b/docker-compose.yaml
index fd81f5f5..dc8e0eba 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -1,4 +1,3 @@
-version: '3.8'
 x-airflow-common: &airflow-common
   build: .
@@ -36,6 +35,7 @@ x-airflow-common: &airflow-common
     - ./logs:/opt/airflow/logs
     - ./plugins:/opt/airflow/plugins
     - ./config:/opt/airflow/config
+    - .:/opt/roger
   depends_on:
     postgres:
       condition: service_healthy
@@ -164,4 +164,4 @@ volumes:
   redis-stack-data:
 networks:
   airflow-network:
-    driver: bridge
\ No newline at end of file
+    driver: bridge
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..374b58cb
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,6 @@
+[build-system]
+requires = [
+    "setuptools>=42",
+    "wheel"
+]
+build-backend = "setuptools.build_meta"
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 00000000..13e445ae
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,2 @@
+[pytest]
+pythonpath = dags
diff --git a/requirements.txt b/requirements.txt
index 44e6ffae..a221aa7a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,7 +5,7 @@ git+https://github.com/falkordb/falkordb-bulk-loader.git@v1.0.6
 setuptools>=66
 pytest
 PyYAML
-git+https://github.com/helxplatform/dug@v2.13.12
+git+https://github.com/helxplatform/dug@DugModel2.0
 orjson==3.9.15
 git+https://github.com/helxplatform/kg_utils.git@v0.0.10
 git+https://github.com/helxplatform/python-stringcase@1.2.1
@@ -15,6 +15,8 @@ h11>=0.16.0
 starlette>=0.49.1
 datetime
 aiohttp
+redis
+falkordb
 #--- patch
 werkzeug==3.0.6
-cryptography>=44.0.1
\ No newline at end of file
+cryptography>=44.0.1
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 00000000..653f7f60
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,40 @@
+[metadata]
+name = roger
+version = 0.10.4
+author = Renaissance Computing Institute + RTI
+description = Data pipeline automation for dug
+long_description = file: README.md
+long_description_content_type = text/markdown
+url = https://github.com/helxplatform/roger
+project_urls =
+    Bug Tracker = https://github.com/helxplatform/roger/issues
+classifiers =
+    Programming Language :: Python :: 3
+    License :: OSI Approved :: MIT License
+    Operating System :: OS Independent
+
+[options]
+package_dir =
+    = src
+packages = find:
+python_requires = >=3.10
+include_package_data = true
+install_requires =
+    dug
+    orjson
+    requests
+    requests_cache
+    redis
+
+[options.entry_points]
+console_scripts =
+    dug = dug.cli:main
+    roger = roger:cli
+
+[options.extras_require]
+rest =
+    jsonschema
+    airflow
+
+[options.packages.find]
+where = src
diff --git a/setup.py b/setup.py
new file mode 100644
index 00000000..45f160da
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,6 @@
+#!/usr/bin/env python
+
+import setuptools
+
+if __name__ == "__main__":
+    setuptools.setup()
\ No newline at end of file
diff --git a/dags/roger/__init__.py b/src/roger/__init__.py
similarity index 100%
rename from dags/roger/__init__.py
rename to src/roger/__init__.py
diff --git a/dags/_version.py b/src/roger/_version.py
similarity index 100%
rename from dags/_version.py
rename to src/roger/_version.py
diff --git a/dags/roger/components/__init__.py b/src/roger/components/__init__.py
similarity index 100%
rename from dags/roger/components/__init__.py
rename to src/roger/components/__init__.py
diff --git a/dags/roger/components/data_conversion.py b/src/roger/components/data_conversion.py
similarity index 100%
rename from dags/roger/components/data_conversion.py
rename to src/roger/components/data_conversion.py
diff --git a/dags/roger/components/data_conversion_utils.py b/src/roger/components/data_conversion_utils.py
similarity index 100%
rename from dags/roger/components/data_conversion_utils.py
rename to src/roger/components/data_conversion_utils.py
diff --git a/dags/roger/config/__init__.py b/src/roger/config/__init__.py
similarity index 97%
rename from dags/roger/config/__init__.py
rename to src/roger/config/__init__.py
index cc017fca..ce8a6af5 100644
--- a/dags/roger/config/__init__.py
+++ b/src/roger/config/__init__.py
@@ -144,6 +144,8 @@ class IndexingConfig(DictLike):
     variables_index: str = "variables_index"
     concepts_index: str = "concepts_index"
     kg_index: str = "kg_index"
+    studies_index: str = "studies_index"
+    sections_index: str = "sections_index"
     tranql_min_score: float = 0.2
     excluded_identifiers: List[str] = field(default_factory=lambda: [
         "CHEBI:17336"
@@ -228,6 +230,11 @@ def to_dug_conf(self) -> DugConfig:
             preprocessor=self.annotation.preprocessor,
             annotator_type=self.annotation.annotator_type,
             annotator_args=self.annotation.annotator_args,
+            concepts_index_name=self.indexing.get('concepts_index'),
+            variables_index_name=self.indexing.get('variables_index'),
+            studies_index_name=self.indexing.get('studies_index'),
+            sections_index_name=self.indexing.get('sections_index'),
+            kg_index_name=self.indexing.get('kg_index'),
             normalizer={
                 'url': self.annotation.normalizer,
             },
diff --git a/dags/roger/config/_base.py b/src/roger/config/_base.py
similarity index 100%
rename from dags/roger/config/_base.py
rename to src/roger/config/_base.py
diff --git a/dags/roger/config/config.yaml b/src/roger/config/config.yaml
similarity index 100%
rename from dags/roger/config/config.yaml
rename to src/roger/config/config.yaml
diff --git a/dags/roger/config/dev-config.yaml b/src/roger/config/dev-config.yaml
similarity index 100%
rename from dags/roger/config/dev-config.yaml
rename to src/roger/config/dev-config.yaml
diff --git a/dags/roger/config/s3_config.py b/src/roger/config/s3_config.py
similarity index 100%
rename from dags/roger/config/s3_config.py
rename to src/roger/config/s3_config.py
diff --git a/dags/roger/core/__init__.py b/src/roger/core/__init__.py
similarity index 100%
rename from dags/roger/core/__init__.py
rename to src/roger/core/__init__.py
diff --git a/dags/roger/core/base.py b/src/roger/core/base.py
similarity index 100%
rename from dags/roger/core/base.py
rename to src/roger/core/base.py
diff --git a/dags/roger/core/bulkload.py b/src/roger/core/bulkload.py
similarity index 100%
rename from dags/roger/core/bulkload.py
rename to src/roger/core/bulkload.py
diff --git a/dags/roger/core/enums.py b/src/roger/core/enums.py
similarity index 100%
rename from dags/roger/core/enums.py
rename to src/roger/core/enums.py
diff --git a/dags/roger/core/redis_graph.py b/src/roger/core/redis_graph.py
similarity index 97%
rename from dags/roger/core/redis_graph.py
rename to src/roger/core/redis_graph.py
index ca65ddce..5d89ef74 100644
--- a/dags/roger/core/redis_graph.py
+++ b/src/roger/core/redis_graph.py
@@ -3,8 +3,8 @@
 import redis
 # from redisgraph import Node, Edge, Graph
 # https://redis-py.readthedocs.io/en/v4.5.1/redismodules.html#redisgraph-commands
-from redis.commands.graph.node import Node
-from redis.commands.graph.edge import Edge
+from falkordb.node import Node
+from falkordb.edge import Edge
 
 from roger.logger import get_logger
 
@@ -88,4 +88,4 @@ def test ():
     rg.delete ()
 
 # rg.query ("""MATCH (a { id : 'chemical_substance' }) RETURN a""")
-#test ()
\ No newline at end of file
+#test ()
diff --git a/dags/roger/core/storage.py b/src/roger/core/storage.py
similarity index 100%
rename from dags/roger/core/storage.py
rename to src/roger/core/storage.py
diff --git a/dags/dug_helpers/__init__.py b/src/roger/dug_helpers/__init__.py
similarity index 100%
rename from dags/dug_helpers/__init__.py
rename to src/roger/dug_helpers/__init__.py
diff --git a/dags/dug_helpers/dug_utils.py b/src/roger/dug_helpers/dug_utils.py
similarity index 99%
rename from dags/dug_helpers/dug_utils.py
rename to src/roger/dug_helpers/dug_utils.py
index 52db9624..ac4b145d 100644
--- a/dags/dug_helpers/dug_utils.py
+++ b/src/roger/dug_helpers/dug_utils.py
@@ -24,7 +24,7 @@
 from roger.core import storage
 from roger.models.biolink import BiolinkModel
 from roger.logger import get_logger
-from utils.s3_utils import S3Utils
+from roger.utils.s3_utils import S3Utils
 
 log = get_logger()
 
diff --git a/dags/roger/logger.py b/src/roger/logger.py
similarity index 100%
rename from dags/roger/logger.py
rename to src/roger/logger.py
diff --git a/dags/roger/models/__init__.py b/src/roger/models/__init__.py
similarity index 100%
rename from dags/roger/models/__init__.py
rename to src/roger/models/__init__.py
diff --git a/dags/roger/models/biolink.py b/src/roger/models/biolink.py
similarity index 100%
rename from dags/roger/models/biolink.py
rename to src/roger/models/biolink.py
diff --git a/dags/roger/models/kgx.py b/src/roger/models/kgx.py
similarity index 100%
rename from dags/roger/models/kgx.py
rename to src/roger/models/kgx.py
diff --git a/dags/roger/pipelines/README.md b/src/roger/pipelines/README.md
similarity index 100%
rename from dags/roger/pipelines/README.md
rename to src/roger/pipelines/README.md
diff --git a/dags/roger/pipelines/__init__.py b/src/roger/pipelines/__init__.py
similarity index 100%
rename from dags/roger/pipelines/__init__.py
rename to src/roger/pipelines/__init__.py
diff --git a/dags/roger/pipelines/anvil.py b/src/roger/pipelines/anvil.py
similarity index 100%
rename from dags/roger/pipelines/anvil.py
rename to src/roger/pipelines/anvil.py
diff --git a/dags/roger/pipelines/bacpac.py b/src/roger/pipelines/bacpac.py
similarity index 100%
rename from dags/roger/pipelines/bacpac.py
rename to src/roger/pipelines/bacpac.py
diff --git a/dags/roger/pipelines/base.py b/src/roger/pipelines/base.py
similarity index 92%
rename from dags/roger/pipelines/base.py
rename to src/roger/pipelines/base.py
index 5c1df4cc..26a8c342 100644
--- a/dags/roger/pipelines/base.py
+++ b/src/roger/pipelines/base.py
@@ -29,7 +29,7 @@
 from roger.models.biolink import BiolinkModel
 from roger.logger import get_logger
 
-from utils.s3_utils import S3Utils
+from roger.utils.s3_utils import S3Utils
 
 log = get_logger()
 
@@ -115,7 +115,7 @@ def __init__(self, config: RogerConfig, to_string=False):
         "Set instance variables and check to make sure we're overriden"
         if not self.pipeline_name:
             raise PipelineException(
-                "Subclass must at least define pipeline_name as class var")
+                "Subclass must at least define pipeline_name as class var")
         self.config = config
         self.bl_toolkit = BiolinkModel()
         dug_conf = config.to_dug_conf()
@@ -142,17 +142,8 @@
         self.concepts_index = indexing_config.get('concepts_index')
         self.kg_index = indexing_config.get('kg_index')
 
-        self.search_obj: Search = self.factory.build_search_obj([
-            self.variables_index,
-            self.concepts_index,
-            self.kg_index,
-        ])
-        self.index_obj: Index = self.factory.build_indexer_obj([
-            self.variables_index,
-            self.concepts_index,
-            self.kg_index,
-
-        ])
+        self.search_obj: Search = self.factory.build_search_obj()
+        self.index_obj: Index = self.factory.build_indexer_obj()
 
     def __enter__(self):
         self.event_loop = asyncio.new_event_loop()
@@ -192,21 +183,21 @@ def get_parser_name(self):
         can also be overriden.
         """
         return getattr(self, 'parser_name', self.pipeline_name)
-    
+
     def get_annotator_name(self):
-        """
-        Access method for annotator_name
-        Defaults to annotator_monarch unless specified using annotation.annotator_type in the configuration file.
+        """ Access method for annotator_name
+
+        Defaults to annotator_monarch unless specified using
+        annotation.annotator_type in the configuration file.
         """
         return self.config.annotation.annotator_type
-
     def get_parser(self):
         dug_plugin_manager = get_plugin_manager()
         parser: Parser = get_parser(dug_plugin_manager.hook,
                                     self.get_parser_name())
         return parser
-    
+
     def get_annotator(self):
         dug_plugin_manager = get_plugin_manager()
         annotator: Annotator = get_annotator(
@@ -219,18 +210,20 @@ def get_annotator(self):
     def init_annotator(self, max_retries=5, base_delay=1, max_delay=10):
         attempt = 0
         while attempt < max_retries:
-            try: 
+            try:
                 log.info("Initializing annotator")
-                annotator = self.get_annotator() 
+                annotator = self.get_annotator()
                 return annotator # success
             except Exception as e:
                 attempt += 1
                 if attempt == max_retries:
-                    log.error("Max retries reached when creating annotator. Failing with error: %s", e)
+                    log.error("Max retries reached when creating annotator. "
+                              "Failing with error: %s", e)
                     raise
                 delay = min(base_delay * (2 ** (attempt - 1)), max_delay)
                 delay += random.uniform(0, 1) # add jitter
-                log.warning("Error occurred: %s. Retrying in %.2f seconds...", e, delay)
+                log.warning("Error occurred: %s. Retrying in %.2f seconds...",
+                            e, delay)
                 time.sleep(delay)
@@ -250,8 +243,8 @@ def annotate_files(self, parsable_files, output_data_path=None):
         annotator = self.init_annotator()
         log.info("Done intializing annotator")
         for _, parse_file in enumerate(parsable_files):
-            log.debug("Creating Dug Crawler object on parse_file %s at %d of %d",
-                      parse_file, _ , len(parsable_files))
+            log.debug("Creating Dug Crawler object on parse_file %s "
+                      "at %d of %d", parse_file, _ , len(parsable_files))
             crawler = Crawler(
                 crawl_file=parse_file,
                 parser=parser,
@@ -267,8 +260,8 @@
             elements_file_path = os.path.join(
                 output_data_path, current_file_name)
             elements_file = os.path.join(elements_file_path, 'elements.txt')
-            concepts_file = os.path.join(elements_file_path, 'concepts.txt') 
-            
+            concepts_file = os.path.join(elements_file_path, 'concepts.txt')
+
             # Use the specified parser to parse the parse_file into elements.
             log.debug("Parser is %s", str(parser))
             elements = parser(parse_file)
@@ -292,12 +285,15 @@
             elements = crawler.elements
 
             # Write pickles of objects to file
-            log.info("Parsed and annotated: %s", parse_file) 
-            
-            storage.write_object(jsonpickle.encode(elements, indent=2), elements_file)
+            log.info("Parsed and annotated: %s", parse_file)
+
+            storage.write_object(jsonpickle.encode(elements, indent=2),
+                                 elements_file)
             log.info("Serialized annotated elements to : %s", elements_file)
-            storage.write_object(jsonpickle.encode(non_expanded_concepts, indent=2), concepts_file)
+            storage.write_object(
+                jsonpickle.encode(non_expanded_concepts, indent=2),
+                concepts_file)
             log.info("Serialized annotated concepts to : %s", concepts_file)
 
     def convert_to_kgx_json(self, elements, written_nodes=None):
@@ -428,7 +424,7 @@ def make_tagged_kg(self, elements):
         # @TODO extract this into config or maybe dug ??
         topmed_tag_concept_type = "TOPMed Phenotype Concept"
         nodes_written = set()
-        for tag in elements: 
+        for tag in elements:
             if not (isinstance(tag, DugConcept)
                     and tag.type == topmed_tag_concept_type):
                 continue
@@ -500,7 +496,8 @@ def index_elements(self, elements_file):
 
     def validate_indexed_element_file(self, elements_file):
         "After submitting elements for indexing, verify that they're available"
-        elements = [x for x in jsonpickle.decode(storage.read_object(elements_file))
+        elements = [x for x in jsonpickle.decode(
+            storage.read_object(elements_file))
                     if not isinstance(x, DugConcept)]
         # Pick ~ 10 %
         sample_size = int(len(elements) * 0.1)
@@ -540,14 +537,16 @@
     def _search_elements(self, curie, search_term):
         "Asynchronously call a search on the curie and search term"
-        response = self.event_loop.run_until_complete(self.search_obj.search_vars_unscored(
-            concept=curie,
-            query=search_term
+        response = self.event_loop.run_until_complete(
+            self.search_obj.search_vars_unscored(
+                concept=curie,
+                query=search_term
         ))
         ids_dict = []
         if 'total_items' in response:
             if response['total_items'] == 0:
-                log.error(f"No search elements returned for variable search: {self.variables_index}.")
+                log.error(f"No search elements returned for variable search: "
+                          f"{self.variables_index}.")
                 log.error(f"Concept id : {curie}, Search term: {search_term}")
                 raise Exception(f"Validation error - Did not find {curie} for"
                                 f"Search term: {search_term}")
@@ -555,7 +554,8 @@
             del response['total_items']
         for element_type in response:
             all_elements_ids = [e['id'] for e in
-                                reduce(lambda x, y: x + y['elements'], response[element_type], [])]
+                                reduce(lambda x, y: x + y['elements'],
+                                       response[element_type], [])]
         ids_dict += all_elements_ids
         return ids_dict
@@ -567,13 +567,14 @@ def crawl_concepts(self, concepts, data_set_name, output_path=None):
         :param data_set_name:
         :return:
         """
-        # TODO crawl dir seems to be storaing crawling info to avoid re-crawling, but is that consting us much? , it was when tranql was slow, but
-        # might right to consider getting rid of it.
+        # TODO crawl dir seems to be storaing crawling info to avoid
+        # re-crawling, but is that consting us much? , it was when tranql was
+        # slow, but might right to consider getting rid of it.
         crawl_dir = storage.dug_crawl_path('crawl_output')
         output_file_name = os.path.join(data_set_name, 'expanded_concepts.txt')
-        extracted_dug_elements_file_name = os.path.join(data_set_name,
-                                                        'extracted_graph_elements.txt')
+        extracted_dug_elements_file_name = os.path.join(
+            data_set_name, 'extracted_graph_elements.txt')
         if not output_path:
             output_file = storage.dug_expanded_concepts_path(output_file_name)
             extracted_output_file = storage.dug_expanded_concepts_path(
@@ -581,8 +582,9 @@
             )
         else:
             output_file = os.path.join(output_path, output_file_name)
-            extracted_output_file = os.path.join( output_path, extracted_dug_elements_file_name)
-        
+            extracted_output_file = os.path.join(
+                output_path, extracted_dug_elements_file_name)
+
         Path(crawl_dir).mkdir(parents=True, exist_ok=True)
         extracted_dug_elements = []
         log.debug("Creating Dug Crawler object")
@@ -622,9 +624,11 @@ def crawl_concepts(self, concepts, data_set_name, output_path=None):
             if percent_complete % 10 == 0:
                 log.info("%d%%", percent_complete)
         log.info("Crawling %s done", data_set_name)
-        storage.write_object(obj=jsonpickle.encode(concepts, indent=2), path=output_file)
+        storage.write_object(obj=jsonpickle.encode(concepts, indent=2),
+                             path=output_file)
         log.info ("Concepts serialized to %s", output_file)
-        storage.write_object(obj=jsonpickle.encode(extracted_dug_elements, indent=2),
+        storage.write_object(obj=jsonpickle.encode(extracted_dug_elements,
+                                                   indent=2),
                              path=extracted_output_file)
         log.info("Extracted elements serialized to %s", extracted_output_file)
@@ -732,7 +736,8 @@ def _validate_indexed_concepts(self, elements, concepts):
 
     def clear_index(self, index_id):
         "Delete the index specified by index_id from ES"
-        exists = self.event_loop.run_until_complete(self.search_obj.es.indices.exists(index=index_id))
+        exists = self.event_loop.run_until_complete(
+            self.search_obj.es.indices.exists(index=index_id))
         if exists:
             log.info("Deleting index %s", str(index_id))
             response = self.event_loop.run_until_complete(
@@ -840,7 +845,8 @@ def annotate(self, to_string=False, files=None, input_data_path=None,
         "Annotate files with the appropriate parsers and crawlers"
         if files is None:
             files = self.get_objects(input_data_path=input_data_path)
-        self.annotate_files(parsable_files=files, output_data_path=output_data_path)
+        self.annotate_files(parsable_files=files,
+                            output_data_path=output_data_path)
         output_log = self.log_stream.getvalue() if to_string else ''
         return output_log
@@ -855,8 +861,9 @@ def index_variables(self, to_string=False, element_object_files=None,
         """
         # self.clear_variables_index()
         if element_object_files is None:
-            element_object_files = storage.dug_elements_objects(input_data_path,format='txt')
-        for file_ in element_object_files: 
+            element_object_files = storage.dug_elements_objects(
+                input_data_path,format='txt')
+        for file_ in element_object_files:
             self.index_elements(file_)
         output_log = self.log_stream.getvalue() if to_string else ''
         return output_log
@@ -867,30 +874,40 @@ def validate_indexed_variables(self, to_string=None,
                                    output_data_path=None):
         "Validate output from index variables task for pipeline"
         if not element_object_files:
-            element_object_files = storage.dug_elements_objects(input_data_path, format='txt')
+            element_object_files = storage.dug_elements_objects(
+                input_data_path, format='txt')
         for file_ in element_object_files:
             log.info("Validating %s", str(file_))
             self.validate_indexed_element_file(file_)
         output_log = self.log_stream.getvalue() if to_string else ''
         return output_log
 
-    def validate_indexed_concepts(self, config=None, to_string=None, input_data_path=None, output_data_path=None):
+    def validate_indexed_concepts(self, config=None, to_string=None,
+                                  input_data_path=None, output_data_path=None):
         """
         Entry for validate concepts
         """
-        get_data_set_name = lambda file: os.path.split(os.path.dirname(file))[-1]
+        get_data_set_name = lambda file: (
+            os.path.split(os.path.dirname(file))[-1])
         expanded_concepts_files_dict = {
-            get_data_set_name(file): file for file in storage.dug_expanded_concept_objects(data_path=input_data_path, format='txt')
+            get_data_set_name(file): file for file in
+            storage.dug_expanded_concept_objects(data_path=input_data_path,
+                                                 format='txt')
         }
         annotated_elements_files_dict = {
-            get_data_set_name(file): file for file in storage.dug_elements_objects(data_path=input_data_path, format='txt')
+            get_data_set_name(file): file for file in
+            storage.dug_elements_objects(data_path=input_data_path,
+                                         format='txt')
         }
-        try: 
-            assert len(expanded_concepts_files_dict) == len(annotated_elements_files_dict)
+        try:
+            assert (len(expanded_concepts_files_dict) ==
+                    len(annotated_elements_files_dict))
         except:
-            log.error("Files Annotated Elements files and Expanded concepts files, should be pairs")
+            log.error("Files Annotated Elements files and "
+                      "expanded concepts files, should be pairs")
             if len(expanded_concepts_files_dict) > len(annotated_elements_files_dict):
-                log.error("Some Annotated Elements files (from load_and_annotate task) are missing")
+                log.error("Some Annotated Elements files "
+                          "(from load_and_annotate task) are missing")
             else:
                 log.error("Some Expanded Concepts files (from crawl task) are missing")
             log.error(f"Annotated Datasets : {list(annotated_elements_files_dict.keys())}")
@@ -936,9 +953,10 @@ def make_kg_tagged(self, to_string=False, elements_files=None,
     def crawl_tranql(self, to_string=False, concept_files=None,
                      input_data_path=None, output_data_path=None):
-        "Perform the tranql crawl" 
+        "Perform the tranql crawl"
         if not concept_files:
-            concept_files = storage.dug_concepts_objects(input_data_path, format='txt')
+            concept_files = storage.dug_concepts_objects(
+                input_data_path, format='txt')
 
         if output_data_path:
             crawl_dir = os.path.join(output_data_path, 'crawl_output')
@@ -956,15 +974,16 @@
         log.info("Crawling Dug Concepts, found %d file(s).", len(concept_files))
         for file_ in concept_files:
-            objects = storage.read_object(file_) 
-            objects = objects or {} 
+            objects = storage.read_object(file_)
+            objects = objects or {}
             if not objects:
                 log.info(f'no concepts in {file_}')
             data_set = jsonpickle.decode(objects)
             original_variables_dataset_name = os.path.split(
                 os.path.dirname(file_))[-1]
             self.crawl_concepts(concepts=data_set,
-                                data_set_name=original_variables_dataset_name, output_path= output_data_path)
+                                data_set_name=original_variables_dataset_name,
+                                output_path= output_data_path)
         output_log = self.log_stream.getvalue() if to_string else ''
         return output_log
@@ -984,7 +1003,8 @@ def index_concepts(self, to_string=False,
         if self.config.indexing.node_to_element_queries:
             log.info("*******************")
-            extracted_elements_files = storage.dug_extracted_elements_objects(data_path=input_data_path)
+            extracted_elements_files = storage.dug_extracted_elements_objects(
+                data_path=input_data_path)
             log.info(f"{extracted_elements_files}")
             for file_ in extracted_elements_files:
                 log.info(f"reading file {file_}")
diff --git a/dags/roger/pipelines/bdc.py b/src/roger/pipelines/bdc.py
similarity index 100%
rename from dags/roger/pipelines/bdc.py
rename to src/roger/pipelines/bdc.py
diff --git a/dags/roger/pipelines/bdc_pipelines.py b/src/roger/pipelines/bdc_pipelines.py
similarity index 100%
rename from dags/roger/pipelines/bdc_pipelines.py
rename to src/roger/pipelines/bdc_pipelines.py
diff --git a/dags/roger/pipelines/crdc.py b/src/roger/pipelines/crdc.py
similarity index 100%
rename from dags/roger/pipelines/crdc.py
rename to src/roger/pipelines/crdc.py
diff --git a/dags/roger/pipelines/ctn.py b/src/roger/pipelines/ctn.py
similarity index 100%
rename from dags/roger/pipelines/ctn.py
rename to src/roger/pipelines/ctn.py
diff --git a/dags/roger/pipelines/db_gap.py b/src/roger/pipelines/db_gap.py
similarity index 100%
rename from dags/roger/pipelines/db_gap.py
rename to src/roger/pipelines/db_gap.py
diff --git a/dags/roger/pipelines/heal_research_programs.py b/src/roger/pipelines/heal_research_programs.py
similarity index 100%
rename from dags/roger/pipelines/heal_research_programs.py
rename to src/roger/pipelines/heal_research_programs.py
diff --git a/dags/roger/pipelines/heal_studies.py b/src/roger/pipelines/heal_studies.py
similarity index 73%
rename from dags/roger/pipelines/heal_studies.py
rename to src/roger/pipelines/heal_studies.py
index a08e8115..eff0c927 100644
--- a/dags/roger/pipelines/heal_studies.py
+++ b/src/roger/pipelines/heal_studies.py
@@ -11,6 +11,7 @@ class HealStudiesPipeline(DugPipeline):
     def get_objects(self, input_data_path=None):
         if not input_data_path:
             input_data_path = storage.dug_heal_study_path()
-        files = storage.get_files_recursive(lambda file_name: file_name.endswith('.xml'),
-                                            input_data_path)
+        files = storage.get_files_recursive(
+            lambda file_name: file_name.endswith('.xml'),
+            input_data_path)
         return sorted([str(f) for f in files])
diff --git a/dags/roger/pipelines/kfdrc.py b/src/roger/pipelines/kfdrc.py
similarity index 100%
rename from dags/roger/pipelines/kfdrc.py
rename to src/roger/pipelines/kfdrc.py
diff --git a/dags/roger/pipelines/nida.py b/src/roger/pipelines/nida.py
similarity index 100%
rename from dags/roger/pipelines/nida.py
rename to src/roger/pipelines/nida.py
diff --git a/dags/roger/pipelines/picsure_test.py b/src/roger/pipelines/picsure_test.py
similarity index 100%
rename from dags/roger/pipelines/picsure_test.py
rename to src/roger/pipelines/picsure_test.py
diff --git a/dags/roger/pipelines/radx.py b/src/roger/pipelines/radx.py
similarity index 100%
rename from dags/roger/pipelines/radx.py
rename to src/roger/pipelines/radx.py
diff --git a/dags/roger/pipelines/sparc.py b/src/roger/pipelines/sparc.py
similarity index 100%
rename from dags/roger/pipelines/sparc.py
rename to src/roger/pipelines/sparc.py
diff --git a/dags/roger/pipelines/topmed.py b/src/roger/pipelines/topmed.py
similarity index 100%
rename from dags/roger/pipelines/topmed.py
rename to src/roger/pipelines/topmed.py
diff --git a/dags/roger/pvc.yaml b/src/roger/pvc.yaml
similarity index 100%
rename from dags/roger/pvc.yaml
rename to src/roger/pvc.yaml
diff --git a/dags/roger/tasks.py b/src/roger/tasks.py
similarity index 97%
rename from dags/roger/tasks.py
rename to src/roger/tasks.py
index cd828383..bcca2e77 100755
--- a/dags/roger/tasks.py
+++ b/src/roger/tasks.py
@@ -10,8 +10,8 @@
 # Airflow 3.x - prefer provider imports and new public types
 from airflow.providers.standard.operators.python import PythonOperator
-from airflow.operators.empty import EmptyOperator
-from airflow.utils.task_group import TaskGroup
+from airflow.providers.standard.operators.empty import EmptyOperator
+from airflow.sdk import TaskGroup
 from airflow.models import DAG
 from airflow.models.taskinstance import TaskInstance
 from airflow.providers.standard.operators.bash import BashOperator
@@ -219,7 +219,7 @@ def generate_dir_name_from_task_instance(task_instance: TaskInstance,
     # local dir structure.
     if not roger_config.lakefs_config.enabled:
         return None
-    root_data_dir = os.getenv("ROGER_DATA_DIR").rstrip('/')
+    root_data_dir = os.getenv("ROGER_DATA_DIR", "/tmp/roger/data").rstrip('/')
     task_id = task_instance.task_id
     dag_id = task_instance.dag_id
     run_id = task_instance.run_id
@@ -295,7 +295,9 @@ def setup_input_data(context: Context, exec_conf):
     logger.info(">>> end of downloading data")
 
-def create_python_task(dag, name, a_callable, func_kwargs=None, external_repos=None, pass_conf=True, no_output_files=False):
+def create_python_task(dag, name, a_callable, func_kwargs=None,
+                       external_repos=None, pass_conf=True,
+                       no_output_files=False):
     """ Create a python task.
     :param func_kwargs: additional arguments for callable.
     :param dag: dag to add task to.
@@ -349,7 +351,6 @@ def create_python_task(dag, name, a_callable, func_kwargs=None, external_repos=N
     return PythonOperator(**python_operator_args)
 
-
 def create_pipeline_taskgroup(
         dag,
         pipeline_class: type,
diff --git a/dags/utils/__init__.py b/src/roger/utils/__init__.py
similarity index 100%
rename from dags/utils/__init__.py
rename to src/roger/utils/__init__.py
diff --git a/dags/utils/s3_utils.py b/src/roger/utils/s3_utils.py
similarity index 100%
rename from dags/utils/s3_utils.py
rename to src/roger/utils/s3_utils.py
diff --git a/tests/conftest.py b/tests/conftest.py
index e69de29b..bdc954cb 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -0,0 +1 @@
+pythonpath = "dags"
diff --git a/tests/integration/test_dug_utils.py b/tests/integration/test_dug_utils.py
index 4e31f820..36174a5a 100644
--- a/tests/integration/test_dug_utils.py
+++ b/tests/integration/test_dug_utils.py
@@ -4,7 +4,8 @@
 
 import pytest
 
-from dug_helpers.dug_utils import FileFetcher, get_topmed_files, get_dbgap_files
+from roger.dug_helpers.dug_utils import (FileFetcher, get_topmed_files,
+                                         get_dbgap_files)
 
 from roger.config import config
 
@@ -59,4 +60,4 @@ def test_get_topmed_files():
 def test_get_dbgap_files():
     file_names = get_dbgap_files(config=config)
     for file_name in file_names:
-        assert Path(file_name).exists()
\ No newline at end of file
+        assert Path(file_name).exists()