From 547b0b072c1e80774d98d163a28cdbe685a883e4 Mon Sep 17 00:00:00 2001 From: April Shen Date: Wed, 17 Dec 2025 11:34:16 +0000 Subject: [PATCH 1/3] remove ability to pass files on command line --- eva_sub_cli/executables/cli.py | 22 ------ eva_sub_cli/orchestrator.py | 109 +++++++++++++--------------- eva_sub_cli/validators/validator.py | 3 +- tests/test_cli.py | 9 +-- tests/test_orchestrator.py | 109 ++++++++++------------------ 5 files changed, 91 insertions(+), 161 deletions(-) diff --git a/eva_sub_cli/executables/cli.py b/eva_sub_cli/executables/cli.py index 5d68ec0..88e43ee 100755 --- a/eva_sub_cli/executables/cli.py +++ b/eva_sub_cli/executables/cli.py @@ -26,20 +26,6 @@ def validate_command_line_arguments(args, argparser): fail = False - if (args.vcf_files and not args.reference_fasta) or (not args.vcf_files and args.reference_fasta): - print("When using --vcf_files and --reference_fasta, both need to be specified") - fail = True - - if args.vcf_files: - for vcf_file in args.vcf_files: - if not os.path.isfile(vcf_file): - print(f"VCF file {vcf_file} is not a file") - fail = True - - if args.reference_fasta: - if not os.path.isfile(args.reference_fasta): - print(f"Fasta file {args.reference_fasta} is not a file") - fail = True if args.metadata_xlsx: if not os.path.isfile(args.metadata_xlsx): @@ -77,14 +63,6 @@ def parse_args(cmd_line_args): argparser.add_argument('--version', action='version', version=f'%(prog)s {eva_sub_cli.__version__}') argparser.add_argument('--submission_dir', required=True, type=str, help='Path to the directory where all processing is done and submission info is stored') - vcf_group = argparser.add_argument_group( - 'Input VCF and assembly', - "Specify the VCF files and associated assembly with the following options. If you used different assemblies " - "for different VCF files, then you must include these in the metadata file rather than specifying them here." - ) - vcf_group.add_argument('--vcf_files', nargs='+', help="One or more VCF files to validate") - vcf_group.add_argument('--reference_fasta', - help="The FASTA file containing the reference genome from which the variants were derived") metadata_group = argparser.add_argument_group('Metadata', 'Specify the metadata in a spreadsheet or in a JSON file') metadata_group = metadata_group.add_mutually_exclusive_group(required=True) diff --git a/eva_sub_cli/orchestrator.py b/eva_sub_cli/orchestrator.py index 5cb71b1..e5f2526 100755 --- a/eva_sub_cli/orchestrator.py +++ b/eva_sub_cli/orchestrator.py @@ -77,15 +77,12 @@ def remove_non_vcf_files_from_metadata(metadata_json, metadata_xlsx): logger.warning(f"Some files mentioned in the metadata xlsx's ({metadata_xlsx}) Files sheet are not VCF files and have been removed.") -def get_project_title_and_create_vcf_files_mapping(submission_dir, vcf_files, reference_fasta, - metadata_json, metadata_xlsx, metadata_xlsx_version): +def get_project_title_and_create_vcf_files_mapping(submission_dir, metadata_json, metadata_xlsx, metadata_xlsx_version): """ - Get project title and mapping between VCF files and reference FASTA files, from three sources: command line - arguments, metadata JSON file, or metadata XLSX file. + Get project title and mapping between VCF files and reference FASTA files, from two sources: metadata JSON file + or metadata XLSX file. :param submission_dir: Directory where mapping file will be saved - :param vcf_files: VCF files from command line, if present - :param reference_fasta: Reference FASTA from command line, if present :param metadata_json: Metadata JSON from command line, if present :param metadata_xlsx: Metadata XLSX from command line, if present :param metadata_xlsx_version: Version of metadata XLSX @@ -97,17 +94,10 @@ def get_project_title_and_create_vcf_files_mapping(submission_dir, vcf_files, re writer.writerow(['vcf', 'fasta', 'report']) vcf_files_mapping = [] - if vcf_files and reference_fasta: - for vcf_file in vcf_files: - vcf_files_mapping.append([os.path.abspath(vcf_file), os.path.abspath(reference_fasta), '']) - if metadata_json: - project_title, _ = get_project_and_vcf_fasta_mapping_from_metadata_json(metadata_json, False) - elif metadata_xlsx: - project_title, _ = get_project_and_vcf_fasta_mapping_from_metadata_xlsx(metadata_xlsx, metadata_xlsx_version, False) - elif metadata_json: - project_title, vcf_files_mapping = get_project_and_vcf_fasta_mapping_from_metadata_json(metadata_json, True) + if metadata_json: + project_title, vcf_files_mapping = get_project_and_vcf_fasta_mapping_from_metadata_json(metadata_json) elif metadata_xlsx: - project_title, vcf_files_mapping = get_project_and_vcf_fasta_mapping_from_metadata_xlsx(metadata_xlsx, metadata_xlsx_version, True) + project_title, vcf_files_mapping = get_project_and_vcf_fasta_mapping_from_metadata_xlsx(metadata_xlsx, metadata_xlsx_version) # Filter out non-vcf files vcf_files_mapping = [(vcf, fasta, report) for vcf, fasta, report in vcf_files_mapping if is_vcf_file(vcf)] @@ -137,7 +127,7 @@ def validate_vcf_mapping(vcf_mapping): f'path.') -def get_project_and_vcf_fasta_mapping_from_metadata_json(metadata_json, mapping_req=False): +def get_project_and_vcf_fasta_mapping_from_metadata_json(metadata_json): metadata = EvaMetadataJson(metadata_json) project_title = metadata.project.get('title') @@ -147,19 +137,19 @@ def get_project_and_vcf_fasta_mapping_from_metadata_json(metadata_json, mapping_ project_title = get_project_title_from_ena(project_accession) vcf_fasta_report_mapping = [] - if mapping_req: - analysis_alias_dict = defaultdict(dict) - for analysis in metadata.analyses: - analysis_alias_dict[analysis['analysisAlias']]['referenceFasta'] = analysis['referenceFasta'] - analysis_alias_dict[analysis['analysisAlias']]['assemblyReport'] = analysis['assemblyReport'] \ - if 'assemblyReport' in analysis else '' - - for file_dict in metadata.resolved_files: - reference_fasta = analysis_alias_dict[file_dict['analysisAlias']]['referenceFasta'] - assembly_report = analysis_alias_dict[file_dict['analysisAlias']]['assemblyReport'] - vcf_fasta_report_mapping.append([os.path.abspath(file_dict['fileName']), - os.path.abspath(reference_fasta), - os.path.abspath(assembly_report) if assembly_report else '']) + + analysis_alias_dict = defaultdict(dict) + for analysis in metadata.analyses: + analysis_alias_dict[analysis['analysisAlias']]['referenceFasta'] = analysis['referenceFasta'] + analysis_alias_dict[analysis['analysisAlias']]['assemblyReport'] = analysis['assemblyReport'] \ + if 'assemblyReport' in analysis else '' + + for file_dict in metadata.resolved_files: + reference_fasta = analysis_alias_dict[file_dict['analysisAlias']]['referenceFasta'] + assembly_report = analysis_alias_dict[file_dict['analysisAlias']]['assemblyReport'] + vcf_fasta_report_mapping.append([os.path.abspath(file_dict['fileName']), + os.path.abspath(reference_fasta), + os.path.abspath(assembly_report) if assembly_report else '']) return project_title, vcf_fasta_report_mapping @@ -223,7 +213,7 @@ def verify_and_get_metadata_xlsx_version(metadata_xlsx, min_req_version): return xlsx_version -def get_project_and_vcf_fasta_mapping_from_metadata_xlsx(metadata_xlsx, metadata_xlsx_version, mapping_req=False): +def get_project_and_vcf_fasta_mapping_from_metadata_xlsx(metadata_xlsx, metadata_xlsx_version): workbook = load_workbook(metadata_xlsx) project_sheet = workbook['Project'] @@ -243,33 +233,32 @@ def get_project_and_vcf_fasta_mapping_from_metadata_xlsx(metadata_xlsx, metadata project_title = get_project_title_from_ena(project_accession) vcf_fasta_report_mapping = [] - if mapping_req: - analysis_alias_sheet = workbook['Analysis'] - analysis_headers = {} - for cell in analysis_alias_sheet[1]: - analysis_headers[cell.value] = cell.column - 1 - - analysis_alias_dict = {} - for row in analysis_alias_sheet.iter_rows(min_row=2, values_only=True): - analysis_alias = row[analysis_headers['Analysis Alias']] - reference_fasta = row[analysis_headers['Reference Fasta Path']] - analysis_alias_dict[analysis_alias] = reference_fasta - - files_sheet = workbook['Files'] - files_headers = {} - for cell in files_sheet[1]: - files_headers[cell.value] = cell.column - 1 - - for row in files_sheet.iter_rows(min_row=2, values_only=True): - file_name = row[files_headers['File Name']] - if file_name: - file_name = os.path.abspath(file_name) - analysis_alias = row[files_headers['Analysis Alias']] - reference_fasta = analysis_alias_dict[analysis_alias] - if reference_fasta: - reference_fasta = os.path.abspath(reference_fasta) - if file_name and reference_fasta: - vcf_fasta_report_mapping.append([file_name, reference_fasta, '']) + analysis_alias_sheet = workbook['Analysis'] + analysis_headers = {} + for cell in analysis_alias_sheet[1]: + analysis_headers[cell.value] = cell.column - 1 + + analysis_alias_dict = {} + for row in analysis_alias_sheet.iter_rows(min_row=2, values_only=True): + analysis_alias = row[analysis_headers['Analysis Alias']] + reference_fasta = row[analysis_headers['Reference Fasta Path']] + analysis_alias_dict[analysis_alias] = reference_fasta + + files_sheet = workbook['Files'] + files_headers = {} + for cell in files_sheet[1]: + files_headers[cell.value] = cell.column - 1 + + for row in files_sheet.iter_rows(min_row=2, values_only=True): + file_name = row[files_headers['File Name']] + if file_name: + file_name = os.path.abspath(file_name) + analysis_alias = row[files_headers['Analysis Alias']] + reference_fasta = analysis_alias_dict[analysis_alias] + if reference_fasta: + reference_fasta = os.path.abspath(reference_fasta) + if file_name and reference_fasta: + vcf_fasta_report_mapping.append([file_name, reference_fasta, '']) return project_title, vcf_fasta_report_mapping @@ -308,7 +297,7 @@ def check_validation_required(tasks, sub_config, username=None, password=None): return False -def orchestrate_process(submission_dir, vcf_files, reference_fasta, metadata_json, metadata_xlsx, +def orchestrate_process(submission_dir, metadata_json, metadata_xlsx, tasks, executor, validation_tasks=ALL_VALIDATION_TASKS, username=None, password=None, shallow_validation=False, nextflow_config=None, **kwargs): # load config @@ -332,7 +321,7 @@ def orchestrate_process(submission_dir, vcf_files, reference_fasta, metadata_jso # Get the provided Project Title and VCF files mapping (VCF, Fasta and Report) project_title, vcf_files_mapping = get_project_title_and_create_vcf_files_mapping( - submission_dir, vcf_files, reference_fasta, metadata_json, metadata_xlsx, metadata_xlsx_version + submission_dir, metadata_json, metadata_xlsx, metadata_xlsx_version ) vcf_files = get_vcf_files(vcf_files_mapping) diff --git a/eva_sub_cli/validators/validator.py b/eva_sub_cli/validators/validator.py index 8c62e4f..b68ecf4 100755 --- a/eva_sub_cli/validators/validator.py +++ b/eva_sub_cli/validators/validator.py @@ -578,8 +578,7 @@ def _collect_file_info_to_metadata(self): file_rows.append(file_dict) file_count += 1 else: - error_txt = ('No file section found in metadata and multiple analysis alias exist: ' - 'cannot infer the relationship between files and analysis alias') + error_txt = 'No file section found in metadata' self.error(error_txt) errors.append({'property': '/files', 'description': error_txt}) metadata.set_files(file_rows) diff --git a/tests/test_cli.py b/tests/test_cli.py index 028d5f3..674a7db 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -49,16 +49,10 @@ def test_main(self): all_lines[0].endswith('[eva_sub_cli.orchestrator][DEBUG] test\n') def test_validate_args(self): - vcf_file = os.path.join(self.submission_dir,'test.vcf') - fasta_file = os.path.join(self.submission_dir, 'test.fasta') json_file = os.path.join(self.submission_dir, 'test.json') - touch(vcf_file) - touch(fasta_file) touch(json_file) cmd_args = [ '--submission_dir', self.submission_dir, - '--vcf_files', vcf_file, - '--reference_fasta', fasta_file, '--metadata_json', json_file, '--tasks', 'validate', '--executor', 'native', @@ -67,10 +61,9 @@ def test_validate_args(self): args = cli.parse_args(cmd_args) assert args.submission_dir == self.submission_dir - with patch('sys.exit') as m_exit: cli.parse_args(cmd_args[:2]+cmd_args[4:]) - m_exit.assert_called_once_with(1) + m_exit.assert_called_once_with(2) def test_main_exception_handling(self): mock_response = Mock() diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py index fcccabe..ad3f612 100644 --- a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -1,7 +1,9 @@ import csv +import json import os import shutil import unittest +from copy import deepcopy from unittest.mock import patch, Mock, MagicMock from ebi_eva_common_pyutils.config import WritableConfig @@ -31,8 +33,8 @@ class TestOrchestrator(unittest.TestCase): config_file = os.path.join(test_sub_dir, SUB_CLI_CONFIG_FILE) mapping_file = os.path.join(test_sub_dir, 'vcf_mapping_file.csv') - vcf_files = [os.path.join(test_sub_dir, 'vcf_file1.vcf'), os.path.join(test_sub_dir, 'vcf_file2.vcf')] - reference_fasta = os.path.join(test_sub_dir, 'genome.fa') + # vcf_files = [os.path.join(test_sub_dir, 'vcf_file1.vcf'), os.path.join(test_sub_dir, 'vcf_file2.vcf')] + # reference_fasta = os.path.join(test_sub_dir, 'genome.fa') metadata_json = os.path.join(test_sub_dir, 'sub_metadata.json') metadata_xlsx = os.path.join(test_sub_dir, 'sub_metadata.xlsx') metadata_xlsx_version = '3.0.0' @@ -68,7 +70,6 @@ def tearDown(self) -> None: if os.path.exists(self.test_sub_dir): shutil.rmtree(self.test_sub_dir) - def test_remove_non_vcf_files_from_metadata_json(self): # assert non vcf files exist in metadata metadata = EvaMetadataJson(self.metadata_json_with_non_vcf_files) @@ -81,8 +82,6 @@ def test_remove_non_vcf_files_from_metadata_json(self): metadata = EvaMetadataJson(self.metadata_json_with_non_vcf_files) assert all(is_vcf_file(f['fileName']) for f in metadata.files) - - def test_remove_non_vcf_files_from_metadata_xlsx(self): # assert non vcf files exist in metadata workbook = load_workbook(self.metadata_xlsx_with_non_vcf_files) @@ -119,7 +118,7 @@ def test_check_validation_required(self): get_submission_status_mock.return_value = 'FAILED' self.assertTrue(check_validation_required(tasks, sub_config)) - # A FAILD submission status reset the submission ID and submission URL + # A FAILED submission status reset the submission ID and submission URL self.assertEqual(sub_config.get(SUB_CLI_CONFIG_KEY_SUBMISSION_ID), None) self.assertEqual(sub_config.get(SUB_CLI_CONFIG_KEY_SUBMISSION_UPLOAD_URL), None) @@ -143,15 +142,14 @@ def test_orchestrate_validate(self): 'eva_sub_cli.orchestrator.get_project_title_and_create_vcf_files_mapping') as m_get_project_title_and_create_vcf_files_mapping, \ patch('eva_sub_cli.orchestrator.DockerValidator') as m_docker_validator: m_get_project_title_and_create_vcf_files_mapping.return_value = self.project_title, self.mapping_file - orchestrate_process(self.test_sub_dir, None, None, self.metadata_json, - self.metadata_xlsx, tasks=[VALIDATE], executor=DOCKER) - m_get_project_title_and_create_vcf_files_mapping.assert_called_once_with(self.test_sub_dir, None, None, - self.metadata_json, + orchestrate_process(self.test_sub_dir, None, self.metadata_xlsx, tasks=[VALIDATE], executor=DOCKER) + m_get_project_title_and_create_vcf_files_mapping.assert_called_once_with(self.test_sub_dir, + None, self.metadata_xlsx, self.metadata_xlsx_version) m_get_vcf.assert_called_once_with(self.mapping_file) m_docker_validator.assert_any_call( - self.mapping_file, self.test_sub_dir, self.project_title, self.metadata_json, self.metadata_xlsx, self.metadata_xlsx_version, + self.mapping_file, self.test_sub_dir, self.project_title, None, self.metadata_xlsx, self.metadata_xlsx_version, validation_tasks=ALL_VALIDATION_TASKS, submission_config=m_config.return_value, shallow_validation=False ) m_docker_validator().validate_and_report.assert_called_once_with() @@ -169,12 +167,11 @@ def test_orchestrate_validate_submit(self): m_config.return_value = config m_get_project_title_and_create_vcf_files_mapping.return_value = self.project_title, self.mapping_file - orchestrate_process(self.test_sub_dir, None, None, self.metadata_json, - self.metadata_xlsx, tasks=[SUBMIT], executor=DOCKER) + orchestrate_process(self.test_sub_dir, None, self.metadata_xlsx, tasks=[SUBMIT], executor=DOCKER) m_get_vcf.assert_called_once_with(self.mapping_file) # Validate was run because the config show it was not run successfully before m_docker_validator.assert_any_call( - self.mapping_file, self.test_sub_dir, self.project_title, self.metadata_json, self.metadata_xlsx, + self.mapping_file, self.test_sub_dir, self.project_title, None, self.metadata_xlsx, self.metadata_xlsx_version, validation_tasks=ALL_VALIDATION_TASKS, submission_config=m_config.return_value, shallow_validation=False ) @@ -197,8 +194,7 @@ def test_orchestrate_submit_no_validate(self): m_config.return_value = {READY_FOR_SUBMISSION_TO_EVA: True} m_get_project_title_and_create_vcf_files_mapping.return_value = self.project_title, self.mapping_file - orchestrate_process(self.test_sub_dir, None, None, self.metadata_json, - self.metadata_xlsx, tasks=[SUBMIT], executor=DOCKER) + orchestrate_process(self.test_sub_dir, None, self.metadata_xlsx, tasks=[SUBMIT], executor=DOCKER) m_get_vcf.assert_called_once_with(self.mapping_file) # Validate was not run because the config showed it was run successfully before assert m_docker_validator.call_count == 0 @@ -209,30 +205,10 @@ def test_orchestrate_submit_no_validate(self): with m_submitter() as submitter: submitter.submit.assert_called_once_with() - def test_orchestrate_with_vcf_files(self): - with patch('eva_sub_cli.orchestrator.WritableConfig') as m_config, \ - patch('eva_sub_cli.orchestrator.DockerValidator') as m_docker_validator, \ - patch('eva_sub_cli.orchestrator.os.path.isfile'): - orchestrate_process(self.test_sub_dir, self.vcf_files, self.reference_fasta, self.metadata_json, - self.metadata_xlsx, tasks=[VALIDATE], executor=DOCKER) - # Mapping file was created from the vcf and assembly files - assert os.path.exists(self.mapping_file) - with open(self.mapping_file) as open_file: - reader = csv.DictReader(open_file, delimiter=',') - for row in reader: - assert 'vcf_file' in row['vcf'] - assert row['report'] == '' - m_docker_validator.assert_any_call( - self.mapping_file, self.test_sub_dir, self.project_title, self.metadata_json, self.metadata_xlsx, - self.metadata_xlsx_version, validation_tasks=ALL_VALIDATION_TASKS, - submission_config=m_config.return_value, shallow_validation=False - ) - m_docker_validator().validate_and_report.assert_called_once_with() - def test_orchestrate_with_metadata_json_without_asm_report(self): with patch('eva_sub_cli.orchestrator.WritableConfig') as m_config, \ patch('eva_sub_cli.orchestrator.DockerValidator') as m_docker_validator: - orchestrate_process(self.test_sub_dir, None, None, self.metadata_json, + orchestrate_process(self.test_sub_dir, self.metadata_json, None, tasks=[VALIDATE], executor=DOCKER) # Mapping file was created from the metadata_json assert os.path.exists(self.mapping_file) @@ -253,7 +229,7 @@ def test_orchestrate_with_metadata_json_with_asm_report(self): with patch('eva_sub_cli.orchestrator.WritableConfig') as m_config, \ patch('eva_sub_cli.orchestrator.DockerValidator') as m_docker_validator, \ patch('eva_sub_cli.orchestrator.os.path.isfile'): - orchestrate_process(self.test_sub_dir, None, None, self.metadata_json, None, + orchestrate_process(self.test_sub_dir, self.metadata_json, None, tasks=[VALIDATE], executor=DOCKER) # Mapping file was created from the metadata_json assert os.path.exists(self.mapping_file) @@ -276,7 +252,7 @@ def test_orchestrate_with_metadata_json_with_project_accession(self): patch('eva_sub_cli.orchestrator.os.path.isfile'), \ patch('eva_sub_cli.orchestrator.get_project_title_from_ena') as get_project_title_from_ena: get_project_title_from_ena.return_value = self.project_title - orchestrate_process(self.test_sub_dir, None, None, self.metadata_json, None, + orchestrate_process(self.test_sub_dir, self.metadata_json, None, tasks=[VALIDATE], executor=DOCKER) # Mapping file was created from the metadata_json assert os.path.exists(self.mapping_file) @@ -291,32 +267,20 @@ def test_orchestrate_with_metadata_json_with_project_accession(self): ) m_docker_validator().validate_and_report.assert_called_once_with() - def test_orchestrate_vcf_files_takes_precedence_over_metadata(self): - shutil.copy(os.path.join(self.resource_dir, 'EVA_Submission_test_with_asm_report.json'), self.metadata_json) - - with patch('eva_sub_cli.orchestrator.WritableConfig') as m_config, \ - patch('eva_sub_cli.orchestrator.DockerValidator') as m_docker_validator, \ - patch('eva_sub_cli.orchestrator.os.path.isfile'): - orchestrate_process(self.test_sub_dir, self.vcf_files, self.reference_fasta, self.metadata_json, - None, tasks=[VALIDATE], executor=DOCKER, resume=False) - # Mapping file was created from the metadata_json - assert os.path.exists(self.mapping_file) - with open(self.mapping_file) as open_file: - reader = csv.DictReader(open_file, delimiter=',') - for row in reader: - assert 'vcf_file' in row['vcf'] - assert row['report'] == '' - m_docker_validator.assert_any_call( - self.mapping_file, self.test_sub_dir, self.project_title, self.metadata_json, None, None, - validation_tasks=ALL_VALIDATION_TASKS, submission_config=m_config.return_value, shallow_validation=False - ) - m_docker_validator().validate_and_report.assert_called_once_with() - def test_orchestrate_non_vcf_files_filtered_out(self): + metadata_with_non_vcf_file = os.path.join(self.test_sub_dir, 'updated_metadata.json') + with open(self.metadata_json) as f: + metadata = json.load(f) + metadata['files'].append({ + 'analysisAlias': 'VD1', + 'fileName': 'test.vcf.gz.csi' + }) + with open(metadata_with_non_vcf_file, 'w') as f: + json.dump(metadata, f) with patch('eva_sub_cli.orchestrator.WritableConfig') as m_config, \ patch('eva_sub_cli.orchestrator.DockerValidator') as m_docker_validator, \ patch('eva_sub_cli.orchestrator.os.path.isfile'): - orchestrate_process(self.test_sub_dir, ['test.vcf.gz.csi'] + self.vcf_files, self.reference_fasta, self.metadata_json, + orchestrate_process(self.test_sub_dir, metadata_with_non_vcf_file, None, tasks=[VALIDATE], executor=DOCKER, resume=False) # Mapping file was created from the metadata_json assert os.path.exists(self.mapping_file) @@ -325,10 +289,10 @@ def test_orchestrate_non_vcf_files_filtered_out(self): vcf_files = set() for row in reader: vcf_files.add(row['vcf']) - assert len(vcf_files) == 2 + assert len(vcf_files) == 3 assert 'test.vcf.gz.csi' not in vcf_files m_docker_validator.assert_any_call( - self.mapping_file, self.test_sub_dir, self.project_title, self.metadata_json, None, None, + self.mapping_file, self.test_sub_dir, self.project_title, metadata_with_non_vcf_file, None, None, validation_tasks=ALL_VALIDATION_TASKS, submission_config=m_config.return_value, shallow_validation=False ) m_docker_validator().validate_and_report.assert_called_once_with() @@ -336,7 +300,7 @@ def test_orchestrate_non_vcf_files_filtered_out(self): def test_orchestrate_with_metadata_xlsx(self): with patch('eva_sub_cli.orchestrator.WritableConfig') as m_config, \ patch('eva_sub_cli.orchestrator.DockerValidator') as m_docker_validator: - orchestrate_process(self.test_sub_dir, None, None, None, self.metadata_xlsx, + orchestrate_process(self.test_sub_dir, None, self.metadata_xlsx, tasks=[VALIDATE], executor=DOCKER) # Mapping file was created from the metadata_xlsx assert os.path.exists(self.mapping_file) @@ -355,7 +319,7 @@ def test_orchestrate_with_metadata_xlsx(self): def test_orchestrate_with_metadata_xlsx_v2(self): with patch('eva_sub_cli.orchestrator.WritableConfig') as m_config, \ patch('eva_sub_cli.orchestrator.DockerValidator') as m_docker_validator: - orchestrate_process(self.test_sub_dir, None, None, None, self.metadata_xlsx_v2, + orchestrate_process(self.test_sub_dir, None, self.metadata_xlsx_v2, tasks=[VALIDATE], executor=DOCKER) # Mapping file was created from the metadata_xlsx assert os.path.exists(self.mapping_file) @@ -377,7 +341,7 @@ def test_orchestrate_with_metadata_xlsx_having_project_accession(self): patch('eva_sub_cli.orchestrator.get_project_title_from_ena') as get_project_title_from_ena: get_project_title_from_ena.return_value = self.project_title - orchestrate_process(self.test_sub_dir, None, None, None, self.metadata_xlsx_with_project_accession, + orchestrate_process(self.test_sub_dir, None, self.metadata_xlsx_with_project_accession, tasks=[VALIDATE], executor=DOCKER) # Mapping file was created from the metadata_xlsx assert os.path.exists(self.mapping_file) @@ -394,7 +358,7 @@ def test_orchestrate_with_metadata_xlsx_having_project_accession(self): def test_metadata_file_does_not_exist_error(self): with self.assertRaises(FileNotFoundError) as context: - orchestrate_process(self.test_sub_dir, None, None, None, 'Non_existing_metadata.xlsx', + orchestrate_process(self.test_sub_dir, None, 'Non_existing_metadata.xlsx', tasks=[VALIDATE], executor=DOCKER) self.assertRegex( str(context.exception), @@ -402,10 +366,17 @@ def test_metadata_file_does_not_exist_error(self): ) def test_fasta_file_compressed(self): + metadata_with_compressed_fasta = os.path.join(self.test_sub_dir, 'updated_metadata.json') + with open(self.metadata_json) as f: + metadata = json.load(f) + for analysis in metadata['analysis']: + analysis['referenceFasta'] = os.path.join(self.test_sub_dir, 'genome.fa.gz') + with open(metadata_with_compressed_fasta, 'w') as f: + json.dump(metadata, f) with patch('eva_sub_cli.orchestrator.os.path.isfile'): with self.assertRaises(InvalidFileTypeError): - orchestrate_process(self.test_sub_dir, self.vcf_files, self.reference_fasta + '.gz', self.metadata_json, - self.metadata_xlsx, tasks=[VALIDATE], executor=DOCKER) + orchestrate_process(self.test_sub_dir, metadata_with_compressed_fasta, + None, tasks=[VALIDATE], executor=DOCKER) def test_get_sub_cli_github_tags(self): with patch("requests.get") as mock_req: From 1c8c6c8e2c2a202acff1c13cfd10e1585f957086 Mon Sep 17 00:00:00 2001 From: April Shen Date: Wed, 17 Dec 2025 12:54:37 +0000 Subject: [PATCH 2/3] remove commented out code --- tests/test_orchestrator.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py index ad3f612..971c3ec 100644 --- a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -33,8 +33,6 @@ class TestOrchestrator(unittest.TestCase): config_file = os.path.join(test_sub_dir, SUB_CLI_CONFIG_FILE) mapping_file = os.path.join(test_sub_dir, 'vcf_mapping_file.csv') - # vcf_files = [os.path.join(test_sub_dir, 'vcf_file1.vcf'), os.path.join(test_sub_dir, 'vcf_file2.vcf')] - # reference_fasta = os.path.join(test_sub_dir, 'genome.fa') metadata_json = os.path.join(test_sub_dir, 'sub_metadata.json') metadata_xlsx = os.path.join(test_sub_dir, 'sub_metadata.xlsx') metadata_xlsx_version = '3.0.0' From 38692bbff8ba4180f9cfe89e23edde5749fb2b2c Mon Sep 17 00:00:00 2001 From: April Shen Date: Thu, 18 Dec 2025 14:10:19 +0000 Subject: [PATCH 3/3] remove unused import and update documentation --- docs/how_to_submit.md | 9 --------- tests/test_orchestrator.py | 1 - 2 files changed, 10 deletions(-) diff --git a/docs/how_to_submit.md b/docs/how_to_submit.md index a37062d..337043f 100644 --- a/docs/how_to_submit.md +++ b/docs/how_to_submit.md @@ -81,15 +81,6 @@ For example, to run only `vcf_check` and `sample_check`: eva-sub-cli.py --metadata_xlsx --submission_dir --validation_tasks vcf_check sample_check ``` -### VCF files and reference FASTA -These can be provided either in the metadata file directly, or on the command line using the `--vcf_files` and -`--reference_fasta` options. Note that if you are using more than one reference FASTA, you **cannot** use the command -line options; you must specify which VCF files use which FASTA files in the metadata. - -VCF files can be either uncompressed or compressed using bgzip. -Other types of compression are not allowed and will result in errors during validation. -FASTA files must be uncompressed. - ### Metadata JSON Frequent submitters may be interested in using our [metadata JSON schema](https://github.com/EBIvariation/eva-sub-cli/blob/main/eva_sub_cli/etc/eva_schema.json) instead of our spreadsheet template. The metadata requirements are the same regardless of which format you use, you will diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py index 971c3ec..749b4c4 100644 --- a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -3,7 +3,6 @@ import os import shutil import unittest -from copy import deepcopy from unittest.mock import patch, Mock, MagicMock from ebi_eva_common_pyutils.config import WritableConfig