Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 0 additions & 9 deletions docs/how_to_submit.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,15 +81,6 @@ For example, to run only `vcf_check` and `sample_check`:
eva-sub-cli.py --metadata_xlsx <metadata file> --submission_dir <submission directory> --validation_tasks vcf_check sample_check
```

### VCF files and reference FASTA
These can be provided either in the metadata file directly, or on the command line using the `--vcf_files` and
`--reference_fasta` options. Note that if you are using more than one reference FASTA, you **cannot** use the command
line options; you must specify which VCF files use which FASTA files in the metadata.

VCF files can be either uncompressed or compressed using bgzip.
Other types of compression are not allowed and will result in errors during validation.
FASTA files must be uncompressed.

### Metadata JSON
Frequent submitters may be interested in using our [metadata JSON schema](https://github.com/EBIvariation/eva-sub-cli/blob/main/eva_sub_cli/etc/eva_schema.json)
instead of our spreadsheet template. The metadata requirements are the same regardless of which format you use; you will
Expand Down
22 changes: 0 additions & 22 deletions eva_sub_cli/executables/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,20 +26,6 @@

def validate_command_line_arguments(args, argparser):
fail = False
if (args.vcf_files and not args.reference_fasta) or (not args.vcf_files and args.reference_fasta):
print("When using --vcf_files and --reference_fasta, both need to be specified")
fail = True

if args.vcf_files:
for vcf_file in args.vcf_files:
if not os.path.isfile(vcf_file):
print(f"VCF file {vcf_file} is not a file")
fail = True

if args.reference_fasta:
if not os.path.isfile(args.reference_fasta):
print(f"Fasta file {args.reference_fasta} is not a file")
fail = True

if args.metadata_xlsx:
if not os.path.isfile(args.metadata_xlsx):
Expand Down Expand Up @@ -77,14 +63,6 @@ def parse_args(cmd_line_args):
argparser.add_argument('--version', action='version', version=f'%(prog)s {eva_sub_cli.__version__}')
argparser.add_argument('--submission_dir', required=True, type=str,
help='Path to the directory where all processing is done and submission info is stored')
vcf_group = argparser.add_argument_group(
'Input VCF and assembly',
"Specify the VCF files and associated assembly with the following options. If you used different assemblies "
"for different VCF files, then you must include these in the metadata file rather than specifying them here."
)
vcf_group.add_argument('--vcf_files', nargs='+', help="One or more VCF files to validate")
vcf_group.add_argument('--reference_fasta',
help="The FASTA file containing the reference genome from which the variants were derived")

metadata_group = argparser.add_argument_group('Metadata', 'Specify the metadata in a spreadsheet or in a JSON file')
metadata_group = metadata_group.add_mutually_exclusive_group(required=True)
Expand Down
109 changes: 49 additions & 60 deletions eva_sub_cli/orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,15 +77,12 @@ def remove_non_vcf_files_from_metadata(metadata_json, metadata_xlsx):
logger.warning(f"Some files mentioned in the metadata xlsx's ({metadata_xlsx}) Files sheet are not VCF files and have been removed.")


def get_project_title_and_create_vcf_files_mapping(submission_dir, vcf_files, reference_fasta,
metadata_json, metadata_xlsx, metadata_xlsx_version):
def get_project_title_and_create_vcf_files_mapping(submission_dir, metadata_json, metadata_xlsx, metadata_xlsx_version):
"""
Get project title and mapping between VCF files and reference FASTA files, from three sources: command line
arguments, metadata JSON file, or metadata XLSX file.
Get project title and mapping between VCF files and reference FASTA files, from two sources: metadata JSON file
or metadata XLSX file.

:param submission_dir: Directory where mapping file will be saved
:param vcf_files: VCF files from command line, if present
:param reference_fasta: Reference FASTA from command line, if present
:param metadata_json: Metadata JSON from command line, if present
:param metadata_xlsx: Metadata XLSX from command line, if present
:param metadata_xlsx_version: Version of metadata XLSX
Expand All @@ -97,17 +94,10 @@ def get_project_title_and_create_vcf_files_mapping(submission_dir, vcf_files, re
writer.writerow(['vcf', 'fasta', 'report'])

vcf_files_mapping = []
if vcf_files and reference_fasta:
for vcf_file in vcf_files:
vcf_files_mapping.append([os.path.abspath(vcf_file), os.path.abspath(reference_fasta), ''])
if metadata_json:
project_title, _ = get_project_and_vcf_fasta_mapping_from_metadata_json(metadata_json, False)
elif metadata_xlsx:
project_title, _ = get_project_and_vcf_fasta_mapping_from_metadata_xlsx(metadata_xlsx, metadata_xlsx_version, False)
elif metadata_json:
project_title, vcf_files_mapping = get_project_and_vcf_fasta_mapping_from_metadata_json(metadata_json, True)
if metadata_json:
project_title, vcf_files_mapping = get_project_and_vcf_fasta_mapping_from_metadata_json(metadata_json)
elif metadata_xlsx:
project_title, vcf_files_mapping = get_project_and_vcf_fasta_mapping_from_metadata_xlsx(metadata_xlsx, metadata_xlsx_version, True)
project_title, vcf_files_mapping = get_project_and_vcf_fasta_mapping_from_metadata_xlsx(metadata_xlsx, metadata_xlsx_version)

# Filter out non-vcf files
vcf_files_mapping = [(vcf, fasta, report) for vcf, fasta, report in vcf_files_mapping if is_vcf_file(vcf)]
Expand Down Expand Up @@ -137,7 +127,7 @@ def validate_vcf_mapping(vcf_mapping):
f'path.')


def get_project_and_vcf_fasta_mapping_from_metadata_json(metadata_json, mapping_req=False):
def get_project_and_vcf_fasta_mapping_from_metadata_json(metadata_json):
metadata = EvaMetadataJson(metadata_json)

project_title = metadata.project.get('title')
Expand All @@ -147,19 +137,19 @@ def get_project_and_vcf_fasta_mapping_from_metadata_json(metadata_json, mapping_
project_title = get_project_title_from_ena(project_accession)

vcf_fasta_report_mapping = []
if mapping_req:
analysis_alias_dict = defaultdict(dict)
for analysis in metadata.analyses:
analysis_alias_dict[analysis['analysisAlias']]['referenceFasta'] = analysis['referenceFasta']
analysis_alias_dict[analysis['analysisAlias']]['assemblyReport'] = analysis['assemblyReport'] \
if 'assemblyReport' in analysis else ''

for file_dict in metadata.resolved_files:
reference_fasta = analysis_alias_dict[file_dict['analysisAlias']]['referenceFasta']
assembly_report = analysis_alias_dict[file_dict['analysisAlias']]['assemblyReport']
vcf_fasta_report_mapping.append([os.path.abspath(file_dict['fileName']),
os.path.abspath(reference_fasta),
os.path.abspath(assembly_report) if assembly_report else ''])

analysis_alias_dict = defaultdict(dict)
for analysis in metadata.analyses:
analysis_alias_dict[analysis['analysisAlias']]['referenceFasta'] = analysis['referenceFasta']
analysis_alias_dict[analysis['analysisAlias']]['assemblyReport'] = analysis['assemblyReport'] \
if 'assemblyReport' in analysis else ''

for file_dict in metadata.resolved_files:
reference_fasta = analysis_alias_dict[file_dict['analysisAlias']]['referenceFasta']
assembly_report = analysis_alias_dict[file_dict['analysisAlias']]['assemblyReport']
vcf_fasta_report_mapping.append([os.path.abspath(file_dict['fileName']),
os.path.abspath(reference_fasta),
os.path.abspath(assembly_report) if assembly_report else ''])

return project_title, vcf_fasta_report_mapping

Expand Down Expand Up @@ -223,7 +213,7 @@ def verify_and_get_metadata_xlsx_version(metadata_xlsx, min_req_version):
return xlsx_version


def get_project_and_vcf_fasta_mapping_from_metadata_xlsx(metadata_xlsx, metadata_xlsx_version, mapping_req=False):
def get_project_and_vcf_fasta_mapping_from_metadata_xlsx(metadata_xlsx, metadata_xlsx_version):
workbook = load_workbook(metadata_xlsx)

project_sheet = workbook['Project']
Expand All @@ -243,33 +233,32 @@ def get_project_and_vcf_fasta_mapping_from_metadata_xlsx(metadata_xlsx, metadata
project_title = get_project_title_from_ena(project_accession)

vcf_fasta_report_mapping = []
if mapping_req:
analysis_alias_sheet = workbook['Analysis']
analysis_headers = {}
for cell in analysis_alias_sheet[1]:
analysis_headers[cell.value] = cell.column - 1

analysis_alias_dict = {}
for row in analysis_alias_sheet.iter_rows(min_row=2, values_only=True):
analysis_alias = row[analysis_headers['Analysis Alias']]
reference_fasta = row[analysis_headers['Reference Fasta Path']]
analysis_alias_dict[analysis_alias] = reference_fasta

files_sheet = workbook['Files']
files_headers = {}
for cell in files_sheet[1]:
files_headers[cell.value] = cell.column - 1

for row in files_sheet.iter_rows(min_row=2, values_only=True):
file_name = row[files_headers['File Name']]
if file_name:
file_name = os.path.abspath(file_name)
analysis_alias = row[files_headers['Analysis Alias']]
reference_fasta = analysis_alias_dict[analysis_alias]
if reference_fasta:
reference_fasta = os.path.abspath(reference_fasta)
if file_name and reference_fasta:
vcf_fasta_report_mapping.append([file_name, reference_fasta, ''])
analysis_alias_sheet = workbook['Analysis']
analysis_headers = {}
for cell in analysis_alias_sheet[1]:
analysis_headers[cell.value] = cell.column - 1

analysis_alias_dict = {}
for row in analysis_alias_sheet.iter_rows(min_row=2, values_only=True):
analysis_alias = row[analysis_headers['Analysis Alias']]
reference_fasta = row[analysis_headers['Reference Fasta Path']]
analysis_alias_dict[analysis_alias] = reference_fasta

files_sheet = workbook['Files']
files_headers = {}
for cell in files_sheet[1]:
files_headers[cell.value] = cell.column - 1

for row in files_sheet.iter_rows(min_row=2, values_only=True):
file_name = row[files_headers['File Name']]
if file_name:
file_name = os.path.abspath(file_name)
analysis_alias = row[files_headers['Analysis Alias']]
reference_fasta = analysis_alias_dict[analysis_alias]
if reference_fasta:
reference_fasta = os.path.abspath(reference_fasta)
if file_name and reference_fasta:
vcf_fasta_report_mapping.append([file_name, reference_fasta, ''])

return project_title, vcf_fasta_report_mapping

Expand Down Expand Up @@ -308,7 +297,7 @@ def check_validation_required(tasks, sub_config, username=None, password=None):
return False


def orchestrate_process(submission_dir, vcf_files, reference_fasta, metadata_json, metadata_xlsx,
def orchestrate_process(submission_dir, metadata_json, metadata_xlsx,
tasks, executor, validation_tasks=ALL_VALIDATION_TASKS, username=None, password=None,
shallow_validation=False, nextflow_config=None, **kwargs):
# load config
Expand All @@ -332,7 +321,7 @@ def orchestrate_process(submission_dir, vcf_files, reference_fasta, metadata_jso

# Get the provided Project Title and VCF files mapping (VCF, Fasta and Report)
project_title, vcf_files_mapping = get_project_title_and_create_vcf_files_mapping(
submission_dir, vcf_files, reference_fasta, metadata_json, metadata_xlsx, metadata_xlsx_version
submission_dir, metadata_json, metadata_xlsx, metadata_xlsx_version
)
vcf_files = get_vcf_files(vcf_files_mapping)

Expand Down
3 changes: 1 addition & 2 deletions eva_sub_cli/validators/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -578,8 +578,7 @@ def _collect_file_info_to_metadata(self):
file_rows.append(file_dict)
file_count += 1
else:
error_txt = ('No file section found in metadata and multiple analysis alias exist: '
'cannot infer the relationship between files and analysis alias')
error_txt = 'No file section found in metadata'
self.error(error_txt)
errors.append({'property': '/files', 'description': error_txt})
metadata.set_files(file_rows)
Expand Down
9 changes: 1 addition & 8 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,16 +49,10 @@ def test_main(self):
all_lines[0].endswith('[eva_sub_cli.orchestrator][DEBUG] test\n')

def test_validate_args(self):
vcf_file = os.path.join(self.submission_dir,'test.vcf')
fasta_file = os.path.join(self.submission_dir, 'test.fasta')
json_file = os.path.join(self.submission_dir, 'test.json')
touch(vcf_file)
touch(fasta_file)
touch(json_file)
cmd_args = [
'--submission_dir', self.submission_dir,
'--vcf_files', vcf_file,
'--reference_fasta', fasta_file,
'--metadata_json', json_file,
'--tasks', 'validate',
'--executor', 'native',
Expand All @@ -67,10 +61,9 @@ def test_validate_args(self):
args = cli.parse_args(cmd_args)
assert args.submission_dir == self.submission_dir


with patch('sys.exit') as m_exit:
cli.parse_args(cmd_args[:2]+cmd_args[4:])
m_exit.assert_called_once_with(1)
m_exit.assert_called_once_with(2)

def test_main_exception_handling(self):
mock_response = Mock()
Expand Down
Loading