# Peak-file validation helpers (from idr/utility.py).

# Message printed whenever a line has the wrong number of columns or a
# column cannot be parsed as the numeric type its position requires.
_PEAK_LINE_ERROR = (
    "Validation error: one or more of the elements could not be "
    "interpreted as the proper type, or there were insufficient "
    "elements to unpack: ")


def validate_peak_line(line, include_summit=False):
    """Return True when *line* is a well-formed narrowPeak/broadPeak record.

    The line is split on whitespace into the columns
    chrom, start, end, name, score, strand, signal, pvalue, qvalue
    plus a trailing summit column when include_summit is True (use
    include_summit=True for narrowPeak files).

    A record is accepted when:
      * it has exactly 9 columns (10 with the summit),
      * start and end parse as integers,
      * score is an integer in [0, 1000],
      * strand is one of '.', '-', '+',
      * signal parses as a float,
      * pvalue and qvalue are either -1 (not assigned) or >= 0,
      * summit (if requested) is an integer >= -1, where -1 means that
        no summit was called and 0 is the first base of the peak.

    A wrong column count or an unparsable number prints a diagnostic and
    returns False; an out-of-range value returns False silently.
    """
    fields = line.split()
    if len(fields) != (10 if include_summit else 9):
        print(_PEAK_LINE_ERROR, line)
        return False

    try:
        # start / end only need to be integers; the values are not checked.
        int(fields[1])
        int(fields[2])
        score = int(fields[4])
        # signal only needs to be a float.
        float(fields[6])
        # Convert before comparing: the raw columns are strings, so a
        # comparison such as `fields[7] == -1` could never be true.
        pvalue = float(fields[7])
        qvalue = float(fields[8])
        summit = int(fields[9]) if include_summit else None
    except ValueError:
        print(_PEAK_LINE_ERROR, line)
        return False

    if not 0 <= score <= 1000:
        return False
    if fields[5] not in ('.', '-', '+'):
        return False
    # -1 is the "not assigned" sentinel; 0 is a legitimate -log10 value.
    if not (pvalue == -1 or pvalue >= 0):
        return False
    if not (qvalue == -1 or qvalue >= 0):
        return False
    if include_summit and summit < -1:
        return False
    return True


def _all_peak_lines_valid(fp, include_summit):
    """Validate every line of *fp*, rewind it, and return the overall result.

    Every line is checked (not just up to the first failure) so that a
    diagnostic is printed for each malformed record.
    """
    all_valid = True
    for line in fp:
        if not validate_peak_line(line, include_summit):
            all_valid = False
    # Rewind so the caller can read the same handle again afterwards.
    fp.seek(0)
    return all_valid


def is_valid_narrowPeak(fp):
    """Return True when every line of the seekable file object *fp* is a
    valid narrowPeak record (10 columns, including the summit).

    The file position is reset to the start before returning.
    """
    return _all_peak_lines_valid(fp, True)


def is_valid_broadPeak(fp):
    """Return True when every line of the seekable file object *fp* is a
    valid broadPeak record (9 columns, no summit).

    The file position is reset to the start before returning.
    """
    return _all_peak_lines_valid(fp, False)