From 53604fc0645f8faaf48a950785aef094fd5f2b0b Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Tue, 15 Jun 2021 11:33:29 +0100 Subject: [PATCH] update geography_cleaning to take an argument for central_sample_id (not in gisaid) --- geography_cleaning.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/geography_cleaning.py b/geography_cleaning.py index a06c9fe..8447ea1 100755 --- a/geography_cleaning.py +++ b/geography_cleaning.py @@ -379,7 +379,7 @@ def deal_with_nonuk_cog(country, adm1, adm2, epiweek, geog_dict, adm2_to_week_co return geog_dict, adm2_to_week_counts -def process_input(metadata_file, country_col, outer_postcode_col, adm1_col, adm2_col, epiweek_col, map_utils_dir,outdir): +def process_input(metadata_file, country_col, outer_postcode_col, adm1_col, adm2_col, epiweek_col, sample_id_col, map_utils_dir,outdir): outer_to_latlongs_region = find_outerpostcode_to_coord_mapping(map_utils_dir) metadata_multi_loc, straight_map = prep_adm2_data(os.path.join(map_utils_dir, "adm2_cleaning.tsv")) @@ -436,7 +436,7 @@ def process_input(metadata_file, country_col, outer_postcode_col, adm1_col, adm2 adm1 = sequence[adm1_col] outer_postcode = sequence[outer_postcode_col].upper().strip(" ") adm2 = sequence[adm2_col] - name = sequence["central_sample_id"] + name = sequence[sample_id_col] if name in fixed_seqs: @@ -549,7 +549,7 @@ def process_input(metadata_file, country_col, outer_postcode_col, adm1_col, adm2 if conflict and name not in already_checked_discreps: - incompatible_locations.write(f'{sequence["central_sample_id"]},{outer_postcode},{adm2},{postcode_to_adm2[outer_postcode]},{processed_adm2}\n') + incompatible_locations.write(f'{sequence[sample_id_col]},{outer_postcode},{adm2},{postcode_to_adm2[outer_postcode]},{processed_adm2}\n') conflict_count += 1 utla = "" @@ -634,7 +634,7 @@ def process_input(metadata_file, country_col, outer_postcode_col, adm1_col, adm2 return outer_geog_dict, adm2_to_week_counts, epiweek_dict, non_uk, safe_locs -def make_geography_csv(metadata_file, country_col, outer_postcode_col, adm1_col, adm2_col,epiweek_col, map_utils_dir, outdir): +def make_geography_csv(metadata_file, country_col, outer_postcode_col, adm1_col, adm2_col,epiweek_col, sample_id_col, map_utils_dir, outdir): country_list = ["UK", "FALKLAND_ISLANDS", "GIBRALTAR", "JERSEY", "ISLE_OF_MAN", "GUERNSEY"] @@ -644,7 +644,7 @@ def make_geography_csv(metadata_file, country_col, outer_postcode_col, adm1_col, writer = csv.DictWriter(fw, fieldnames=fieldnames) writer.writeheader() - outer_geog_dict, adm2_to_week_counts, epiweek_dict, non_uk, safe_locs = process_input(metadata_file, country_col, outer_postcode_col, adm1_col, adm2_col, epiweek_col, map_utils_dir, outdir) + outer_geog_dict, adm2_to_week_counts, epiweek_dict, non_uk, safe_locs = process_input(metadata_file, country_col, outer_postcode_col, adm1_col, adm2_col, epiweek_col, sample_id_col, map_utils_dir, outdir) for name, geog_dict in outer_geog_dict.items(): if geog_dict["country"].upper().replace(" ","_") in country_list: @@ -702,13 +702,14 @@ def main(): parser.add_argument("--adm2-col", dest="adm2_col") parser.add_argument("--adm1-col", dest="adm1_col") parser.add_argument("--epiweek-col", dest="epiweek_col") + parser.add_argument("--sample-id-col", default="central_sample_id") parser.add_argument("--mapping-utils-dir", dest="map_utils_dir", help="path to map utils eg outer postcode") parser.add_argument("--outdir") args = parser.parse_args() - make_geography_csv(args.metadata, args.country_col, args.outer_postcode_col, args.adm1_col, args.adm2_col, args.epiweek_col, args.map_utils_dir, args.outdir) + make_geography_csv(args.metadata, args.country_col, args.outer_postcode_col, args.adm1_col, args.adm2_col, args.epiweek_col, args.sample_id_col, args.map_utils_dir, args.outdir) if __name__ == '__main__':