From c142774bf9cc44739d23ec0fbf4de8b03dac875e Mon Sep 17 00:00:00 2001
From: CaptainStabs <40151222+CaptainStabs@users.noreply.github.com>
Date: Mon, 20 Mar 2023 12:52:12 -0400
Subject: [PATCH 1/5] Update helpers: add local zip file support

---
 .../python/mrfutils/src/mrfutils/helpers.py   | 96 +++++++++++--------
 1 file changed, 56 insertions(+), 40 deletions(-)

diff --git a/transparency-in-coverage/python/mrfutils/src/mrfutils/helpers.py b/transparency-in-coverage/python/mrfutils/src/mrfutils/helpers.py
index fb236d3..78ec059 100644
--- a/transparency-in-coverage/python/mrfutils/src/mrfutils/helpers.py
+++ b/transparency-in-coverage/python/mrfutils/src/mrfutils/helpers.py
@@ -1,19 +1,20 @@
 import csv
 import gzip
 import hashlib
+import io
 import json
 import logging
 import os
+import zipfile
 from itertools import chain
 from pathlib import Path
 from urllib.parse import urlparse
 
 import requests
-
 from mrfutils.exceptions import InvalidMRF
 
-log = logging.getLogger('mrfutils')
-log.setLevel(logging.INFO)
+log = logging.getLogger('flatteners')
+# log.setLevel(logging.DEBUG)
 
 
 def prepend(value, iterator):
@@ -37,7 +38,7 @@ def peek(iterator):
 
 class JSONOpen:
     """
-    Context manager for opening JSON(.gz) MRFs.
+    Context manager for opening JSON(.gz/.zip) MRFs.
     Usage:
     >>> with JSONOpen('localfile.json') as f:
     or
@@ -45,57 +46,72 @@ class JSONOpen:
     including both zipped and unzipped files.
    """
 
-    def __init__(self, filename):
+    def __init__(self, filename, zip_file=None):
         self.filename = filename
+        self.zip_file = zip_file
         self.f = None
         self.r = None
         self.is_remote = None
 
-        parsed_url = urlparse(self.filename)
-        self.suffix = ''.join(Path(parsed_url.path).suffixes)
-        if not self.suffix:
-            self.suffix = ''.join(Path(parsed_url.query).suffixes)
+        if not self.zip_file:
+            parsed_url = urlparse(self.filename)
+            self.suffix = ''.join(Path(parsed_url.path).suffixes)
+            if not self.suffix:
+                self.suffix = ''.join(Path(parsed_url.query).suffixes)
 
-        if not (
-            self.suffix.endswith('.json.gz') or
-            self.suffix.endswith('.json')
-        ):
-            raise InvalidMRF(f'Suffix not JSON: {self.filename=} {self.suffix=}')
+            if not (
+                self.suffix.endswith('.json.gz') or
+                self.suffix.endswith('.json') or
+                self.suffix.endswith('.zip')
+            ):
+                raise InvalidMRF(f'Suffix not JSON or ZIP: {self.filename=} {self.suffix=}')
 
-        self.is_remote = parsed_url.scheme in ('http', 'https')
+            self.is_remote = parsed_url.scheme in ('http', 'https')
+        else:
+            self.suffix = '.zip'
+            self.is_remote = False
 
     def __enter__(self):
-        if (
-            self.is_remote
-            # endswith is used to protect against the case
-            # where the filename contains lots of dots
-            # insurer.stuff.json.gz
-            and self.suffix.endswith('.json.gz')
-        ):
-            self.s = requests.Session()
-            self.r = self.s.get(self.filename, stream=True)
-            self.f = gzip.GzipFile(fileobj=self.r.raw)
-
-        elif (
-            self.is_remote
-            and self.suffix.endswith('.json')
-        ):
-            self.s = requests.Session()
-            self.r = self.s.get(self.filename, stream=True)
-            self.r.raw.decode_content = True
-            self.f = self.r.raw
-
-        elif self.suffix == '.json.gz':
-            self.f = gzip.open(self.filename, 'rb')
-
+        if self.suffix.endswith('.zip'):
+            if self.is_remote:
+                # Download the zip file and store it in memory
+                response = requests.get(self.filename)
+                response.raise_for_status()
+                zip_data = io.BytesIO(response.content)
+
+                # Open the first file in the zip
+                with zipfile.ZipFile(zip_data) as zip_file:
+                    inner_filename = zip_file.namelist()[0]
+                    self.f = zip_file.open(inner_filename)
+            else:
+                with zipfile.ZipFile(self.zip_file) as z:
+                    self.f = z.open(self.filename)
+
+        elif self.suffix.endswith('.json.gz'):
+            if self.is_remote:
+                self.s = requests.Session()
+                self.r = self.s.get(self.filename, stream=True)
+                self.f = gzip.GzipFile(fileobj=self.r.raw)
+            else:
+                self.f = gzip.open(self.filename, 'rb')
+        elif self.suffix.endswith('.json'):
+            if self.is_remote:
+                self.s = requests.Session()
+                self.r = self.s.get(self.filename, stream=True)
+                self.r.raw.decode_content = True
+                self.f = self.r.raw
+            else:
+                self.f = open(self.filename, 'rb')
         else:
-            self.f = open(self.filename, 'rb')
+            raise InvalidMRF(f'Suffix not JSON or ZIP: {self.filename=} {self.suffix=}')
 
         log.info(f'Opened file: {self.filename}')
         return self.f
 
+
     def __exit__(self, exc_type, exc_val, exc_tb):
-        if self.is_remote:
+        # ZIP reads never open a requests session, so there is no session or response to close
+        if self.is_remote and not self.suffix.endswith('.zip'):
             self.s.close()
             self.r.close()
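Usage note: the patch above gives JSONOpen two zip paths. A filename or URL ending in .zip passes the suffix check and, when remote, is downloaded whole and buffered in io.BytesIO before the first archive member is opened; passing zip_file= instead skips URL parsing entirely and treats filename as a member name inside that local archive. A minimal sketch of both call patterns, assuming the patched module is importable (the file and archive names are made up):

    from mrfutils.helpers import JSONOpen

    # Remote archive: __enter__ downloads the body and opens the
    # first member listed in the zip's directory.
    with JSONOpen('https://example.com/mrfs/in-network.zip') as f:
        head = f.read(1024)

    # Local archive: `filename` names the member inside `zip_file`;
    # __init__ forces suffix='.zip' and is_remote=False.
    with JSONOpen('in-network.json', zip_file='archive.zip') as f:
        head = f.read(1024)

One wrinkle worth knowing: both branches open the member inside a `with zipfile.ZipFile(...)` block, so the handle that is returned relies on zipfile's internal reference counting to keep the underlying file readable after the ZipFile object itself is closed.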
From f8be7e6086ab15154adbccaf4f89c4d335999b03 Mon Sep 17 00:00:00 2001
From: CaptainStabs <40151222+CaptainStabs@users.noreply.github.com>
Date: Mon, 20 Mar 2023 12:53:05 -0400
Subject: [PATCH 2/5] Update flatteners.py

---
 .../python/mrfutils/src/mrfutils/flatteners.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/transparency-in-coverage/python/mrfutils/src/mrfutils/flatteners.py b/transparency-in-coverage/python/mrfutils/src/mrfutils/flatteners.py
index de5f43e..65ead7e 100644
--- a/transparency-in-coverage/python/mrfutils/src/mrfutils/flatteners.py
+++ b/transparency-in-coverage/python/mrfutils/src/mrfutils/flatteners.py
@@ -648,6 +648,7 @@ def in_network_file_to_csv(
     url: str,
     out_dir: str,
     file: str | None = None,
+    zip_file: str | None = None,
     code_filter: set | None = None,
     npi_filter: set | None = None,
 ) -> None:
@@ -673,7 +674,7 @@ def in_network_file_to_csv(
     ref_map = None
     metadata = ijson.ObjectBuilder()
 
-    parser = start_parser(file)
+    parser = start_parser(file, zip_file)
 
     file_row = file_row_from_url(url)
     file_row['url'] = url
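PATCH 2 threads the new zip_file argument from the public entry point down to the parser; note that start_parser itself only gains the matching parameter in PATCH 3, so this commit does not run standalone. A hypothetical driver call, with the paths and output directory invented for illustration:

    from mrfutils.flatteners import in_network_file_to_csv

    in_network_file_to_csv(
        url='https://example.com/mrfs/archive.zip',  # recorded in the file table
        out_dir='csv_out',
        file='in-network.json',    # member to read inside the local archive
        zip_file='archive.zip',    # forwarded to JSONOpen via start_parser
    )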
From 810a1f26ebf678e497bb1c30620b95546641721d Mon Sep 17 00:00:00 2001
From: CaptainStabs <40151222+CaptainStabs@users.noreply.github.com>
Date: Mon, 20 Mar 2023 15:35:55 -0400
Subject: [PATCH 3/5] Update flatteners.py

---
 .../mrfutils/src/mrfutils/flatteners.py | 33 ++++++++++++++++++---------------
 1 file changed, 18 insertions(+), 15 deletions(-)

diff --git a/transparency-in-coverage/python/mrfutils/src/mrfutils/flatteners.py b/transparency-in-coverage/python/mrfutils/src/mrfutils/flatteners.py
index 65ead7e..d7aaa0e 100644
--- a/transparency-in-coverage/python/mrfutils/src/mrfutils/flatteners.py
+++ b/transparency-in-coverage/python/mrfutils/src/mrfutils/flatteners.py
@@ -63,7 +63,7 @@
 
 log = logging.getLogger(__name__)
 logging.basicConfig(format='%(asctime)s - %(message)s')
-log.setLevel(logging.DEBUG)
+# log.setLevel(logging.DEBUG)
 
 # To distinguish data from rows
 Row = dict
@@ -292,7 +292,6 @@ def process_arr(func, arr, *args, **kwargs):
         processed_arr.append(processed_item)
     return processed_arr
 
-# experimental mod
 from array import array
 def process_group(group: dict, npi_filter: set) -> dict | None:
     try:
@@ -302,16 +301,18 @@ def process_group(group: dict, npi_filter: set) -> dict | None:
 
         # HOTFIX
         group['npi'] = [int(n) for n in group['NPI']]
-        group['npi'] = array('L', group['npi'])
+        try:
+            group['npi'] = array('L', group['npi'])
+        except:
+            print(group['npi'])
+            raise
 
+        # I was alerted that some
         if not npi_filter:
             return group
 
         group['npi'] = [n for n in group['npi'] if n in npi_filter]
-
         if not group['npi']:
             return
-
         group['npi'] = array('L', group['npi'])
-
         return group
@@ -639,8 +640,8 @@ def swap_references(
         yield item
 
 
-def start_parser(filename) -> Generator:
-    with JSONOpen(filename) as f:
+def start_parser(filename, zip_file=None) -> Generator:
+    with JSONOpen(filename, zip_file) as f:
         yield from ijson.parse(f, use_float = True)
 
 
@@ -742,7 +743,7 @@ def gen_plan_file(parser):
     elif (prefix, event, value) == ('reporting_structure', 'end_array', None):
         return
 
-def write_plan_file(plan_file, toc_id, out_dir):
+def write_plan_file(plan_file, toc_id, out_dir, toc_id_set):
     if not plan_file.get('in_network_files'):
         return
 
@@ -771,11 +772,10 @@ def write_plan_file(plan_file, toc_id, out_dir, toc_id_set):
         file_row['url'] = url
         file_row['description'] = file['description']
 
-        write_table(file_row, 'toc_file', out_dir)
-        file_rows.append(file_row)
-
-    print(len(plan_rows))
-    print(len(file_rows))
+        if toc_id not in toc_id_set:
+            write_table(file_row, 'toc_file', out_dir)
+            file_rows.append(file_row)
+            toc_id_set.add(toc_id)
 
     for plan_row in plan_rows:
         for file_row in file_rows:
@@ -808,10 +808,13 @@ def toc_file_to_csv(
     toc_row['url'] = url
     toc_id = toc_row['id']
     metadata = ijson.ObjectBuilder()
+
+    # Initialize set for deduplication
+    toc_id_set = set()
     for prefix, event, value in parser:
         if (prefix, event, value) == ('reporting_structure', 'start_array', None):
             for plan_file in gen_plan_file(parser):
-                write_plan_file(plan_file, toc_id, out_dir)
+                write_plan_file(plan_file, toc_id, out_dir, toc_id_set)
         else:
             metadata.event(event, value)
     toc_row.update(metadata.value)

From f4191f054ed9ed8f0a88453f599cf09f9a8becde Mon Sep 17 00:00:00 2001
From: CaptainStabs <40151222+CaptainStabs@users.noreply.github.com>
Date: Mon, 20 Mar 2023 15:40:38 -0400
Subject: [PATCH 4/5] Update flatteners.py

---
 .../python/mrfutils/src/mrfutils/flatteners.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transparency-in-coverage/python/mrfutils/src/mrfutils/flatteners.py b/transparency-in-coverage/python/mrfutils/src/mrfutils/flatteners.py
index d7aaa0e..87940d5 100644
--- a/transparency-in-coverage/python/mrfutils/src/mrfutils/flatteners.py
+++ b/transparency-in-coverage/python/mrfutils/src/mrfutils/flatteners.py
@@ -697,7 +697,7 @@ def in_network_file_to_csv(
         except StopIteration:
             if completed: break
             if ref_map is None: ref_map = {}
-            parser = start_parser(file)
+            parser = start_parser(file, zip_file)
             ffwd(parser, to_prefix='', to_value='in_network')
             prefix, event, value = ('', 'map_key', 'in_network')
             prepend(('', 'map_key', 'in_network'), parser)
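A note on the try/except probe added in PATCH 3: array('L', ...) packs the NPI list into unsigned C longs, so a value that is negative (or too wide for the platform's unsigned long) raises OverflowError at conversion time, and the bare except prints the offending list before re-raising. A standalone illustration of the failure mode, with made-up values (real NPIs are 10-digit integers):

    from array import array

    npis = [1234567893, 1987654321]   # well-formed 10-digit NPIs
    compact = array('L', npis)        # fits in an unsigned long

    try:
        array('L', [-1])              # a malformed NPI would surface like this
    except OverflowError as err:
        print(err)                    # negative values overflow the unsigned type

Catching OverflowError by name rather than using a bare except: would keep unrelated exceptions (even KeyboardInterrupt) from triggering the debug print, though PATCH 5 below removes the probe again regardless.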
From ea6a5015c93e16304e3f01c534403c00e2c798e6 Mon Sep 17 00:00:00 2001
From: CaptainStabs <40151222+CaptainStabs@users.noreply.github.com>
Date: Mon, 20 Mar 2023 15:46:42 -0400
Subject: [PATCH 5/5] Update flatteners.py

---
 .../mrfutils/src/mrfutils/flatteners.py | 29 +++++++++++++----------------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/transparency-in-coverage/python/mrfutils/src/mrfutils/flatteners.py b/transparency-in-coverage/python/mrfutils/src/mrfutils/flatteners.py
index 87940d5..53edf40 100644
--- a/transparency-in-coverage/python/mrfutils/src/mrfutils/flatteners.py
+++ b/transparency-in-coverage/python/mrfutils/src/mrfutils/flatteners.py
@@ -63,7 +63,7 @@
 
 log = logging.getLogger(__name__)
 logging.basicConfig(format='%(asctime)s - %(message)s')
-# log.setLevel(logging.DEBUG)
+log.setLevel(logging.DEBUG)
 
 # To distinguish data from rows
 Row = dict
@@ -292,6 +292,7 @@ def process_arr(func, arr, *args, **kwargs):
         processed_arr.append(processed_item)
     return processed_arr
 
+# experimental mod
 from array import array
 def process_group(group: dict, npi_filter: set) -> dict | None:
     try:
@@ -301,18 +302,16 @@ def process_group(group: dict, npi_filter: set) -> dict | None:
 
         # HOTFIX
         group['npi'] = [int(n) for n in group['NPI']]
-        try:
-            group['npi'] = array('L', group['npi'])
-        except:
-            print(group['npi'])
-            raise
+        group['npi'] = array('L', group['npi'])
 
-        # I was alerted that some
         if not npi_filter:
             return group
 
         group['npi'] = [n for n in group['npi'] if n in npi_filter]
+
         if not group['npi']:
             return
+
        group['npi'] = array('L', group['npi'])
+
         return group
@@ -743,7 +742,7 @@ def gen_plan_file(parser):
     elif (prefix, event, value) == ('reporting_structure', 'end_array', None):
         return
 
-def write_plan_file(plan_file, toc_id, out_dir, toc_id_set):
+def write_plan_file(plan_file, toc_id, out_dir):
     if not plan_file.get('in_network_files'):
         return
 
@@ -772,10 +771,11 @@ def write_plan_file(plan_file, toc_id, out_dir):
         file_row['url'] = url
         file_row['description'] = file['description']
 
-        if toc_id not in toc_id_set:
-            write_table(file_row, 'toc_file', out_dir)
-            file_rows.append(file_row)
-            toc_id_set.add(toc_id)
+        write_table(file_row, 'toc_file', out_dir)
+        file_rows.append(file_row)
+
+    print(len(plan_rows))
+    print(len(file_rows))
 
     for plan_row in plan_rows:
         for file_row in file_rows:
@@ -808,13 +808,10 @@ def toc_file_to_csv(
     toc_row['url'] = url
     toc_id = toc_row['id']
     metadata = ijson.ObjectBuilder()
-
-    # Initialize set for deduplication
-    toc_id_set = set()
     for prefix, event, value in parser:
         if (prefix, event, value) == ('reporting_structure', 'start_array', None):
             for plan_file in gen_plan_file(parser):
-                write_plan_file(plan_file, toc_id, out_dir, toc_id_set)
+                write_plan_file(plan_file, toc_id, out_dir)
         else:
             metadata.event(event, value)
     toc_row.update(metadata.value)
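Closing note: PATCH 5 reverts the PATCH 3 changes to flatteners.py wholesale. The toc_id-keyed gate could not work as intended: toc_id is constant across the whole table-of-contents file, so after the first in-network file was written the gate skipped both the write and the file_rows.append for every later file, starving the plan/file cross join beneath it. If write deduplication is still wanted, one possible rework (a sketch, not part of this series) keys on the per-file URL and keeps the append unconditional:

    # Hypothetical rework of the PATCH 3 idea: dedupe toc_file writes by URL
    # while still appending every row for the plan/file join.
    seen_urls = set()

    def record_file_row(file_row, out_dir, file_rows):
        file_rows.append(file_row)             # the join still sees every file
        if file_row['url'] not in seen_urls:   # each distinct file written once
            seen_urls.add(file_row['url'])
            write_table(file_row, 'toc_file', out_dir)

Here write_table is the existing helper already used in write_plan_file; record_file_row is an invented wrapper name.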