diff --git a/transparency-in-coverage/python/mrfutils/src/mrfutils/flatteners.py b/transparency-in-coverage/python/mrfutils/src/mrfutils/flatteners.py index de5f43e..53edf40 100644 --- a/transparency-in-coverage/python/mrfutils/src/mrfutils/flatteners.py +++ b/transparency-in-coverage/python/mrfutils/src/mrfutils/flatteners.py @@ -639,8 +639,8 @@ def swap_references( yield item -def start_parser(filename) -> Generator: - with JSONOpen(filename) as f: +def start_parser(filename, zip_file=None) -> Generator: + with JSONOpen(filename, zip_file) as f: yield from ijson.parse(f, use_float = True) @@ -648,6 +648,7 @@ def in_network_file_to_csv( url: str, out_dir: str, file: str | None = None, + zip_file: str | None = None, code_filter: set | None = None, npi_filter: set | None = None, ) -> None: @@ -673,7 +674,7 @@ def in_network_file_to_csv( ref_map = None metadata = ijson.ObjectBuilder() - parser = start_parser(file) + parser = start_parser(file, zip_file) file_row = file_row_from_url(url) file_row['url'] = url @@ -695,7 +696,7 @@ def in_network_file_to_csv( except StopIteration: if completed: break if ref_map is None: ref_map = {} - parser = start_parser(file) + parser = start_parser(file, zip_file) ffwd(parser, to_prefix='', to_value='in_network') prefix, event, value = ('', 'map_key', 'in_network') prepend(('', 'map_key', 'in_network'), parser) diff --git a/transparency-in-coverage/python/mrfutils/src/mrfutils/helpers.py b/transparency-in-coverage/python/mrfutils/src/mrfutils/helpers.py index fb236d3..78ec059 100644 --- a/transparency-in-coverage/python/mrfutils/src/mrfutils/helpers.py +++ b/transparency-in-coverage/python/mrfutils/src/mrfutils/helpers.py @@ -1,19 +1,20 @@ import csv import gzip import hashlib +import io import json import logging import os +import zipfile from itertools import chain from pathlib import Path from urllib.parse import urlparse import requests - from mrfutils.exceptions import InvalidMRF -log = logging.getLogger('mrfutils') -log.setLevel(logging.INFO) +log = logging.getLogger('flatteners') +# log.setLevel(logging.DEBUG) def prepend(value, iterator): @@ -37,7 +38,7 @@ def peek(iterator): class JSONOpen: """ - Context manager for opening JSON(.gz) MRFs. + Context manager for opening JSON(.gz/.zip) MRFs. Usage: >>> with JSONOpen('localfile.json') as f: or @@ -45,57 +46,72 @@ class JSONOpen: including both zipped and unzipped files. """ - def __init__(self, filename): + def __init__(self, filename, zip_file=None): self.filename = filename + self.zip_file = zip_file self.f = None self.r = None self.is_remote = None - parsed_url = urlparse(self.filename) - self.suffix = ''.join(Path(parsed_url.path).suffixes) - if not self.suffix: - self.suffix = ''.join(Path(parsed_url.query).suffixes) + if not self.zip_file: + parsed_url = urlparse(self.filename) + self.suffix = ''.join(Path(parsed_url.path).suffixes) + if not self.suffix: + self.suffix = ''.join(Path(parsed_url.query).suffixes) - if not ( - self.suffix.endswith('.json.gz') or - self.suffix.endswith('.json') - ): - raise InvalidMRF(f'Suffix not JSON: {self.filename=} {self.suffix=}') + if not ( + self.suffix.endswith('.json.gz') or + self.suffix.endswith('.json') or + self.suffix.endswith('.zip') + ): + raise InvalidMRF(f'Suffix not JSON or ZIP: {self.filename=} {self.suffix=}') - self.is_remote = parsed_url.scheme in ('http', 'https') + self.is_remote = parsed_url.scheme in ('http', 'https') + else: + self.suffix = ".zip" + self.is_remote = False def __enter__(self): - if ( - self.is_remote - # endswith is used to protect against the case - # where the filename contains lots of dots - # insurer.stuff.json.gz - and self.suffix.endswith('.json.gz') - ): - self.s = requests.Session() - self.r = self.s.get(self.filename, stream=True) - self.f = gzip.GzipFile(fileobj=self.r.raw) - - elif ( - self.is_remote - and self.suffix.endswith('.json') - ): - self.s = requests.Session() - self.r = self.s.get(self.filename, stream=True) - self.r.raw.decode_content = True - self.f = self.r.raw - - elif self.suffix == '.json.gz': - self.f = gzip.open(self.filename, 'rb') - + if self.suffix.endswith('.zip'): + if self.is_remote: + # Download the zip file and store it in memory + response = requests.get(self.filename) + response.raise_for_status() + zip_data = io.BytesIO(response.content) + + # Open the first file in the zip + with zipfile.ZipFile(zip_data) as zip_file: + inner_filename = zip_file.namelist()[0] + self.f = zip_file.open(inner_filename) + else: + with zipfile.ZipFile(self.zip_file) as z: + self.f = z.open(self.filename) + + elif self.suffix.endswith('.json.gz'): + if self.is_remote: + self.s = requests.Session() + self.r = self.s.get(self.filename, stream=True) + self.f = gzip.GzipFile(fileobj=self.r.raw) + else: + self.f = gzip.open(self.filename, 'rb') + elif self.suffix.endswith('.json'): + if self.is_remote: + self.s = requests.Session() + self.r = self.s.get(self.filename, stream=True) + self.r.raw.decode_content = True + self.f = self.r.raw + else: + self.f = open(self.filename, 'rb') else: - self.f = open(self.filename, 'rb') + raise InvalidMRF(f'Suffix not JSON or ZIP: {self.filename=} {self.suffix=}') log.info(f'Opened file: {self.filename}') return self.f + def __exit__(self, exc_type, exc_val, exc_tb): - if self.is_remote: + # ZIP files do not use sessions and are thus not closable + if self.is_remote and not self.suffix.endswith('.zip'): self.s.close() self.r.close()