From 2d412c3c95458a180f4e0925849b976a830005cd Mon Sep 17 00:00:00 2001 From: CaptainStabs <40151222+CaptainStabs@users.noreply.github.com> Date: Sat, 18 Mar 2023 21:03:37 -0400 Subject: [PATCH 1/3] Update helpers.py --- .../python/mrfutils/src/mrfutils/helpers.py | 235 +++++++++--------- 1 file changed, 123 insertions(+), 112 deletions(-) diff --git a/transparency-in-coverage/python/mrfutils/src/mrfutils/helpers.py b/transparency-in-coverage/python/mrfutils/src/mrfutils/helpers.py index fb236d3..020dba6 100644 --- a/transparency-in-coverage/python/mrfutils/src/mrfutils/helpers.py +++ b/transparency-in-coverage/python/mrfutils/src/mrfutils/helpers.py @@ -7,157 +7,168 @@ from itertools import chain from pathlib import Path from urllib.parse import urlparse +import zipfile +import io +import zipfile import requests from mrfutils.exceptions import InvalidMRF log = logging.getLogger('mrfutils') -log.setLevel(logging.INFO) +log.setLevel(logging.DEBUG) def prepend(value, iterator): - """Prepend a single value in front of an iterator - >>> prepend(1, [2, 3, 4]) - >>> 1 2 3 4 - """ - return chain([value], iterator) + """Prepend a single value in front of an iterator + >>> prepend(1, [2, 3, 4]) + >>> 1 2 3 4 + """ + return chain([value], iterator) def peek(iterator): - """ - Usage: - >>> next_, iter = peek(iter) - allows you to peek at the next value of the iterator - """ - try: next_ = next(iterator) - except StopIteration: return None, iterator - return next_, prepend(next_, iterator) + """ + Usage: + >>> next_, iter = peek(iter) + allows you to peek at the next value of the iterator + """ + try: next_ = next(iterator) + except StopIteration: return None, iterator + return next_, prepend(next_, iterator) + + class JSONOpen: - """ - Context manager for opening JSON(.gz) MRFs. - Usage: - >>> with JSONOpen('localfile.json') as f: - or - >>> with JSONOpen(some_json_url) as f: - including both zipped and unzipped files. - """ - - def __init__(self, filename): - self.filename = filename - self.f = None - self.r = None - self.is_remote = None - - parsed_url = urlparse(self.filename) - self.suffix = ''.join(Path(parsed_url.path).suffixes) - if not self.suffix: - self.suffix = ''.join(Path(parsed_url.query).suffixes) - - if not ( - self.suffix.endswith('.json.gz') or - self.suffix.endswith('.json') - ): - raise InvalidMRF(f'Suffix not JSON: {self.filename=} {self.suffix=}') - - self.is_remote = parsed_url.scheme in ('http', 'https') - - def __enter__(self): - if ( - self.is_remote - # endswith is used to protect against the case - # where the filename contains lots of dots - # insurer.stuff.json.gz - and self.suffix.endswith('.json.gz') - ): - self.s = requests.Session() - self.r = self.s.get(self.filename, stream=True) - self.f = gzip.GzipFile(fileobj=self.r.raw) - - elif ( - self.is_remote - and self.suffix.endswith('.json') - ): - self.s = requests.Session() - self.r = self.s.get(self.filename, stream=True) - self.r.raw.decode_content = True - self.f = self.r.raw - - elif self.suffix == '.json.gz': - self.f = gzip.open(self.filename, 'rb') - - else: - self.f = open(self.filename, 'rb') - - log.info(f'Opened file: {self.filename}') - return self.f - - def __exit__(self, exc_type, exc_val, exc_tb): - if self.is_remote: - self.s.close() - self.r.close() - - self.f.close() + """ + Context manager for opening JSON(.gz/.zip) MRFs. + Usage: + >>> with JSONOpen('localfile.json') as f: + or + >>> with JSONOpen(some_json_url) as f: + including both zipped and unzipped files. + """ + + def __init__(self, filename): + self.filename = filename + self.f = None + self.r = None + self.is_remote = None + + parsed_url = urlparse(self.filename) + self.suffix = ''.join(Path(parsed_url.path).suffixes) + if not self.suffix: + self.suffix = ''.join(Path(parsed_url.query).suffixes) + + if not ( + self.suffix.endswith('.json.gz') or + self.suffix.endswith('.json') or + self.suffix.endswith('.zip') + ): + raise InvalidMRF(f'Suffix not JSON or ZIP: {self.filename=} {self.suffix=}') + + self.is_remote = parsed_url.scheme in ('http', 'https') + + def __enter__(self): + if self.is_remote and self.suffix.endswith('.zip'): + # Download the zip file and store it in memory + response = requests.get(self.filename) + response.raise_for_status() + zip_data = io.BytesIO(response.content) + + # Open the first file in the zip + with zipfile.ZipFile(zip_data) as zip_file: + inner_filename = zip_file.namelist()[0] + self.f = zip_file.open(inner_filename) + elif self.suffix.endswith('.json.gz'): + if self.is_remote: + self.s = requests.Session() + self.r = self.s.get(self.filename, stream=True) + self.f = gzip.GzipFile(fileobj=self.r.raw) + else: + self.f = gzip.open(self.filename, 'rb') + elif self.suffix.endswith('.json'): + if self.is_remote: + self.s = requests.Session() + self.r = self.s.get(self.filename, stream=True) + self.r.raw.decode_content = True + self.f = self.r.raw + else: + self.f = open(self.filename, 'rb') + else: + raise InvalidMRF(f'Suffix not JSON or ZIP: {self.filename=} {self.suffix=}') + + log.info(f'Opened file: {self.filename}') + return self.f + + + + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.is_remote: + self.s.close() + self.r.close() + + self.f.close() def import_csv_to_set(filename: str): - """Imports data as tuples from a given file.""" - items = set() + """Imports data as tuples from a given file.""" + items = set() - with open(filename, 'r') as f: - reader = csv.reader(f) - for row in reader: - row = [col.strip() for col in row] - if len(row) > 1: - items.add(tuple(row)) - else: - item = row.pop() - items.add(item) - return items + with open(filename, 'r') as f: + reader = csv.reader(f) + for row in reader: + row = [col.strip() for col in row] + if len(row) > 1: + items.add(tuple(row)) + else: + item = row.pop() + items.add(item) + return items def make_dir(out_dir): - if not os.path.exists(out_dir): - os.mkdir(out_dir) + if not os.path.exists(out_dir): + os.mkdir(out_dir) def dicthasher(data: dict, n_bytes = 8) -> int: - if not data: - raise Exception("Hashed dictionary can't be empty") + if not data: + raise Exception("Hashed dictionary can't be empty") - data = json.dumps(data, sort_keys=True).encode('utf-8') - hash_s = hashlib.sha256(data).digest()[:n_bytes] - hash_i = int.from_bytes(hash_s, 'little') + data = json.dumps(data, sort_keys=True).encode('utf-8') + hash_s = hashlib.sha256(data).digest()[:n_bytes] + hash_i = int.from_bytes(hash_s, 'little') - return hash_i + return hash_i def append_hash(item: dict, name: str) -> dict: - hash_ = dicthasher(item) - item[name] = hash_ + hash_ = dicthasher(item) + item[name] = hash_ - return item + return item def filename_hasher(filename: str) -> int: - # retrieve/only/this_part_of_the_file.json(.gz) - filename = Path(filename).stem.split('.')[0] - file_row = {'filename': filename} - filename_hash = dicthasher(file_row) + # retrieve/only/this_part_of_the_file.json(.gz) + filename = Path(filename).stem.split('.')[0] + file_row = {'filename': filename} + filename_hash = dicthasher(file_row) - return filename_hash + return filename_hash def validate_url(test_url: str) -> bool: - # https://stackoverflow.com/a/38020041 - try: - result = urlparse(test_url) - return all([result.scheme, result.netloc]) - except: - return False + # https://stackoverflow.com/a/38020041 + try: + result = urlparse(test_url) + return all([result.scheme, result.netloc]) + except: + return False From 8fd2608d1d9efa29a95e92a5db2aa9026db65e20 Mon Sep 17 00:00:00 2001 From: CaptainStabs <40151222+CaptainStabs@users.noreply.github.com> Date: Sat, 18 Mar 2023 21:04:25 -0400 Subject: [PATCH 2/3] Update helpers.py convert to tabs --- .../python/mrfutils/src/mrfutils/helpers.py | 238 +++++++++--------- 1 file changed, 119 insertions(+), 119 deletions(-) diff --git a/transparency-in-coverage/python/mrfutils/src/mrfutils/helpers.py b/transparency-in-coverage/python/mrfutils/src/mrfutils/helpers.py index 020dba6..37e0fa1 100644 --- a/transparency-in-coverage/python/mrfutils/src/mrfutils/helpers.py +++ b/transparency-in-coverage/python/mrfutils/src/mrfutils/helpers.py @@ -9,7 +9,7 @@ from urllib.parse import urlparse import zipfile import io -import zipfile +import heartrate; heartrate.trace(browser=True, daemon=True) import requests @@ -20,155 +20,155 @@ def prepend(value, iterator): - """Prepend a single value in front of an iterator - >>> prepend(1, [2, 3, 4]) - >>> 1 2 3 4 - """ - return chain([value], iterator) + """Prepend a single value in front of an iterator + >>> prepend(1, [2, 3, 4]) + >>> 1 2 3 4 + """ + return chain([value], iterator) def peek(iterator): - """ - Usage: - >>> next_, iter = peek(iter) - allows you to peek at the next value of the iterator - """ - try: next_ = next(iterator) - except StopIteration: return None, iterator - return next_, prepend(next_, iterator) - + """ + Usage: + >>> next_, iter = peek(iter) + allows you to peek at the next value of the iterator + """ + try: next_ = next(iterator) + except StopIteration: return None, iterator + return next_, prepend(next_, iterator) +import zipfile class JSONOpen: - """ - Context manager for opening JSON(.gz/.zip) MRFs. - Usage: - >>> with JSONOpen('localfile.json') as f: - or - >>> with JSONOpen(some_json_url) as f: - including both zipped and unzipped files. - """ - - def __init__(self, filename): - self.filename = filename - self.f = None - self.r = None - self.is_remote = None - - parsed_url = urlparse(self.filename) - self.suffix = ''.join(Path(parsed_url.path).suffixes) - if not self.suffix: - self.suffix = ''.join(Path(parsed_url.query).suffixes) - - if not ( - self.suffix.endswith('.json.gz') or - self.suffix.endswith('.json') or - self.suffix.endswith('.zip') - ): - raise InvalidMRF(f'Suffix not JSON or ZIP: {self.filename=} {self.suffix=}') - - self.is_remote = parsed_url.scheme in ('http', 'https') - - def __enter__(self): - if self.is_remote and self.suffix.endswith('.zip'): - # Download the zip file and store it in memory - response = requests.get(self.filename) - response.raise_for_status() - zip_data = io.BytesIO(response.content) - - # Open the first file in the zip - with zipfile.ZipFile(zip_data) as zip_file: - inner_filename = zip_file.namelist()[0] - self.f = zip_file.open(inner_filename) - elif self.suffix.endswith('.json.gz'): - if self.is_remote: - self.s = requests.Session() - self.r = self.s.get(self.filename, stream=True) - self.f = gzip.GzipFile(fileobj=self.r.raw) - else: - self.f = gzip.open(self.filename, 'rb') - elif self.suffix.endswith('.json'): - if self.is_remote: - self.s = requests.Session() - self.r = self.s.get(self.filename, stream=True) - self.r.raw.decode_content = True - self.f = self.r.raw - else: - self.f = open(self.filename, 'rb') - else: - raise InvalidMRF(f'Suffix not JSON or ZIP: {self.filename=} {self.suffix=}') - - log.info(f'Opened file: {self.filename}') - return self.f - - - - - def __exit__(self, exc_type, exc_val, exc_tb): - if self.is_remote: - self.s.close() - self.r.close() - - self.f.close() + """ + Context manager for opening JSON(.gz/.zip) MRFs. + Usage: + >>> with JSONOpen('localfile.json') as f: + or + >>> with JSONOpen(some_json_url) as f: + including both zipped and unzipped files. + """ + + def __init__(self, filename): + self.filename = filename + self.f = None + self.r = None + self.is_remote = None + + parsed_url = urlparse(self.filename) + self.suffix = ''.join(Path(parsed_url.path).suffixes) + if not self.suffix: + self.suffix = ''.join(Path(parsed_url.query).suffixes) + + if not ( + self.suffix.endswith('.json.gz') or + self.suffix.endswith('.json') or + self.suffix.endswith('.zip') + ): + raise InvalidMRF(f'Suffix not JSON or ZIP: {self.filename=} {self.suffix=}') + + self.is_remote = parsed_url.scheme in ('http', 'https') + + def __enter__(self): + if self.is_remote and self.suffix.endswith('.zip'): + # Download the zip file and store it in memory + response = requests.get(self.filename) + response.raise_for_status() + zip_data = io.BytesIO(response.content) + + # Open the first file in the zip + with zipfile.ZipFile(zip_data) as zip_file: + inner_filename = zip_file.namelist()[0] + self.f = zip_file.open(inner_filename) + elif self.suffix.endswith('.json.gz'): + if self.is_remote: + self.s = requests.Session() + self.r = self.s.get(self.filename, stream=True) + self.f = gzip.GzipFile(fileobj=self.r.raw) + else: + self.f = gzip.open(self.filename, 'rb') + elif self.suffix.endswith('.json'): + if self.is_remote: + self.s = requests.Session() + self.r = self.s.get(self.filename, stream=True) + self.r.raw.decode_content = True + self.f = self.r.raw + else: + self.f = open(self.filename, 'rb') + else: + raise InvalidMRF(f'Suffix not JSON or ZIP: {self.filename=} {self.suffix=}') + + log.info(f'Opened file: {self.filename}') + return self.f + + + + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.is_remote: + self.s.close() + self.r.close() + + self.f.close() def import_csv_to_set(filename: str): - """Imports data as tuples from a given file.""" - items = set() + """Imports data as tuples from a given file.""" + items = set() - with open(filename, 'r') as f: - reader = csv.reader(f) - for row in reader: - row = [col.strip() for col in row] - if len(row) > 1: - items.add(tuple(row)) - else: - item = row.pop() - items.add(item) - return items + with open(filename, 'r') as f: + reader = csv.reader(f) + for row in reader: + row = [col.strip() for col in row] + if len(row) > 1: + items.add(tuple(row)) + else: + item = row.pop() + items.add(item) + return items def make_dir(out_dir): - if not os.path.exists(out_dir): - os.mkdir(out_dir) + if not os.path.exists(out_dir): + os.mkdir(out_dir) def dicthasher(data: dict, n_bytes = 8) -> int: - if not data: - raise Exception("Hashed dictionary can't be empty") + if not data: + raise Exception("Hashed dictionary can't be empty") - data = json.dumps(data, sort_keys=True).encode('utf-8') - hash_s = hashlib.sha256(data).digest()[:n_bytes] - hash_i = int.from_bytes(hash_s, 'little') + data = json.dumps(data, sort_keys=True).encode('utf-8') + hash_s = hashlib.sha256(data).digest()[:n_bytes] + hash_i = int.from_bytes(hash_s, 'little') - return hash_i + return hash_i def append_hash(item: dict, name: str) -> dict: - hash_ = dicthasher(item) - item[name] = hash_ + hash_ = dicthasher(item) + item[name] = hash_ - return item + return item def filename_hasher(filename: str) -> int: - # retrieve/only/this_part_of_the_file.json(.gz) - filename = Path(filename).stem.split('.')[0] - file_row = {'filename': filename} - filename_hash = dicthasher(file_row) + # retrieve/only/this_part_of_the_file.json(.gz) + filename = Path(filename).stem.split('.')[0] + file_row = {'filename': filename} + filename_hash = dicthasher(file_row) - return filename_hash + return filename_hash def validate_url(test_url: str) -> bool: - # https://stackoverflow.com/a/38020041 - try: - result = urlparse(test_url) - return all([result.scheme, result.netloc]) - except: - return False + # https://stackoverflow.com/a/38020041 + try: + result = urlparse(test_url) + return all([result.scheme, result.netloc]) + except: + return False From c4018e03454ded58169215e9f141975f32d7f123 Mon Sep 17 00:00:00 2001 From: CaptainStabs <40151222+CaptainStabs@users.noreply.github.com> Date: Mon, 20 Mar 2023 10:37:41 -0400 Subject: [PATCH 3/3] Update helpers.py --- .../python/mrfutils/src/mrfutils/helpers.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/transparency-in-coverage/python/mrfutils/src/mrfutils/helpers.py b/transparency-in-coverage/python/mrfutils/src/mrfutils/helpers.py index 37e0fa1..c5f2264 100644 --- a/transparency-in-coverage/python/mrfutils/src/mrfutils/helpers.py +++ b/transparency-in-coverage/python/mrfutils/src/mrfutils/helpers.py @@ -9,7 +9,6 @@ from urllib.parse import urlparse import zipfile import io -import heartrate; heartrate.trace(browser=True, daemon=True) import requests @@ -38,8 +37,6 @@ def peek(iterator): return next_, prepend(next_, iterator) -import zipfile - class JSONOpen: """ Context manager for opening JSON(.gz/.zip) MRFs. @@ -103,10 +100,8 @@ def __enter__(self): return self.f - - def __exit__(self, exc_type, exc_val, exc_tb): - if self.is_remote: + if self.is_remote and not self.suffix.endswith('.zip'): self.s.close() self.r.close()