From 0c6cafe81680cca0a2bd5632f337f4dd7a0bca39 Mon Sep 17 00:00:00 2001 From: Tom Mazurek Date: Tue, 9 Mar 2021 19:10:53 -0600 Subject: [PATCH 1/7] Add storage of static data for faster file regeneration --- python_baby_names/.gitignore | 3 +- python_baby_names/question_2.py | 150 +++++++++++++++++++++++++++++--- 2 files changed, 140 insertions(+), 13 deletions(-) diff --git a/python_baby_names/.gitignore b/python_baby_names/.gitignore index 10326c2..01c1529 100644 --- a/python_baby_names/.gitignore +++ b/python_baby_names/.gitignore @@ -1 +1,2 @@ -*.xlsx \ No newline at end of file +*.xlsx +*.pkl \ No newline at end of file diff --git a/python_baby_names/question_2.py b/python_baby_names/question_2.py index c427047..db9702b 100644 --- a/python_baby_names/question_2.py +++ b/python_baby_names/question_2.py @@ -1,8 +1,10 @@ +import collections import mixins import os import pandas as pd import settings + from bs4 import BeautifulSoup @@ -18,6 +20,8 @@ class Script(mixins.BabyNamesMixin): def __init__(self, names_in_report: list, excel_filename: str = '', excel_sheetname: str = ''): + assert isinstance(names_in_report, (list, tuple)), ( + 'The names_in_report must be a list or a tuple.') if excel_filename: assert excel_filename[-5:] == '.xlsx', ( 'The excel_filename must end with ".xlsx"') @@ -26,8 +30,50 @@ def __init__(self, names_in_report: list, excel_filename: str = '', self.excel_sheetname = excel_sheetname def execute_report(self): - # Instantiate the table/dataframe for males. + pkl_filename = self.get_pkl_filename() + already_scraped = pkl_filename in os.listdir(settings.RELATIVE_PATH) + + if not already_scraped: + male_df, female_df = self.dfs_from_html() + else: + male_df, female_df = self.dfs_from_pkl() + + self.save_to_excel([male_df, female_df]) + + def get_pkl_filename(self): + return os.path.basename(__file__).replace('.py', '_data.pkl') + + def get_pandas_data_path(self) -> str: + """ + Returns: + str: The path to the pandas PKL data file. 
+ """ + pkl_filename = self.get_pkl_filename() + return os.path.dirname(os.path.abspath(__file__)) + '\\' + pkl_filename + + def save_pkl(self, data: collections.deque): + """ + Saves a Pandas PKL data file. + + Args: + data (collections.deque): [description] + """ + path = self.get_pandas_data_path() + pd.DataFrame( + data, + columns=['gender', 'year', 'name', 'rank'] + ).to_pickle(path) + + def get_empty_dataframes(self): + """ + Creates both male and a female dataframes. + + Returns: + [tuple]: A 2-tuple containing DataFrames. + """ header_2 = ['Year'] + self.names_in_report + + # Instantiate the table/dataframe for males. male_df = pd.DataFrame(columns=header_2) header_1 = ['Male Name Rankings Per Year'] + len(self.names_in_report) * [''] male_df.columns = pd.MultiIndex.from_tuples(zip(header_1, header_2)) @@ -37,6 +83,19 @@ def execute_report(self): header_1 = ['Female Name Rankings Per Year'] + len(self.names_in_report) * [''] female_df.columns = pd.MultiIndex.from_tuples(zip(header_1, header_2)) + return male_df, female_df + + def dfs_from_html(self): + """ + Scrapes the HTML files found in this file's directory to + generate 2 DataFrames (one for males and one for females). + + Returns: + [tuple]: A 2-tuple containing non-empty DataFrames. + """ + male_df, female_df = self.get_empty_dataframes() + pandas_data = collections.deque() + names_in_report = self.names_in_report filenames, available_years = self.get_filename_info() # Gather the data from the files. @@ -44,7 +103,6 @@ def execute_report(self): html_file = open(f'{settings.RELATIVE_PATH}/{filename}', 'r') contents = html_file.read() soup = BeautifulSoup(contents, 'html.parser') - year = available_years[index] self.validate_year(year, soup) table = self.get_table(soup) @@ -55,8 +113,9 @@ def execute_report(self): # The dictionary element's key will be the name. # The dictionary element's value will be the rank. 
- male_names = {} - female_names = {} + male_names_dict = {} + female_names_dict = {} + for row in rows: # For most of the HTML files, the table rows are missing the # closing tr tags, so I am calling "next" until I get the @@ -65,22 +124,89 @@ def execute_report(self): rank_num = int(str(rank_el)) tag = rank_el.next male_name = tag.text.strip() - male_names[male_name] = rank_num + male_names_dict[male_name] = rank_num female_name = tag.next_sibling.text.strip() - female_names[female_name] = rank_num + female_names_dict[female_name] = rank_num + + pandas_data.extend( + [ + ['m', year, male_name, rank_num], + ['f', year, female_name, rank_num], + ] + ) # Append a new row to the male's table. male_df.loc[len(male_df.index)] = \ - self.add_data_to_row(male_names, [year]) + self.get_all_row_data(male_names_dict, [year]) # Append a new row to the female's table. female_df.loc[len(female_df.index)] = \ - self.add_data_to_row(female_names, [year]) - html_file.close() + self.get_all_row_data(female_names_dict, [year]) - self.save_to_excel([male_df, female_df]) + html_file.close() - def add_data_to_row(self, names: dict, new_row_data: list = []) -> list: + # Save the data in the current directory, so you want have to do run + # scrape the html again. + self.save_pkl(pandas_data) + return male_df, female_df + + def dfs_from_pkl(self): + """ + Uses the PKL file (static data) found in this file's directory to + generate 2 DataFrames (one for males and one for females). + + Much faster than the dfs_from_html method because it is not scraping + data from HTML. Also faster because it is querying the PKL data file. + + Returns: + [tuple]: A 2-tuple containing non-empty DataFrames. + """ + male_df, female_df = self.get_empty_dataframes() + path = self.get_pandas_data_path() + + # Filter and fetch the data from the pkl file. + df_filter = f'name in {str(list(self.names_in_report))}' + # "df" is a very short data set. It is only: + # len(self.names_in_report) * available_years * 2. 
+ # In this example: 3 * 10 * 2 = 60 + df = pd.read_pickle(path).query(df_filter) + nested_data = { + 'm': {}, + 'f': {} + } + + # Transform the data to a structure similar to what was in the + # dfs_from_html method. + for i in df.itertuples(): + try: + # Accessing the year can raise a KeyError. + nested_data[i.gender][i.year][i.name] = i.rank + except KeyError: + nested_data[i.gender][i.year] = {i.name: i.rank} + + for gender, value_dict in nested_data.items(): + if gender == 'm': + for year, value_dict in value_dict.items(): + male_df.loc[len(male_df.index)] = \ + self.get_all_row_data(value_dict, [year]) + else: + for year, value_dict in value_dict.items(): + female_df.loc[len(female_df.index)] = \ + self.get_all_row_data(value_dict, [year]) + + return male_df, female_df + + def get_all_row_data(self, names: dict, new_row_data: list = []) -> list: + """ + Returns 1 row's worth of data that will be appended the end of a table. + + Args: + names (dict): A dictionary of names as keys and ranks as values. + new_row_data (list, optional): D. Defaults to []. 
+ + Returns: + list: 1 row of data + """ for name in self.names_in_report: try: rank = names[name] @@ -89,7 +215,7 @@ def add_data_to_row(self, names: dict, new_row_data: list = []) -> list: new_row_data.append(rank) return new_row_data - def save_to_excel(self, dataframes): + def save_to_excel(self, dataframes: list): output_path = self.get_output_path(__file__) writer = pd.ExcelWriter(output_path, engine='xlsxwriter') row = 0 From 36e2d982f27c698cab30566201d9a85ed306966d Mon Sep 17 00:00:00 2001 From: Tom Mazurek Date: Thu, 11 Mar 2021 18:07:40 -0600 Subject: [PATCH 2/7] Add timer to execute_report function --- python_baby_names/question_2.py | 42 +++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/python_baby_names/question_2.py b/python_baby_names/question_2.py index db9702b..bfbcc89 100644 --- a/python_baby_names/question_2.py +++ b/python_baby_names/question_2.py @@ -1,15 +1,35 @@ import collections import mixins +import functools import os +import timeit import pandas as pd import settings from bs4 import BeautifulSoup +def timer(func): + """Print the runtime of the decorated function.""" NAMES_IN_REPORT = ["Ryan", "Ben", "Eugene"] EXCEL_SHEETNAME = 'Great Report' + @functools.wraps(func) + def wrapper_timer(*args, **kwargs): + start_time = timeit.default_timer() + value = func(*args, **kwargs) + end_time = timeit.default_timer() + elapsed_time = end_time - start_time + console_msg = f'Took {round(elapsed_time, 2)} seconds.' + script_obj = args[0] + if script_obj.generate_excel: + console_msg += ' (Including the time it took to generate the Excel file.)' + else: + console_msg += ' (NOT including the time it took to generate the Excel file.)' + print(console_msg) + return value + + return wrapper_timer class Script(mixins.BabyNamesMixin): @@ -18,8 +38,13 @@ class Script(mixins.BabyNamesMixin): baby names. 
""" - def __init__(self, names_in_report: list, excel_filename: str = '', - excel_sheetname: str = ''): + def __init__( + self, + names_in_report: list, + excel_filename: str = '', + excel_sheetname: str = '', + generate_excel: bool = True, + ): assert isinstance(names_in_report, (list, tuple)), ( 'The names_in_report must be a list or a tuple.') if excel_filename: @@ -28,8 +53,12 @@ def __init__(self, names_in_report: list, excel_filename: str = '', self.excel_filename = excel_filename self.names_in_report = names_in_report self.excel_sheetname = excel_sheetname + self.generate_excel = generate_excel + @timer def execute_report(self): + print() + pkl_filename = self.get_pkl_filename() already_scraped = pkl_filename in os.listdir(settings.RELATIVE_PATH) @@ -38,7 +67,8 @@ def execute_report(self): else: male_df, female_df = self.dfs_from_pkl() - self.save_to_excel([male_df, female_df]) + if self.generate_excel: + self.save_to_excel([male_df, female_df]) def get_pkl_filename(self): return os.path.basename(__file__).replace('.py', '_data.pkl') @@ -231,7 +261,9 @@ def save_to_excel(self, dataframes: list): print(f'Created: {output_path}') -Script( +script = Script( NAMES_IN_REPORT, excel_sheetname=EXCEL_SHEETNAME, -).execute_report() + generate_excel=True +) +script.execute_report() From f480286e2f645ad51422f5a188813fe377757895 Mon Sep 17 00:00:00 2001 From: Tom Mazurek Date: Thu, 11 Mar 2021 18:24:26 -0600 Subject: [PATCH 3/7] Reformat --- python_baby_names/question_2.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/python_baby_names/question_2.py b/python_baby_names/question_2.py index bfbcc89..8763c5c 100644 --- a/python_baby_names/question_2.py +++ b/python_baby_names/question_2.py @@ -1,19 +1,21 @@ import collections -import mixins import functools import os import timeit + import pandas as pd +from bs4 import BeautifulSoup + +import mixins import settings +NAMES_IN_REPORT = ['Ryan', 'Ben', 'Eugene'] +EXCEL_SHEETNAME 
= 'Great Report' -from bs4 import BeautifulSoup def timer(func): """Print the runtime of the decorated function.""" -NAMES_IN_REPORT = ["Ryan", "Ben", "Eugene"] -EXCEL_SHEETNAME = 'Great Report' @functools.wraps(func) def wrapper_timer(*args, **kwargs): start_time = timeit.default_timer() @@ -81,12 +83,12 @@ def get_pandas_data_path(self) -> str: pkl_filename = self.get_pkl_filename() return os.path.dirname(os.path.abspath(__file__)) + '\\' + pkl_filename - def save_pkl(self, data: collections.deque): + def save_pkl(self, data): """ Saves a Pandas PKL data file. Args: - data (collections.deque): [description] + data (Iterable): An iterable in a structure that Pandas understands. """ path = self.get_pandas_data_path() pd.DataFrame( @@ -96,10 +98,10 @@ def save_pkl(self, data: collections.deque): def get_empty_dataframes(self): """ - Creates both male and a female dataframes. + Creates male and a female empty dataframes. Returns: - [tuple]: A 2-tuple containing DataFrames. + [tuple]: A 2-tuple of empty DataFrames. """ header_2 = ['Year'] + self.names_in_report @@ -117,15 +119,14 @@ def get_empty_dataframes(self): def dfs_from_html(self): """ - Scrapes the HTML files found in this file's directory to - generate 2 DataFrames (one for males and one for females). + Scrapes the HTML files found in this file's directory to generate 2 + DataFrames (one for males and one for females). Returns: - [tuple]: A 2-tuple containing non-empty DataFrames. + [tuple]: A 2-tuple of non-empty DataFrames. """ male_df, female_df = self.get_empty_dataframes() pandas_data = collections.deque() - names_in_report = self.names_in_report filenames, available_years = self.get_filename_info() # Gather the data from the files. 
From 0a8df9a535e7591ce7c21cac392fa24c9abf76f5 Mon Sep 17 00:00:00 2001 From: Tom Mazurek Date: Thu, 11 Mar 2021 18:25:24 -0600 Subject: [PATCH 4/7] Fix undefined variable named "filename" --- python_baby_names/mixins.py | 10 ++++++---- python_baby_names/question_2.py | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/python_baby_names/mixins.py b/python_baby_names/mixins.py index d9581aa..f61712c 100644 --- a/python_baby_names/mixins.py +++ b/python_baby_names/mixins.py @@ -91,28 +91,30 @@ def validate_year(self, year: int, soup): assert year_in_title == year, ( f'Year "{year_in_title}" !== "{year}"') - def get_table(self, soup): + def get_table(self, soup, filename: str): """ Finds the correct table within BeautifulSoup object and validates it before it is returned. Args: soup (BeautifulSoup): The soup of the entire page. + filename (str): The name of the file. Returns: [BeautifulSoup]: The HTML table element within the soup. """ # In all cases, the data we need is in the 3rd table. table = soup.find_all('table')[2] - self.validate_table_columns(table) + self.validate_table_columns(table, filename) return table - def validate_table_columns(self, table): + def validate_table_columns(self, table, filename: str): """ - Validate that the table's columns are in the expected order. + Validates that the table's columns are in the expected order. Args: table (BeautifulSoup): The table object found in the HTML file. + filename (str): The name of the file. """ # Make sure the order of the columns are what is expected. 
table_header = table.select_one('tr') diff --git a/python_baby_names/question_2.py b/python_baby_names/question_2.py index 8763c5c..d79db83 100644 --- a/python_baby_names/question_2.py +++ b/python_baby_names/question_2.py @@ -136,7 +136,7 @@ def dfs_from_html(self): soup = BeautifulSoup(contents, 'html.parser') year = available_years[index] self.validate_year(year, soup) - table = self.get_table(soup) + table = self.get_table(soup, filename) # Find all the rows in the table. # The first row is the header, so skip it. From a3b1cc6bcaa1a3eb36019d18691261cd6d24e65c Mon Sep 17 00:00:00 2001 From: Tom Mazurek Date: Sat, 13 Mar 2021 10:49:40 -0600 Subject: [PATCH 5/7] Optimize the parsing of the HTML --- Pipfile | 1 + Pipfile.lock | 45 ++++++++++++++++++++++++++++++++- python_baby_names/question_2.py | 2 +- 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/Pipfile b/Pipfile index 1e089c1..784fe96 100644 --- a/Pipfile +++ b/Pipfile @@ -7,6 +7,7 @@ name = "pypi" beautifulsoup4 = "*" pandas = "*" xlsxwriter = "*" +lxml = "*" [dev-packages] diff --git a/Pipfile.lock b/Pipfile.lock index a6b7880..7af400f 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "4a37889d0ac50a8f82bdb9700bdbb153e96c7d259ee4d6768af3f7858e3a3c01" + "sha256": "c021e31f0cb26a656fa12813871b2dea68debbed7a87b35b2c5017d2397c47aa" }, "pipfile-spec": 6, "requires": { @@ -25,6 +25,49 @@ "index": "pypi", "version": "==4.9.3" }, + "lxml": { + "hashes": [ + "sha256:0448576c148c129594d890265b1a83b9cd76fd1f0a6a04620753d9a6bcfd0a4d", + "sha256:127f76864468d6630e1b453d3ffbbd04b024c674f55cf0a30dc2595137892d37", + "sha256:1471cee35eba321827d7d53d104e7b8c593ea3ad376aa2df89533ce8e1b24a01", + "sha256:2363c35637d2d9d6f26f60a208819e7eafc4305ce39dc1d5005eccc4593331c2", + "sha256:2e5cc908fe43fe1aa299e58046ad66981131a66aea3129aac7770c37f590a644", + "sha256:2e6fd1b8acd005bd71e6c94f30c055594bbd0aa02ef51a22bbfa961ab63b2d75", + 
"sha256:366cb750140f221523fa062d641393092813b81e15d0e25d9f7c6025f910ee80", + "sha256:42ebca24ba2a21065fb546f3e6bd0c58c3fe9ac298f3a320147029a4850f51a2", + "sha256:4e751e77006da34643ab782e4a5cc21ea7b755551db202bc4d3a423b307db780", + "sha256:4fb85c447e288df535b17ebdebf0ec1cf3a3f1a8eba7e79169f4f37af43c6b98", + "sha256:50c348995b47b5a4e330362cf39fc503b4a43b14a91c34c83b955e1805c8e308", + "sha256:535332fe9d00c3cd455bd3dd7d4bacab86e2d564bdf7606079160fa6251caacf", + "sha256:535f067002b0fd1a4e5296a8f1bf88193080ff992a195e66964ef2a6cfec5388", + "sha256:5be4a2e212bb6aa045e37f7d48e3e1e4b6fd259882ed5a00786f82e8c37ce77d", + "sha256:60a20bfc3bd234d54d49c388950195d23a5583d4108e1a1d47c9eef8d8c042b3", + "sha256:648914abafe67f11be7d93c1a546068f8eff3c5fa938e1f94509e4a5d682b2d8", + "sha256:681d75e1a38a69f1e64ab82fe4b1ed3fd758717bed735fb9aeaa124143f051af", + "sha256:68a5d77e440df94011214b7db907ec8f19e439507a70c958f750c18d88f995d2", + "sha256:69a63f83e88138ab7642d8f61418cf3180a4d8cd13995df87725cb8b893e950e", + "sha256:6e4183800f16f3679076dfa8abf2db3083919d7e30764a069fb66b2b9eff9939", + "sha256:6fd8d5903c2e53f49e99359b063df27fdf7acb89a52b6a12494208bf61345a03", + "sha256:791394449e98243839fa822a637177dd42a95f4883ad3dec2a0ce6ac99fb0a9d", + "sha256:7a7669ff50f41225ca5d6ee0a1ec8413f3a0d8aa2b109f86d540887b7ec0d72a", + "sha256:7e9eac1e526386df7c70ef253b792a0a12dd86d833b1d329e038c7a235dfceb5", + "sha256:7ee8af0b9f7de635c61cdd5b8534b76c52cd03536f29f51151b377f76e214a1a", + "sha256:8246f30ca34dc712ab07e51dc34fea883c00b7ccb0e614651e49da2c49a30711", + "sha256:8c88b599e226994ad4db29d93bc149aa1aff3dc3a4355dd5757569ba78632bdf", + "sha256:923963e989ffbceaa210ac37afc9b906acebe945d2723e9679b643513837b089", + "sha256:94d55bd03d8671686e3f012577d9caa5421a07286dd351dfef64791cf7c6c505", + "sha256:97db258793d193c7b62d4e2586c6ed98d51086e93f9a3af2b2034af01450a74b", + "sha256:a9d6bc8642e2c67db33f1247a77c53476f3a166e09067c0474facb045756087f", + "sha256:cd11c7e8d21af997ee8079037fff88f16fda188a9776eb4b81c7e4c9c0a7d7fc", 
+ "sha256:d8d3d4713f0c28bdc6c806a278d998546e8efc3498949e3ace6e117462ac0a5e", + "sha256:e0bfe9bb028974a481410432dbe1b182e8191d5d40382e5b8ff39cdd2e5c5931", + "sha256:f4822c0660c3754f1a41a655e37cb4dbbc9be3d35b125a37fab6f82d47674ebc", + "sha256:f83d281bb2a6217cd806f4cf0ddded436790e66f393e124dfe9731f6b3fb9afe", + "sha256:fc37870d6716b137e80d19241d0e2cff7a7643b925dfa49b4c8ebd1295eb506e" + ], + "index": "pypi", + "version": "==4.6.2" + }, "numpy": { "hashes": [ "sha256:032be656d89bbf786d743fee11d01ef318b0781281241997558fa7950028dd29", diff --git a/python_baby_names/question_2.py b/python_baby_names/question_2.py index d79db83..3436bef 100644 --- a/python_baby_names/question_2.py +++ b/python_baby_names/question_2.py @@ -133,7 +133,7 @@ def dfs_from_html(self): for index, filename in enumerate(filenames): html_file = open(f'{settings.RELATIVE_PATH}/{filename}', 'r') contents = html_file.read() - soup = BeautifulSoup(contents, 'html.parser') + soup = BeautifulSoup(contents, 'lxml') year = available_years[index] self.validate_year(year, soup) table = self.get_table(soup, filename) From 899eef2a69b809d5750ef675f64cca7b4bb6e05e Mon Sep 17 00:00:00 2001 From: Tom Mazurek Date: Sat, 13 Mar 2021 10:51:59 -0600 Subject: [PATCH 6/7] Reformat --- python_baby_names/mixins.py | 28 +++++++++++++------------ python_baby_names/question_2.py | 36 ++++++++++++--------------------- 2 files changed, 28 insertions(+), 36 deletions(-) diff --git a/python_baby_names/mixins.py b/python_baby_names/mixins.py index f61712c..56fce00 100644 --- a/python_baby_names/mixins.py +++ b/python_baby_names/mixins.py @@ -1,32 +1,34 @@ import os -import settings from bs4 import Tag +import settings + class BabyNamesMixin: header_tags = settings.HEADER_TAGS excel_filename = '' - def get_output_path(self, file_dundar: str) -> str: + def get_output_path(self, file_dunder: str) -> str: """ - Returns the path of the file (including the filename within the path) - that will be created. 
+ Returns the path of the file that will be created (including the + filename within the path). Args: - file_dundar (str): + file_dunder (str): The __file__ value that's available in every Python file. Returns: - [str]: The path of the output. + str: The path of the output. """ excel_filename = ( self.excel_filename or - os.path.basename(file_dundar).replace('.py', '_report.xlsx') + os.path.basename(file_dunder).replace('.py', '_report.xlsx') ) output_path = ( - os.path.dirname(os.path.abspath(file_dundar)) + '\\' + excel_filename) + os.path.dirname(os.path.abspath(file_dunder)) + '\\' + excel_filename + ) return output_path def get_filename_info(self) -> tuple: @@ -35,7 +37,7 @@ def get_filename_info(self) -> tuple: years that are within those filenames. Returns: - tuple: filenames, available years + tuple: filenames, available_years """ available_years = [] @@ -52,8 +54,8 @@ def get_filename_info(self) -> tuple: if settings.FILENAME_PREFIX == prefix2 and settings.FILE_TYPE == file_type2: # Get the year from the middle of the string. year = filename[prefix_len: -file_type_len] - # If there are any non-digit characters, it means - # that the filename is not in the correct format. + # If there are any non-digit characters, it means that the + # filename is not in the correct format. assert year.isdigit(), ( f'File "{filename}" is not in a valid format.') assert len(year) == 4 @@ -89,7 +91,7 @@ def validate_year(self, year: int, soup): # It will be a long time until there more than 4 digits are in a year. assert len(year_in_title) == 4 assert year_in_title == year, ( - f'Year "{year_in_title}" !== "{year}"') + f'Year "{year_in_title}" != "{year}"') def get_table(self, soup, filename: str): """ @@ -101,7 +103,7 @@ def get_table(self, soup, filename: str): filename (str): The name of the file. Returns: - [BeautifulSoup]: The HTML table element within the soup. + BeautifulSoup: The HTML table element within the soup. 
""" # In all cases, the data we need is in the 3rd table. table = soup.find_all('table')[2] diff --git a/python_baby_names/question_2.py b/python_baby_names/question_2.py index 3436bef..92f2bb7 100644 --- a/python_baby_names/question_2.py +++ b/python_baby_names/question_2.py @@ -45,7 +45,7 @@ def __init__( names_in_report: list, excel_filename: str = '', excel_sheetname: str = '', - generate_excel: bool = True, + generate_excel: bool = True ): assert isinstance(names_in_report, (list, tuple)), ( 'The names_in_report must be a list or a tuple.') @@ -65,8 +65,10 @@ def execute_report(self): already_scraped = pkl_filename in os.listdir(settings.RELATIVE_PATH) if not already_scraped: + print('Scraping HTML files for data.') male_df, female_df = self.dfs_from_html() else: + print('Using the stored data in the Pandas PKL file.') male_df, female_df = self.dfs_from_pkl() if self.generate_excel: @@ -77,8 +79,7 @@ def get_pkl_filename(self): def get_pandas_data_path(self) -> str: """ - Returns: - str: The path to the pandas PKL data file. + Returns the path to the Pandas PKL data file. """ pkl_filename = self.get_pkl_filename() return os.path.dirname(os.path.abspath(__file__)) + '\\' + pkl_filename @@ -97,12 +98,6 @@ def save_pkl(self, data): ).to_pickle(path) def get_empty_dataframes(self): - """ - Creates male and a female empty dataframes. - - Returns: - [tuple]: A 2-tuple of empty DataFrames. - """ header_2 = ['Year'] + self.names_in_report # Instantiate the table/dataframe for males. @@ -123,7 +118,7 @@ def dfs_from_html(self): DataFrames (one for males and one for females). Returns: - [tuple]: A 2-tuple of non-empty DataFrames. + tuple: A 2-tuple of non-empty DataFrames. """ male_df, female_df = self.get_empty_dataframes() pandas_data = collections.deque() @@ -176,8 +171,8 @@ def dfs_from_html(self): html_file.close() - # Save the data in the current directory, so you want have to do run - # scrape the html again. 
+ # Save the data in the current directory, so you won't have to scrape + # the HTML again. self.save_pkl(pandas_data) return male_df, female_df @@ -190,12 +185,12 @@ def dfs_from_pkl(self): data from HTML. Also faster because it is querying the PKL data file. Returns: - [tuple]: A 2-tuple containing non-empty DataFrames. + tuple: A 2-tuple containing non-empty DataFrames. """ male_df, female_df = self.get_empty_dataframes() path = self.get_pandas_data_path() - # Filter and fetch the data from the pkl file. + # Filter and fetch the data from the PKL file. df_filter = f'name in {str(list(self.names_in_report))}' # "df" is a very short data set. It is only: # len(self.names_in_report) * available_years * 2. @@ -206,7 +201,7 @@ def dfs_from_pkl(self): 'f': {} } - # Transform the data to a structure similar to what was in the + # Transform the queried data to a structure similar to what is in the # dfs_from_html method. for i in df.itertuples(): try: @@ -218,10 +213,12 @@ def dfs_from_pkl(self): for gender, value_dict in nested_data.items(): if gender == 'm': for year, value_dict in value_dict.items(): + # Append a new row to the male's table. male_df.loc[len(male_df.index)] = \ self.get_all_row_data(value_dict, [year]) else: for year, value_dict in value_dict.items(): + # Append a new row to the female's table. female_df.loc[len(female_df.index)] = \ self.get_all_row_data(value_dict, [year]) @@ -229,14 +226,7 @@ def dfs_from_pkl(self): def get_all_row_data(self, names: dict, new_row_data: list = []) -> list: """ - Returns 1 row's worth of data that will be appended the end of a table. - - Args: - names (dict): A dictionary of names as keys and ranks as values. - new_row_data (list, optional): D. Defaults to []. - - Returns: - list: 1 row of data + Returns one row of data to be appended to the end of a table. 
""" for name in self.names_in_report: try: From ceedc4598ccf5245db38d1e57d5a7468e712c765 Mon Sep 17 00:00:00 2001 From: Tom Mazurek Date: Sat, 13 Mar 2021 10:52:31 -0600 Subject: [PATCH 7/7] Refactor --- python_baby_names/mixins.py | 5 ++--- python_baby_names/question_2.py | 22 ++++++++++------------ 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/python_baby_names/mixins.py b/python_baby_names/mixins.py index 56fce00..a0301c2 100644 --- a/python_baby_names/mixins.py +++ b/python_baby_names/mixins.py @@ -78,10 +78,9 @@ def validate_year(self, year: int, soup): title_el = soup.select_one(tag) try: title_text = title_el.text - except AttributeError: - continue - else: break + except AttributeError: + pass # Make sure the year in the filename is the same as the year in the # title. diff --git a/python_baby_names/question_2.py b/python_baby_names/question_2.py index 92f2bb7..d679128 100644 --- a/python_baby_names/question_2.py +++ b/python_baby_names/question_2.py @@ -17,21 +17,21 @@ def timer(func): """Print the runtime of the decorated function.""" @functools.wraps(func) - def wrapper_timer(*args, **kwargs): + def wrapper(*args, **kwargs): start_time = timeit.default_timer() value = func(*args, **kwargs) end_time = timeit.default_timer() elapsed_time = end_time - start_time - console_msg = f'Took {round(elapsed_time, 2)} seconds.' + msg = f'Took {round(elapsed_time, 2)} seconds.' 
script_obj = args[0] if script_obj.generate_excel: - console_msg += ' (Including the time it took to generate the Excel file.)' + msg += ' (Including the time it took to generate the Excel file.)' else: - console_msg += ' (NOT including the time it took to generate the Excel file.)' - print(console_msg) + msg += ' (NOT including the time it took to generate the Excel file.)' + print(msg) return value - return wrapper_timer + return wrapper class Script(mixins.BabyNamesMixin): @@ -47,8 +47,6 @@ def __init__( excel_sheetname: str = '', generate_excel: bool = True ): - assert isinstance(names_in_report, (list, tuple)), ( - 'The names_in_report must be a list or a tuple.') if excel_filename: assert excel_filename[-5:] == '.xlsx', ( 'The excel_filename must end with ".xlsx"') @@ -77,7 +75,7 @@ def execute_report(self): def get_pkl_filename(self): return os.path.basename(__file__).replace('.py', '_data.pkl') - def get_pandas_data_path(self) -> str: + def get_pkl_path(self) -> str: """ Returns the path to the Pandas PKL data file. """ @@ -91,7 +89,7 @@ def save_pkl(self, data): Args: data (Iterable): An iterable in a structure that Pandas understands. """ - path = self.get_pandas_data_path() + path = self.get_pkl_path() pd.DataFrame( data, columns=['gender', 'year', 'name', 'rank'] @@ -188,14 +186,14 @@ def dfs_from_pkl(self): tuple: A 2-tuple containing non-empty DataFrames. """ male_df, female_df = self.get_empty_dataframes() - path = self.get_pandas_data_path() + pkl_path = self.get_pkl_path() # Filter and fetch the data from the PKL file. df_filter = f'name in {str(list(self.names_in_report))}' # "df" is a very short data set. It is only: # len(self.names_in_report) * available_years * 2. # In this example: 3 * 10 * 2 = 60 - df = pd.read_pickle(path).query(df_filter) + df = pd.read_pickle(pkl_path).query(df_filter) nested_data = { 'm': {}, 'f': {}