diff --git a/Pipfile b/Pipfile index 1e089c1..784fe96 100644 --- a/Pipfile +++ b/Pipfile @@ -7,6 +7,7 @@ name = "pypi" beautifulsoup4 = "*" pandas = "*" xlsxwriter = "*" +lxml = "*" [dev-packages] diff --git a/Pipfile.lock b/Pipfile.lock index a6b7880..7af400f 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "4a37889d0ac50a8f82bdb9700bdbb153e96c7d259ee4d6768af3f7858e3a3c01" + "sha256": "c021e31f0cb26a656fa12813871b2dea68debbed7a87b35b2c5017d2397c47aa" }, "pipfile-spec": 6, "requires": { @@ -25,6 +25,49 @@ "index": "pypi", "version": "==4.9.3" }, + "lxml": { + "hashes": [ + "sha256:0448576c148c129594d890265b1a83b9cd76fd1f0a6a04620753d9a6bcfd0a4d", + "sha256:127f76864468d6630e1b453d3ffbbd04b024c674f55cf0a30dc2595137892d37", + "sha256:1471cee35eba321827d7d53d104e7b8c593ea3ad376aa2df89533ce8e1b24a01", + "sha256:2363c35637d2d9d6f26f60a208819e7eafc4305ce39dc1d5005eccc4593331c2", + "sha256:2e5cc908fe43fe1aa299e58046ad66981131a66aea3129aac7770c37f590a644", + "sha256:2e6fd1b8acd005bd71e6c94f30c055594bbd0aa02ef51a22bbfa961ab63b2d75", + "sha256:366cb750140f221523fa062d641393092813b81e15d0e25d9f7c6025f910ee80", + "sha256:42ebca24ba2a21065fb546f3e6bd0c58c3fe9ac298f3a320147029a4850f51a2", + "sha256:4e751e77006da34643ab782e4a5cc21ea7b755551db202bc4d3a423b307db780", + "sha256:4fb85c447e288df535b17ebdebf0ec1cf3a3f1a8eba7e79169f4f37af43c6b98", + "sha256:50c348995b47b5a4e330362cf39fc503b4a43b14a91c34c83b955e1805c8e308", + "sha256:535332fe9d00c3cd455bd3dd7d4bacab86e2d564bdf7606079160fa6251caacf", + "sha256:535f067002b0fd1a4e5296a8f1bf88193080ff992a195e66964ef2a6cfec5388", + "sha256:5be4a2e212bb6aa045e37f7d48e3e1e4b6fd259882ed5a00786f82e8c37ce77d", + "sha256:60a20bfc3bd234d54d49c388950195d23a5583d4108e1a1d47c9eef8d8c042b3", + "sha256:648914abafe67f11be7d93c1a546068f8eff3c5fa938e1f94509e4a5d682b2d8", + "sha256:681d75e1a38a69f1e64ab82fe4b1ed3fd758717bed735fb9aeaa124143f051af", + "sha256:68a5d77e440df94011214b7db907ec8f19e439507a70c958f750c18d88f995d2", + "sha256:69a63f83e88138ab7642d8f61418cf3180a4d8cd13995df87725cb8b893e950e", + "sha256:6e4183800f16f3679076dfa8abf2db3083919d7e30764a069fb66b2b9eff9939", + "sha256:6fd8d5903c2e53f49e99359b063df27fdf7acb89a52b6a12494208bf61345a03", + "sha256:791394449e98243839fa822a637177dd42a95f4883ad3dec2a0ce6ac99fb0a9d", + "sha256:7a7669ff50f41225ca5d6ee0a1ec8413f3a0d8aa2b109f86d540887b7ec0d72a", + "sha256:7e9eac1e526386df7c70ef253b792a0a12dd86d833b1d329e038c7a235dfceb5", + "sha256:7ee8af0b9f7de635c61cdd5b8534b76c52cd03536f29f51151b377f76e214a1a", + "sha256:8246f30ca34dc712ab07e51dc34fea883c00b7ccb0e614651e49da2c49a30711", + "sha256:8c88b599e226994ad4db29d93bc149aa1aff3dc3a4355dd5757569ba78632bdf", + "sha256:923963e989ffbceaa210ac37afc9b906acebe945d2723e9679b643513837b089", + "sha256:94d55bd03d8671686e3f012577d9caa5421a07286dd351dfef64791cf7c6c505", + "sha256:97db258793d193c7b62d4e2586c6ed98d51086e93f9a3af2b2034af01450a74b", + "sha256:a9d6bc8642e2c67db33f1247a77c53476f3a166e09067c0474facb045756087f", + "sha256:cd11c7e8d21af997ee8079037fff88f16fda188a9776eb4b81c7e4c9c0a7d7fc", + "sha256:d8d3d4713f0c28bdc6c806a278d998546e8efc3498949e3ace6e117462ac0a5e", + "sha256:e0bfe9bb028974a481410432dbe1b182e8191d5d40382e5b8ff39cdd2e5c5931", + "sha256:f4822c0660c3754f1a41a655e37cb4dbbc9be3d35b125a37fab6f82d47674ebc", + "sha256:f83d281bb2a6217cd806f4cf0ddded436790e66f393e124dfe9731f6b3fb9afe", + "sha256:fc37870d6716b137e80d19241d0e2cff7a7643b925dfa49b4c8ebd1295eb506e" + ], + "index": "pypi", + "version": "==4.6.2" + }, "numpy": { "hashes": [ "sha256:032be656d89bbf786d743fee11d01ef318b0781281241997558fa7950028dd29", diff --git a/python_baby_names/.gitignore b/python_baby_names/.gitignore index 10326c2..01c1529 100644 --- a/python_baby_names/.gitignore +++ b/python_baby_names/.gitignore @@ -1 +1,2 @@ -*.xlsx \ No newline at end of file +*.xlsx +*.pkl \ No newline at end of file diff --git a/python_baby_names/mixins.py b/python_baby_names/mixins.py index d9581aa..a0301c2 100644 --- a/python_baby_names/mixins.py +++ b/python_baby_names/mixins.py @@ -1,32 +1,34 @@ import os -import settings from bs4 import Tag +import settings + class BabyNamesMixin: header_tags = settings.HEADER_TAGS excel_filename = '' - def get_output_path(self, file_dundar: str) -> str: + def get_output_path(self, file_dunder: str) -> str: """ - Returns the path of the file (including the filename within the path) - that will be created. + Returns the path of the file that will be created (including the + filename within the path). Args: - file_dundar (str): + file_dunder (str): The __file__ value that's available in every Python file. Returns: - [str]: The path of the output. + str: The path of the output. """ excel_filename = ( self.excel_filename or - os.path.basename(file_dundar).replace('.py', '_report.xlsx') + os.path.basename(file_dunder).replace('.py', '_report.xlsx') ) output_path = ( - os.path.dirname(os.path.abspath(file_dundar)) + '\\' + excel_filename) + os.path.dirname(os.path.abspath(file_dunder)) + '\\' + excel_filename + ) return output_path def get_filename_info(self) -> tuple: @@ -35,7 +37,7 @@ def get_filename_info(self) -> tuple: years that are within those filenames. Returns: - tuple: filenames, available years + tuple: filenames, available_years """ available_years = [] @@ -52,8 +54,8 @@ def get_filename_info(self) -> tuple: if settings.FILENAME_PREFIX == prefix2 and settings.FILE_TYPE == file_type2: # Get the year from the middle of the string. year = filename[prefix_len: -file_type_len] - # If there are any non-digit characters, it means - # that the filename is not in the correct format. + # If there are any non-digit characters, it means that the + # filename is not in the correct format. assert year.isdigit(), ( f'File "{filename}" is not in a valid format.') assert len(year) == 4 @@ -76,10 +78,9 @@ def validate_year(self, year: int, soup): title_el = soup.select_one(tag) try: title_text = title_el.text - except AttributeError: - continue - else: break + except AttributeError: + pass # Make sure the year in the filename is the same as the year in the # title. @@ -89,30 +90,32 @@ def validate_year(self, year: int, soup): # It will be a long time until there more than 4 digits are in a year. assert len(year_in_title) == 4 assert year_in_title == year, ( - f'Year "{year_in_title}" !== "{year}"') + f'Year "{year_in_title}" != "{year}"') - def get_table(self, soup): + def get_table(self, soup, filename: str): """ Finds the correct table within BeautifulSoup object and validates it before it is returned. Args: soup (BeautifulSoup): The soup of the entire page. + filename (str): The name of the file. Returns: - [BeautifulSoup]: The HTML table element within the soup. + BeautifulSoup: The HTML table element within the soup. """ # In all cases, the data we need is in the 3rd table. table = soup.find_all('table')[2] - self.validate_table_columns(table) + self.validate_table_columns(table, filename) return table - def validate_table_columns(self, table): + def validate_table_columns(self, table, filename: str): """ - Validate that the table's columns are in the expected order. + Validates that the table's columns are in the expected order. Args: table (BeautifulSoup): The table object found in the HTML file. + filename (str): The name of the file. """ # Make sure the order of the columns are what is expected. table_header = table.select_one('tr') diff --git a/python_baby_names/question_2.py b/python_baby_names/question_2.py index c427047..d679128 100644 --- a/python_baby_names/question_2.py +++ b/python_baby_names/question_2.py @@ -1,33 +1,104 @@ -import mixins +import collections +import functools import os -import pandas as pd -import settings +import timeit +import pandas as pd from bs4 import BeautifulSoup +import mixins +import settings -NAMES_IN_REPORT = ["Ryan", "Ben", "Eugene"] +NAMES_IN_REPORT = ['Ryan', 'Ben', 'Eugene'] EXCEL_SHEETNAME = 'Great Report' +def timer(func): + """Print the runtime of the decorated function.""" + + @functools.wraps(func) + def wrapper(*args, **kwargs): + start_time = timeit.default_timer() + value = func(*args, **kwargs) + end_time = timeit.default_timer() + elapsed_time = end_time - start_time + msg = f'Took {round(elapsed_time, 2)} seconds.' + script_obj = args[0] + if script_obj.generate_excel: + msg += ' (Including the time it took to generate the Excel file.)' + else: + msg += ' (NOT including the time it took to generate the Excel file.)' + print(msg) + return value + + return wrapper + + class Script(mixins.BabyNamesMixin): """ This script details the rankings over the available years for specific baby names. """ - def __init__(self, names_in_report: list, excel_filename: str = '', - excel_sheetname: str = ''): + def __init__( + self, + names_in_report: list, + excel_filename: str = '', + excel_sheetname: str = '', + generate_excel: bool = True + ): if excel_filename: assert excel_filename[-5:] == '.xlsx', ( 'The excel_filename must end with ".xlsx"') self.excel_filename = excel_filename self.names_in_report = names_in_report self.excel_sheetname = excel_sheetname + self.generate_excel = generate_excel + @timer def execute_report(self): - # Instantiate the table/dataframe for males. + print() + + pkl_filename = self.get_pkl_filename() + already_scraped = pkl_filename in os.listdir(settings.RELATIVE_PATH) + + if not already_scraped: + print('Scraping HTML files for data.') + male_df, female_df = self.dfs_from_html() + else: + print('Using the stored data in the Pandas PKL file.') + male_df, female_df = self.dfs_from_pkl() + + if self.generate_excel: + self.save_to_excel([male_df, female_df]) + + def get_pkl_filename(self): + return os.path.basename(__file__).replace('.py', '_data.pkl') + + def get_pkl_path(self) -> str: + """ + Returns the path to the Pandas PKL data file. + """ + pkl_filename = self.get_pkl_filename() + return os.path.dirname(os.path.abspath(__file__)) + '\\' + pkl_filename + + def save_pkl(self, data): + """ + Saves a Pandas PKL data file. + + Args: + data (Iterable): An iterable in a structure that Pandas understands. + """ + path = self.get_pkl_path() + pd.DataFrame( + data, + columns=['gender', 'year', 'name', 'rank'] + ).to_pickle(path) + + def get_empty_dataframes(self): header_2 = ['Year'] + self.names_in_report + + # Instantiate the table/dataframe for males. male_df = pd.DataFrame(columns=header_2) header_1 = ['Male Name Rankings Per Year'] + len(self.names_in_report) * [''] male_df.columns = pd.MultiIndex.from_tuples(zip(header_1, header_2)) @@ -37,17 +108,28 @@ def execute_report(self): header_1 = ['Female Name Rankings Per Year'] + len(self.names_in_report) * [''] female_df.columns = pd.MultiIndex.from_tuples(zip(header_1, header_2)) + return male_df, female_df + + def dfs_from_html(self): + """ + Scrapes the HTML files found in this file's directory to generate 2 + DataFrames (one for males and one for females). + + Returns: + tuple: A 2-tuple of non-empty DataFrames. + """ + male_df, female_df = self.get_empty_dataframes() + pandas_data = collections.deque() filenames, available_years = self.get_filename_info() # Gather the data from the files. for index, filename in enumerate(filenames): html_file = open(f'{settings.RELATIVE_PATH}/{filename}', 'r') contents = html_file.read() - soup = BeautifulSoup(contents, 'html.parser') - + soup = BeautifulSoup(contents, 'lxml') year = available_years[index] self.validate_year(year, soup) - table = self.get_table(soup) + table = self.get_table(soup, filename) # Find all the rows in the table. # The first row is the header, so skip it. @@ -55,8 +137,9 @@ def execute_report(self): # The dictionary element's key will be the name. # The dictionary element's value will be the rank. - male_names = {} - female_names = {} + male_names_dict = {} + female_names_dict = {} + for row in rows: # For most of the HTML files, the table rows are missing the # closing tr tags, so I am calling "next" until I get the @@ -65,22 +148,84 @@ def execute_report(self): rank_num = int(str(rank_el)) tag = rank_el.next male_name = tag.text.strip() - male_names[male_name] = rank_num + male_names_dict[male_name] = rank_num female_name = tag.next_sibling.text.strip() - female_names[female_name] = rank_num + female_names_dict[female_name] = rank_num + + pandas_data.extend( + [ + ['m', year, male_name, rank_num], + ['f', year, female_name, rank_num], + ] + ) # Append a new row to the male's table. male_df.loc[len(male_df.index)] = \ - self.add_data_to_row(male_names, [year]) + self.get_all_row_data(male_names_dict, [year]) # Append a new row to the female's table. female_df.loc[len(female_df.index)] = \ - self.add_data_to_row(female_names, [year]) + self.get_all_row_data(female_names_dict, [year]) + html_file.close() - self.save_to_excel([male_df, female_df]) + # Save the data in the current directory, so you won't have to scrape + # the HTML again. + self.save_pkl(pandas_data) + return male_df, female_df + + def dfs_from_pkl(self): + """ + Uses the PKL file (static data) found in this file's directory to + generate 2 DataFrames (one for males and one for females). + + Much faster than the dfs_from_html method because it is not scraping + data from HTML. Also faster because it is querying the PKL data file. + + Returns: + tuple: A 2-tuple containing non-empty DataFrames. + """ + male_df, female_df = self.get_empty_dataframes() + pkl_path = self.get_pkl_path() + + # Filter and fetch the data from the PKL file. + df_filter = f'name in {str(list(self.names_in_report))}' + # "df" is a very short data set. It is only: + # len(self.names_in_report) * available_years * 2. + # In this example: 3 * 10 * 2 = 60 + df = pd.read_pickle(pkl_path).query(df_filter) + nested_data = { + 'm': {}, + 'f': {} + } + + # Transform the queried data to a structure similar to what is in the + # dfs_from_html method. + for i in df.itertuples(): + try: + # Accessing the year can raise a KeyError. + nested_data[i.gender][i.year][i.name] = i.rank + except KeyError: + nested_data[i.gender][i.year] = {i.name: i.rank} + + for gender, value_dict in nested_data.items(): + if gender == 'm': + for year, value_dict in value_dict.items(): + # Append a new row to the male's table. + male_df.loc[len(male_df.index)] = \ + self.get_all_row_data(value_dict, [year]) + else: + for year, value_dict in value_dict.items(): + # Append a new row to the female's table. + female_df.loc[len(female_df.index)] = \ + self.get_all_row_data(value_dict, [year]) + + return male_df, female_df - def add_data_to_row(self, names: dict, new_row_data: list = []) -> list: + def get_all_row_data(self, names: dict, new_row_data: list = []) -> list: + """ + Creates and returns two empty dataframes. + """ for name in self.names_in_report: try: rank = names[name] @@ -89,7 +234,7 @@ def add_data_to_row(self, names: dict, new_row_data: list = []) -> list: new_row_data.append(rank) return new_row_data - def save_to_excel(self, dataframes): + def save_to_excel(self, dataframes: list): output_path = self.get_output_path(__file__) writer = pd.ExcelWriter(output_path, engine='xlsxwriter') row = 0 @@ -105,7 +250,9 @@ def save_to_excel(self, dataframes): print(f'Created: {output_path}') -Script( +script = Script( NAMES_IN_REPORT, excel_sheetname=EXCEL_SHEETNAME, -).execute_report() + generate_excel=True +) +script.execute_report()