From 0c6cafe81680cca0a2bd5632f337f4dd7a0bca39 Mon Sep 17 00:00:00 2001
From: Tom Mazurek <tom@quantfxcapital.com>
Date: Tue, 9 Mar 2021 19:10:53 -0600
Subject: [PATCH 1/7] Add storage of static data for faster file regeneration

---
 python_baby_names/.gitignore    |   3 +-
 python_baby_names/question_2.py | 150 +++++++++++++++++++++++++++++---
 2 files changed, 140 insertions(+), 13 deletions(-)

diff --git a/python_baby_names/.gitignore b/python_baby_names/.gitignore
index 10326c2..01c1529 100644
--- a/python_baby_names/.gitignore
+++ b/python_baby_names/.gitignore
@@ -1 +1,2 @@
-*.xlsx
\ No newline at end of file
+*.xlsx
+*.pkl
\ No newline at end of file
diff --git a/python_baby_names/question_2.py b/python_baby_names/question_2.py
index c427047..db9702b 100644
--- a/python_baby_names/question_2.py
+++ b/python_baby_names/question_2.py
@@ -1,8 +1,10 @@
+import collections
 import mixins
 import os
 import pandas as pd
 import settings
 
+
 from bs4 import BeautifulSoup
 
 
@@ -18,6 +20,8 @@ class Script(mixins.BabyNamesMixin):
 
     def __init__(self, names_in_report: list, excel_filename: str = '',
                  excel_sheetname: str = ''):
+        assert isinstance(names_in_report, (list, tuple)), (
+            'The names_in_report must be a list or a tuple.')
         if excel_filename:
             assert excel_filename[-5:] == '.xlsx', (
                 'The excel_filename must end with ".xlsx"')
@@ -26,8 +30,50 @@ def __init__(self, names_in_report: list, excel_filename: str = '',
         self.excel_sheetname = excel_sheetname
 
     def execute_report(self):
-        # Instantiate the table/dataframe for males.
+        pkl_filename = self.get_pkl_filename()
+        already_scraped = pkl_filename in os.listdir(settings.RELATIVE_PATH)
+
+        if not already_scraped:
+            male_df, female_df = self.dfs_from_html()
+        else:
+            male_df, female_df = self.dfs_from_pkl()
+
+        self.save_to_excel([male_df, female_df])
+
+    def get_pkl_filename(self):
+        return os.path.basename(__file__).replace('.py', '_data.pkl')
+
+    def get_pandas_data_path(self) -> str:
+        """
+        Returns:
+            str: The path to the pandas PKL data file.
+        """
+        pkl_filename = self.get_pkl_filename()
+        return os.path.dirname(os.path.abspath(__file__)) + '\\' + pkl_filename
+
+    def save_pkl(self, data: collections.deque):
+        """
+        Saves a Pandas PKL data file.
+
+        Args:
+            data (collections.deque): [description]
+        """
+        path = self.get_pandas_data_path()
+        pd.DataFrame(
+            data,
+            columns=['gender', 'year', 'name', 'rank']
+        ).to_pickle(path)
+
+    def get_empty_dataframes(self):
+        """
+        Creates both male and a female dataframes.
+
+        Returns:
+            [tuple]: A 2-tuple containing DataFrames.
+        """
         header_2 = ['Year'] + self.names_in_report
+
+        # Instantiate the table/dataframe for males.
         male_df = pd.DataFrame(columns=header_2)
         header_1 = ['Male Name Rankings Per Year'] + len(self.names_in_report) * ['']
         male_df.columns = pd.MultiIndex.from_tuples(zip(header_1, header_2))
@@ -37,6 +83,19 @@ def execute_report(self):
         header_1 = ['Female Name Rankings Per Year'] + len(self.names_in_report) * ['']
         female_df.columns = pd.MultiIndex.from_tuples(zip(header_1, header_2))
 
+        return male_df, female_df
+
+    def dfs_from_html(self):
+        """
+        Scrapes the HTML files found in this file's directory to
+        generate 2 DataFrames (one for males and one for females).
+
+        Returns:
+            [tuple]: A 2-tuple containing non-empty DataFrames.
+        """
+        male_df, female_df = self.get_empty_dataframes()
+        pandas_data = collections.deque()
+        names_in_report = self.names_in_report
         filenames, available_years = self.get_filename_info()
 
         # Gather the data from the files.
@@ -44,7 +103,6 @@ def execute_report(self):
             html_file = open(f'{settings.RELATIVE_PATH}/{filename}', 'r')
             contents = html_file.read()
             soup = BeautifulSoup(contents, 'html.parser')
-
             year = available_years[index]
             self.validate_year(year, soup)
             table = self.get_table(soup)
@@ -55,8 +113,9 @@ def execute_report(self):
 
             # The dictionary element's key will be the name.
             # The dictionary element's value will be the rank.
-            male_names = {}
-            female_names = {}
+            male_names_dict = {}
+            female_names_dict = {}
+
             for row in rows:
                 # For most of the HTML files, the table rows are missing the
                 # closing tr tags, so I am calling "next" until I get the
@@ -65,22 +124,89 @@ def execute_report(self):
                 rank_num = int(str(rank_el))
                 tag = rank_el.next
                 male_name = tag.text.strip()
-                male_names[male_name] = rank_num
+                male_names_dict[male_name] = rank_num
                 female_name = tag.next_sibling.text.strip()
-                female_names[female_name] = rank_num
+                female_names_dict[female_name] = rank_num
+
+                pandas_data.extend(
+                    [
+                        ['m', year, male_name, rank_num],
+                        ['f', year, female_name, rank_num],
+                    ]
+                )
 
             # Append a new row to the male's table.
             male_df.loc[len(male_df.index)] = \
-                self.add_data_to_row(male_names, [year])
+                self.get_all_row_data(male_names_dict, [year])
 
             # Append a new row to the female's table.
             female_df.loc[len(female_df.index)] = \
-                self.add_data_to_row(female_names, [year])
-            html_file.close()
+                self.get_all_row_data(female_names_dict, [year])
 
-        self.save_to_excel([male_df, female_df])
+            html_file.close()
 
-    def add_data_to_row(self, names: dict, new_row_data: list = []) -> list:
+        # Save the data in the current directory, so you want have to do run
+        # scrape the html again.
+        self.save_pkl(pandas_data)
+        return male_df, female_df
+
+    def dfs_from_pkl(self):
+        """
+        Uses the PKL file (static data) found in this file's directory to
+        generate 2 DataFrames (one for males and one for females).
+
+        Much faster than the dfs_from_html method because it is not scraping
+        data from HTML. Also faster because it is querying the PKL data file.
+
+        Returns:
+            [tuple]: A 2-tuple containing non-empty DataFrames.
+        """
+        male_df, female_df = self.get_empty_dataframes()
+        path = self.get_pandas_data_path()
+
+        # Filter and fetch the data from the pkl file.
+        df_filter = f'name in {str(list(self.names_in_report))}'
+        # "df" is a very short data set. It is only:
+        # len(self.names_in_report) * available_years * 2.
+        # In this example: 3 * 10 * 2 = 60
+        df = pd.read_pickle(path).query(df_filter)
+        nested_data = {
+            'm': {},
+            'f': {}
+        }
+
+        # Transform the data to a structure similar to what was in the
+        # dfs_from_html method.
+        for i in df.itertuples():
+            try:
+                # Accessing the year can raise a KeyError.
+                nested_data[i.gender][i.year][i.name] = i.rank
+            except KeyError:
+                nested_data[i.gender][i.year] = {i.name: i.rank}
+
+        for gender, value_dict in nested_data.items():
+            if gender == 'm':
+                for year, value_dict in value_dict.items():
+                    male_df.loc[len(male_df.index)] = \
+                        self.get_all_row_data(value_dict, [year])
+            else:
+                for year, value_dict in value_dict.items():
+                    female_df.loc[len(female_df.index)] = \
+                        self.get_all_row_data(value_dict, [year])
+
+        return male_df, female_df
+
+    def get_all_row_data(self, names: dict, new_row_data: list = []) -> list:
+        """
+        Returns 1 row's worth of data that will be appended the end of a table.
+
+        Args:
+            names (dict): A dictionary of names as keys and ranks as values.
+            new_row_data (list, optional): D. Defaults to [].
+
+        Returns:
+            list: 1 row of data
+        """
         for name in self.names_in_report:
             try:
                 rank = names[name]
@@ -89,7 +215,7 @@ def add_data_to_row(self, names: dict, new_row_data: list = []) -> list:
             new_row_data.append(rank)
         return new_row_data
 
-    def save_to_excel(self, dataframes):
+    def save_to_excel(self, dataframes: list):
         output_path = self.get_output_path(__file__)
         writer = pd.ExcelWriter(output_path, engine='xlsxwriter')
         row = 0

From 36e2d982f27c698cab30566201d9a85ed306966d Mon Sep 17 00:00:00 2001
From: Tom Mazurek <tom@quantfxcapital.com>
Date: Thu, 11 Mar 2021 18:07:40 -0600
Subject: [PATCH 2/7] Add timer to execute_report function

---
 python_baby_names/question_2.py | 42 +++++++++++++++++++++++++++++----
 1 file changed, 37 insertions(+), 5 deletions(-)

diff --git a/python_baby_names/question_2.py b/python_baby_names/question_2.py
index db9702b..bfbcc89 100644
--- a/python_baby_names/question_2.py
+++ b/python_baby_names/question_2.py
@@ -1,15 +1,35 @@
 import collections
 import mixins
+import functools
 import os
+import timeit
 import pandas as pd
 import settings
 
 
 from bs4 import BeautifulSoup
 
+def timer(func):
+    """Print the runtime of the decorated function."""
 
 NAMES_IN_REPORT = ["Ryan", "Ben", "Eugene"]
 EXCEL_SHEETNAME = 'Great Report'
+    @functools.wraps(func)
+    def wrapper_timer(*args, **kwargs):
+        start_time = timeit.default_timer()
+        value = func(*args, **kwargs)
+        end_time = timeit.default_timer()
+        elapsed_time = end_time - start_time
+        console_msg = f'Took {round(elapsed_time, 2)} seconds.'
+        script_obj = args[0]
+        if script_obj.generate_excel:
+            console_msg += ' (Including the time it took to generate the Excel file.)'
+        else:
+            console_msg += ' (NOT including the time it took to generate the Excel file.)'
+        print(console_msg)
+        return value
+
+    return wrapper_timer
 
 
 class Script(mixins.BabyNamesMixin):
@@ -18,8 +38,13 @@ class Script(mixins.BabyNamesMixin):
     baby names.
     """
 
-    def __init__(self, names_in_report: list, excel_filename: str = '',
-                 excel_sheetname: str = ''):
+    def __init__(
+        self,
+        names_in_report: list,
+        excel_filename: str = '',
+        excel_sheetname: str = '',
+        generate_excel: bool = True,
+    ):
         assert isinstance(names_in_report, (list, tuple)), (
             'The names_in_report must be a list or a tuple.')
         if excel_filename:
@@ -28,8 +53,12 @@ def __init__(self, names_in_report: list, excel_filename: str = '',
         self.excel_filename = excel_filename
         self.names_in_report = names_in_report
         self.excel_sheetname = excel_sheetname
+        self.generate_excel = generate_excel
 
+    @timer
     def execute_report(self):
+        print()
+
         pkl_filename = self.get_pkl_filename()
         already_scraped = pkl_filename in os.listdir(settings.RELATIVE_PATH)
 
@@ -38,7 +67,8 @@ def execute_report(self):
         else:
             male_df, female_df = self.dfs_from_pkl()
 
-        self.save_to_excel([male_df, female_df])
+        if self.generate_excel:
+            self.save_to_excel([male_df, female_df])
 
     def get_pkl_filename(self):
         return os.path.basename(__file__).replace('.py', '_data.pkl')
@@ -231,7 +261,9 @@ def save_to_excel(self, dataframes: list):
         print(f'Created: {output_path}')
 
 
-Script(
+script = Script(
     NAMES_IN_REPORT,
     excel_sheetname=EXCEL_SHEETNAME,
-).execute_report()
+    generate_excel=True
+)
+script.execute_report()

From f480286e2f645ad51422f5a188813fe377757895 Mon Sep 17 00:00:00 2001
From: Tom Mazurek <tom@quantfxcapital.com>
Date: Thu, 11 Mar 2021 18:24:26 -0600
Subject: [PATCH 3/7] Regroup imports, move constants above timer, tidy docstrings

---
 python_baby_names/question_2.py | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/python_baby_names/question_2.py b/python_baby_names/question_2.py
index bfbcc89..8763c5c 100644
--- a/python_baby_names/question_2.py
+++ b/python_baby_names/question_2.py
@@ -1,19 +1,21 @@
 import collections
-import mixins
 import functools
 import os
 import timeit
+
 import pandas as pd
+from bs4 import BeautifulSoup
+
+import mixins
 import settings
 
+NAMES_IN_REPORT = ['Ryan', 'Ben', 'Eugene']
+EXCEL_SHEETNAME = 'Great Report'
 
-from bs4 import BeautifulSoup
 
 def timer(func):
     """Print the runtime of the decorated function."""
 
-NAMES_IN_REPORT = ["Ryan", "Ben", "Eugene"]
-EXCEL_SHEETNAME = 'Great Report'
     @functools.wraps(func)
     def wrapper_timer(*args, **kwargs):
         start_time = timeit.default_timer()
@@ -81,12 +83,12 @@ def get_pandas_data_path(self) -> str:
         pkl_filename = self.get_pkl_filename()
         return os.path.dirname(os.path.abspath(__file__)) + '\\' + pkl_filename
 
-    def save_pkl(self, data: collections.deque):
+    def save_pkl(self, data):
         """
         Saves a Pandas PKL data file.
 
         Args:
-            data (collections.deque): [description]
+            data (Iterable): An iterable in a structure that Pandas understands.
         """
         path = self.get_pandas_data_path()
         pd.DataFrame(
@@ -96,10 +98,10 @@ def save_pkl(self, data: collections.deque):
 
     def get_empty_dataframes(self):
         """
-        Creates both male and a female dataframes.
+        Creates male and a female empty dataframes.
 
         Returns:
-            [tuple]: A 2-tuple containing DataFrames.
+            [tuple]: A 2-tuple of empty DataFrames.
         """
         header_2 = ['Year'] + self.names_in_report
 
@@ -117,15 +119,14 @@ def get_empty_dataframes(self):
 
     def dfs_from_html(self):
         """
-        Scrapes the HTML files found in this file's directory to
-        generate 2 DataFrames (one for males and one for females).
+        Scrapes the HTML files found in this file's directory to generate 2
+        DataFrames (one for males and one for females).
 
         Returns:
-            [tuple]: A 2-tuple containing non-empty DataFrames.
+            [tuple]: A 2-tuple of non-empty DataFrames.
         """
         male_df, female_df = self.get_empty_dataframes()
         pandas_data = collections.deque()
-        names_in_report = self.names_in_report
         filenames, available_years = self.get_filename_info()
 
         # Gather the data from the files.

From 0a8df9a535e7591ce7c21cac392fa24c9abf76f5 Mon Sep 17 00:00:00 2001
From: Tom Mazurek <tom@quantfxcapital.com>
Date: Thu, 11 Mar 2021 18:25:24 -0600
Subject: [PATCH 4/7] Fix undefined variable named "filename"

---
 python_baby_names/mixins.py     | 10 ++++++----
 python_baby_names/question_2.py |  2 +-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/python_baby_names/mixins.py b/python_baby_names/mixins.py
index d9581aa..f61712c 100644
--- a/python_baby_names/mixins.py
+++ b/python_baby_names/mixins.py
@@ -91,28 +91,30 @@ def validate_year(self, year: int, soup):
         assert year_in_title == year, (
             f'Year "{year_in_title}" !== "{year}"')
 
-    def get_table(self, soup):
+    def get_table(self, soup, filename: str):
         """
         Finds the correct table within BeautifulSoup object and validates it
         before it is returned.
 
         Args:
             soup (BeautifulSoup): The soup of the entire page.
+            filename (str): The name of the file.
 
         Returns:
             [BeautifulSoup]: The HTML table element within the soup.
         """
         # In all cases, the data we need is in the 3rd table.
         table = soup.find_all('table')[2]
-        self.validate_table_columns(table)
+        self.validate_table_columns(table, filename)
         return table
 
-    def validate_table_columns(self, table):
+    def validate_table_columns(self, table, filename: str):
         """
-        Validate that the table's columns are in the expected order.
+        Validates that the table's columns are in the expected order.
 
         Args:
             table (BeautifulSoup): The table object found in the HTML file.
+            filename (str): The name of the file.
         """
         # Make sure the order of the columns are what is expected.
         table_header = table.select_one('tr')
diff --git a/python_baby_names/question_2.py b/python_baby_names/question_2.py
index 8763c5c..d79db83 100644
--- a/python_baby_names/question_2.py
+++ b/python_baby_names/question_2.py
@@ -136,7 +136,7 @@ def dfs_from_html(self):
             soup = BeautifulSoup(contents, 'html.parser')
             year = available_years[index]
             self.validate_year(year, soup)
-            table = self.get_table(soup)
+            table = self.get_table(soup, filename)
 
             # Find all the rows in the table.
             # The first row is the header, so skip it.

From a3b1cc6bcaa1a3eb36019d18691261cd6d24e65c Mon Sep 17 00:00:00 2001
From: Tom Mazurek <tom@quantfxcapital.com>
Date: Sat, 13 Mar 2021 10:49:40 -0600
Subject: [PATCH 5/7] Switch BeautifulSoup to the lxml parser for faster HTML parsing

---
 Pipfile                         |  1 +
 Pipfile.lock                    | 45 ++++++++++++++++++++++++++++++++-
 python_baby_names/question_2.py |  2 +-
 3 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/Pipfile b/Pipfile
index 1e089c1..784fe96 100644
--- a/Pipfile
+++ b/Pipfile
@@ -7,6 +7,7 @@ name = "pypi"
 beautifulsoup4 = "*"
 pandas = "*"
 xlsxwriter = "*"
+lxml = "*"
 
 [dev-packages]
 
diff --git a/Pipfile.lock b/Pipfile.lock
index a6b7880..7af400f 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "4a37889d0ac50a8f82bdb9700bdbb153e96c7d259ee4d6768af3f7858e3a3c01"
+            "sha256": "c021e31f0cb26a656fa12813871b2dea68debbed7a87b35b2c5017d2397c47aa"
         },
         "pipfile-spec": 6,
         "requires": {
@@ -25,6 +25,49 @@
             "index": "pypi",
             "version": "==4.9.3"
         },
+        "lxml": {
+            "hashes": [
+                "sha256:0448576c148c129594d890265b1a83b9cd76fd1f0a6a04620753d9a6bcfd0a4d",
+                "sha256:127f76864468d6630e1b453d3ffbbd04b024c674f55cf0a30dc2595137892d37",
+                "sha256:1471cee35eba321827d7d53d104e7b8c593ea3ad376aa2df89533ce8e1b24a01",
+                "sha256:2363c35637d2d9d6f26f60a208819e7eafc4305ce39dc1d5005eccc4593331c2",
+                "sha256:2e5cc908fe43fe1aa299e58046ad66981131a66aea3129aac7770c37f590a644",
+                "sha256:2e6fd1b8acd005bd71e6c94f30c055594bbd0aa02ef51a22bbfa961ab63b2d75",
+                "sha256:366cb750140f221523fa062d641393092813b81e15d0e25d9f7c6025f910ee80",
+                "sha256:42ebca24ba2a21065fb546f3e6bd0c58c3fe9ac298f3a320147029a4850f51a2",
+                "sha256:4e751e77006da34643ab782e4a5cc21ea7b755551db202bc4d3a423b307db780",
+                "sha256:4fb85c447e288df535b17ebdebf0ec1cf3a3f1a8eba7e79169f4f37af43c6b98",
+                "sha256:50c348995b47b5a4e330362cf39fc503b4a43b14a91c34c83b955e1805c8e308",
+                "sha256:535332fe9d00c3cd455bd3dd7d4bacab86e2d564bdf7606079160fa6251caacf",
+                "sha256:535f067002b0fd1a4e5296a8f1bf88193080ff992a195e66964ef2a6cfec5388",
+                "sha256:5be4a2e212bb6aa045e37f7d48e3e1e4b6fd259882ed5a00786f82e8c37ce77d",
+                "sha256:60a20bfc3bd234d54d49c388950195d23a5583d4108e1a1d47c9eef8d8c042b3",
+                "sha256:648914abafe67f11be7d93c1a546068f8eff3c5fa938e1f94509e4a5d682b2d8",
+                "sha256:681d75e1a38a69f1e64ab82fe4b1ed3fd758717bed735fb9aeaa124143f051af",
+                "sha256:68a5d77e440df94011214b7db907ec8f19e439507a70c958f750c18d88f995d2",
+                "sha256:69a63f83e88138ab7642d8f61418cf3180a4d8cd13995df87725cb8b893e950e",
+                "sha256:6e4183800f16f3679076dfa8abf2db3083919d7e30764a069fb66b2b9eff9939",
+                "sha256:6fd8d5903c2e53f49e99359b063df27fdf7acb89a52b6a12494208bf61345a03",
+                "sha256:791394449e98243839fa822a637177dd42a95f4883ad3dec2a0ce6ac99fb0a9d",
+                "sha256:7a7669ff50f41225ca5d6ee0a1ec8413f3a0d8aa2b109f86d540887b7ec0d72a",
+                "sha256:7e9eac1e526386df7c70ef253b792a0a12dd86d833b1d329e038c7a235dfceb5",
+                "sha256:7ee8af0b9f7de635c61cdd5b8534b76c52cd03536f29f51151b377f76e214a1a",
+                "sha256:8246f30ca34dc712ab07e51dc34fea883c00b7ccb0e614651e49da2c49a30711",
+                "sha256:8c88b599e226994ad4db29d93bc149aa1aff3dc3a4355dd5757569ba78632bdf",
+                "sha256:923963e989ffbceaa210ac37afc9b906acebe945d2723e9679b643513837b089",
+                "sha256:94d55bd03d8671686e3f012577d9caa5421a07286dd351dfef64791cf7c6c505",
+                "sha256:97db258793d193c7b62d4e2586c6ed98d51086e93f9a3af2b2034af01450a74b",
+                "sha256:a9d6bc8642e2c67db33f1247a77c53476f3a166e09067c0474facb045756087f",
+                "sha256:cd11c7e8d21af997ee8079037fff88f16fda188a9776eb4b81c7e4c9c0a7d7fc",
+                "sha256:d8d3d4713f0c28bdc6c806a278d998546e8efc3498949e3ace6e117462ac0a5e",
+                "sha256:e0bfe9bb028974a481410432dbe1b182e8191d5d40382e5b8ff39cdd2e5c5931",
+                "sha256:f4822c0660c3754f1a41a655e37cb4dbbc9be3d35b125a37fab6f82d47674ebc",
+                "sha256:f83d281bb2a6217cd806f4cf0ddded436790e66f393e124dfe9731f6b3fb9afe",
+                "sha256:fc37870d6716b137e80d19241d0e2cff7a7643b925dfa49b4c8ebd1295eb506e"
+            ],
+            "index": "pypi",
+            "version": "==4.6.2"
+        },
         "numpy": {
             "hashes": [
                 "sha256:032be656d89bbf786d743fee11d01ef318b0781281241997558fa7950028dd29",
diff --git a/python_baby_names/question_2.py b/python_baby_names/question_2.py
index d79db83..3436bef 100644
--- a/python_baby_names/question_2.py
+++ b/python_baby_names/question_2.py
@@ -133,7 +133,7 @@ def dfs_from_html(self):
         for index, filename in enumerate(filenames):
             html_file = open(f'{settings.RELATIVE_PATH}/{filename}', 'r')
             contents = html_file.read()
-            soup = BeautifulSoup(contents, 'html.parser')
+            soup = BeautifulSoup(contents, 'lxml')
             year = available_years[index]
             self.validate_year(year, soup)
             table = self.get_table(soup, filename)

From 899eef2a69b809d5750ef675f64cca7b4bb6e05e Mon Sep 17 00:00:00 2001
From: Tom Mazurek <tom@quantfxcapital.com>
Date: Sat, 13 Mar 2021 10:51:59 -0600
Subject: [PATCH 6/7] Reformat docstrings and comments; print which data source is used

---
 python_baby_names/mixins.py     | 28 +++++++++++++------------
 python_baby_names/question_2.py | 36 ++++++++++++---------------------
 2 files changed, 28 insertions(+), 36 deletions(-)

diff --git a/python_baby_names/mixins.py b/python_baby_names/mixins.py
index f61712c..56fce00 100644
--- a/python_baby_names/mixins.py
+++ b/python_baby_names/mixins.py
@@ -1,32 +1,34 @@
 import os
-import settings
 
 from bs4 import Tag
 
+import settings
+
 
 class BabyNamesMixin:
 
     header_tags = settings.HEADER_TAGS
     excel_filename = ''
 
-    def get_output_path(self, file_dundar: str) -> str:
+    def get_output_path(self, file_dunder: str) -> str:
         """
-        Returns the path of the file (including the filename within the path)
-        that will be created.
+        Returns the path of the file that will be created (including the
+        filename within the path).
 
         Args:
-            file_dundar (str):
+            file_dunder (str):
                 The __file__ value that's available in every Python file.
 
         Returns:
-            [str]: The path of the output.
+            str: The path of the output.
         """
         excel_filename = (
             self.excel_filename or
-            os.path.basename(file_dundar).replace('.py', '_report.xlsx')
+            os.path.basename(file_dunder).replace('.py', '_report.xlsx')
         )
         output_path = (
-            os.path.dirname(os.path.abspath(file_dundar)) + '\\' + excel_filename)
+            os.path.dirname(os.path.abspath(file_dunder)) + '\\' + excel_filename
+        )
         return output_path
 
     def get_filename_info(self) -> tuple:
@@ -35,7 +37,7 @@ def get_filename_info(self) -> tuple:
         years that are within those filenames.
 
         Returns:
-            tuple: filenames, available years
+            tuple: filenames, available_years
         """
         available_years = []
 
@@ -52,8 +54,8 @@ def get_filename_info(self) -> tuple:
             if settings.FILENAME_PREFIX == prefix2 and settings.FILE_TYPE == file_type2:
                 # Get the year from the middle of the string.
                 year = filename[prefix_len: -file_type_len]
-                # If there are any non-digit characters, it means
-                # that the filename is not in the correct format.
+                # If there are any non-digit characters, it means that the
+                # filename is not in the correct format.
                 assert year.isdigit(), (
                     f'File "{filename}" is not in a valid format.')
                 assert len(year) == 4
@@ -89,7 +91,7 @@ def validate_year(self, year: int, soup):
         # It will be a long time until there more than 4 digits are in a year.
         assert len(year_in_title) == 4
         assert year_in_title == year, (
-            f'Year "{year_in_title}" !== "{year}"')
+            f'Year "{year_in_title}" != "{year}"')
 
     def get_table(self, soup, filename: str):
         """
@@ -101,7 +103,7 @@ def get_table(self, soup, filename: str):
             filename (str): The name of the file.
 
         Returns:
-            [BeautifulSoup]: The HTML table element within the soup.
+            BeautifulSoup: The HTML table element within the soup.
         """
         # In all cases, the data we need is in the 3rd table.
         table = soup.find_all('table')[2]
diff --git a/python_baby_names/question_2.py b/python_baby_names/question_2.py
index 3436bef..92f2bb7 100644
--- a/python_baby_names/question_2.py
+++ b/python_baby_names/question_2.py
@@ -45,7 +45,7 @@ def __init__(
         names_in_report: list,
         excel_filename: str = '',
         excel_sheetname: str = '',
-        generate_excel: bool = True,
+        generate_excel: bool = True
     ):
         assert isinstance(names_in_report, (list, tuple)), (
             'The names_in_report must be a list or a tuple.')
@@ -65,8 +65,10 @@ def execute_report(self):
         already_scraped = pkl_filename in os.listdir(settings.RELATIVE_PATH)
 
         if not already_scraped:
+            print('Scraping HTML files for data.')
             male_df, female_df = self.dfs_from_html()
         else:
+            print('Using the stored data in the Pandas PKL file.')
             male_df, female_df = self.dfs_from_pkl()
 
         if self.generate_excel:
@@ -77,8 +79,7 @@ def get_pkl_filename(self):
 
     def get_pandas_data_path(self) -> str:
         """
-        Returns:
-            str: The path to the pandas PKL data file.
+        Returns the path to the Pandas PKL data file.
         """
         pkl_filename = self.get_pkl_filename()
         return os.path.dirname(os.path.abspath(__file__)) + '\\' + pkl_filename
@@ -97,12 +98,6 @@ def save_pkl(self, data):
         ).to_pickle(path)
 
     def get_empty_dataframes(self):
-        """
-        Creates male and a female empty dataframes.
-
-        Returns:
-            [tuple]: A 2-tuple of empty DataFrames.
-        """
         header_2 = ['Year'] + self.names_in_report
 
         # Instantiate the table/dataframe for males.
@@ -123,7 +118,7 @@ def dfs_from_html(self):
         DataFrames (one for males and one for females).
 
         Returns:
-            [tuple]: A 2-tuple of non-empty DataFrames.
+            tuple: A 2-tuple of non-empty DataFrames.
         """
         male_df, female_df = self.get_empty_dataframes()
         pandas_data = collections.deque()
@@ -176,8 +171,8 @@ def dfs_from_html(self):
 
             html_file.close()
 
-        # Save the data in the current directory, so you want have to do run
-        # scrape the html again.
+        # Save the data in the current directory, so you won't have to scrape
+        # the HTML again.
         self.save_pkl(pandas_data)
         return male_df, female_df
 
@@ -190,12 +185,12 @@ def dfs_from_pkl(self):
         data from HTML. Also faster because it is querying the PKL data file.
 
         Returns:
-            [tuple]: A 2-tuple containing non-empty DataFrames.
+            tuple: A 2-tuple containing non-empty DataFrames.
         """
         male_df, female_df = self.get_empty_dataframes()
         path = self.get_pandas_data_path()
 
-        # Filter and fetch the data from the pkl file.
+        # Filter and fetch the data from the PKL file.
         df_filter = f'name in {str(list(self.names_in_report))}'
         # "df" is a very short data set. It is only:
         # len(self.names_in_report) * available_years * 2.
@@ -206,7 +201,7 @@ def dfs_from_pkl(self):
             'f': {}
         }
 
-        # Transform the data to a structure similar to what was in the
+        # Transform the queried data to a structure similar to what is in the
         # dfs_from_html method.
         for i in df.itertuples():
             try:
@@ -218,10 +213,12 @@ def dfs_from_pkl(self):
         for gender, value_dict in nested_data.items():
             if gender == 'm':
                 for year, value_dict in value_dict.items():
+                    # Append a new row to the male's table.
                     male_df.loc[len(male_df.index)] = \
                         self.get_all_row_data(value_dict, [year])
             else:
                 for year, value_dict in value_dict.items():
+                    # Append a new row to the female's table.
                     female_df.loc[len(female_df.index)] = \
                         self.get_all_row_data(value_dict, [year])
 
@@ -229,14 +226,7 @@ def dfs_from_pkl(self):
 
     def get_all_row_data(self, names: dict, new_row_data: list = []) -> list:
         """
-        Returns 1 row's worth of data that will be appended the end of a table.
-
-        Args:
-            names (dict): A dictionary of names as keys and ranks as values.
-            new_row_data (list, optional): D. Defaults to [].
-
-        Returns:
-            list: 1 row of data
+        Returns one row of data to be appended to the end of a table.
         """
         for name in self.names_in_report:
             try:

From ceedc4598ccf5245db38d1e57d5a7468e712c765 Mon Sep 17 00:00:00 2001
From: Tom Mazurek <tom@quantfxcapital.com>
Date: Sat, 13 Mar 2021 10:52:31 -0600
Subject: [PATCH 7/7] Rename get_pandas_data_path to get_pkl_path; drop names_in_report assert

---
 python_baby_names/mixins.py     |  5 ++---
 python_baby_names/question_2.py | 22 ++++++++++------------
 2 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/python_baby_names/mixins.py b/python_baby_names/mixins.py
index 56fce00..a0301c2 100644
--- a/python_baby_names/mixins.py
+++ b/python_baby_names/mixins.py
@@ -78,10 +78,9 @@ def validate_year(self, year: int, soup):
             title_el = soup.select_one(tag)
             try:
                 title_text = title_el.text
-            except AttributeError:
-                continue
-            else:
                 break
+            except AttributeError:
+                pass
 
         # Make sure the year in the filename is the same as the year in the
         # title.
diff --git a/python_baby_names/question_2.py b/python_baby_names/question_2.py
index 92f2bb7..d679128 100644
--- a/python_baby_names/question_2.py
+++ b/python_baby_names/question_2.py
@@ -17,21 +17,21 @@ def timer(func):
     """Print the runtime of the decorated function."""
 
     @functools.wraps(func)
-    def wrapper_timer(*args, **kwargs):
+    def wrapper(*args, **kwargs):
         start_time = timeit.default_timer()
         value = func(*args, **kwargs)
         end_time = timeit.default_timer()
         elapsed_time = end_time - start_time
-        console_msg = f'Took {round(elapsed_time, 2)} seconds.'
+        msg = f'Took {round(elapsed_time, 2)} seconds.'
         script_obj = args[0]
         if script_obj.generate_excel:
-            console_msg += ' (Including the time it took to generate the Excel file.)'
+            msg += ' (Including the time it took to generate the Excel file.)'
         else:
-            console_msg += ' (NOT including the time it took to generate the Excel file.)'
-        print(console_msg)
+            msg += ' (NOT including the time it took to generate the Excel file.)'
+        print(msg)
         return value
 
-    return wrapper_timer
+    return wrapper
 
 
 class Script(mixins.BabyNamesMixin):
@@ -47,8 +47,6 @@ def __init__(
         excel_sheetname: str = '',
         generate_excel: bool = True
     ):
-        assert isinstance(names_in_report, (list, tuple)), (
-            'The names_in_report must be a list or a tuple.')
         if excel_filename:
             assert excel_filename[-5:] == '.xlsx', (
                 'The excel_filename must end with ".xlsx"')
@@ -77,7 +75,7 @@ def execute_report(self):
     def get_pkl_filename(self):
         return os.path.basename(__file__).replace('.py', '_data.pkl')
 
-    def get_pandas_data_path(self) -> str:
+    def get_pkl_path(self) -> str:
         """
         Returns the path to the Pandas PKL data file.
         """
@@ -91,7 +89,7 @@ def save_pkl(self, data):
         Args:
             data (Iterable): An iterable in a structure that Pandas understands.
         """
-        path = self.get_pandas_data_path()
+        path = self.get_pkl_path()
         pd.DataFrame(
             data,
             columns=['gender', 'year', 'name', 'rank']
@@ -188,14 +186,14 @@ def dfs_from_pkl(self):
             tuple: A 2-tuple containing non-empty DataFrames.
         """
         male_df, female_df = self.get_empty_dataframes()
-        path = self.get_pandas_data_path()
+        pkl_path = self.get_pkl_path()
 
         # Filter and fetch the data from the PKL file.
         df_filter = f'name in {str(list(self.names_in_report))}'
         # "df" is a very short data set. It is only:
         # len(self.names_in_report) * available_years * 2.
         # In this example: 3 * 10 * 2 = 60
-        df = pd.read_pickle(path).query(df_filter)
+        df = pd.read_pickle(pkl_path).query(df_filter)
         nested_data = {
             'm': {},
             'f': {}