Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ name = "pypi"
beautifulsoup4 = "*"
pandas = "*"
xlsxwriter = "*"
lxml = "*"

[dev-packages]

Expand Down
45 changes: 44 additions & 1 deletion Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion python_baby_names/.gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
*.xlsx
*.xlsx
*.pkl
43 changes: 23 additions & 20 deletions python_baby_names/mixins.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,34 @@
import os
import settings

from bs4 import Tag

import settings


class BabyNamesMixin:

header_tags = settings.HEADER_TAGS
excel_filename = ''

def get_output_path(self, file_dundar: str) -> str:
def get_output_path(self, file_dunder: str) -> str:
"""
Returns the path of the file (including the filename within the path)
that will be created.
Returns the path of the file that will be created (including the
filename within the path).

Args:
file_dundar (str):
file_dunder (str):
The __file__ value that's available in every Python file.

Returns:
[str]: The path of the output.
str: The path of the output.
"""
excel_filename = (
self.excel_filename or
os.path.basename(file_dundar).replace('.py', '_report.xlsx')
os.path.basename(file_dunder).replace('.py', '_report.xlsx')
)
output_path = (
os.path.dirname(os.path.abspath(file_dundar)) + '\\' + excel_filename)
os.path.dirname(os.path.abspath(file_dunder)) + '\\' + excel_filename
)
return output_path

def get_filename_info(self) -> tuple:
Expand All @@ -35,7 +37,7 @@ def get_filename_info(self) -> tuple:
years that are within those filenames.

Returns:
tuple: filenames, available years
tuple: filenames, available_years
"""
available_years = []

Expand All @@ -52,8 +54,8 @@ def get_filename_info(self) -> tuple:
if settings.FILENAME_PREFIX == prefix2 and settings.FILE_TYPE == file_type2:
# Get the year from the middle of the string.
year = filename[prefix_len: -file_type_len]
# If there are any non-digit characters, it means
# that the filename is not in the correct format.
# If there are any non-digit characters, it means that the
# filename is not in the correct format.
assert year.isdigit(), (
f'File "{filename}" is not in a valid format.')
assert len(year) == 4
Expand All @@ -76,10 +78,9 @@ def validate_year(self, year: int, soup):
title_el = soup.select_one(tag)
try:
title_text = title_el.text
except AttributeError:
continue
else:
break
except AttributeError:
pass

# Make sure the year in the filename is the same as the year in the
# title.
Expand All @@ -89,30 +90,32 @@ def validate_year(self, year: int, soup):
# It will be a long time until there are more than 4 digits in a year.
assert len(year_in_title) == 4
assert year_in_title == year, (
f'Year "{year_in_title}" !== "{year}"')
f'Year "{year_in_title}" != "{year}"')

def get_table(self, soup):
def get_table(self, soup, filename: str):
"""
Finds the correct table within BeautifulSoup object and validates it
before it is returned.

Args:
soup (BeautifulSoup): The soup of the entire page.
filename (str): The name of the file.

Returns:
[BeautifulSoup]: The HTML table element within the soup.
BeautifulSoup: The HTML table element within the soup.
"""
# In all cases, the data we need is in the 3rd table.
table = soup.find_all('table')[2]
self.validate_table_columns(table)
self.validate_table_columns(table, filename)
return table

def validate_table_columns(self, table):
def validate_table_columns(self, table, filename: str):
"""
Validate that the table's columns are in the expected order.
Validates that the table's columns are in the expected order.

Args:
table (BeautifulSoup): The table object found in the HTML file.
filename (str): The name of the file.
"""
# Make sure the order of the columns is what is expected.
table_header = table.select_one('tr')
Expand Down
Loading