Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
160 changes: 160 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
22 changes: 21 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,24 @@ Follow industry standards for each data type when decided on the final format fo
* For dimensions without units, assume inches. Convert anything which isn't in inches to inches.
* For weights without units, assume pounds. Convert anything which isn't in pounds to pounds.
* UPC / Gtin / EAN should be handled as strings
* Floating point and decimal numbers should preserve as much precision as possible
* Floating point and decimal numbers should preserve as much precision as possible

## Run ETL

### Requirements
- Python 3.10

### Install dependencies

```bash
pip install -r requirements.txt
```
### Usage

```
python3 run_etl.py
```
#### Optionally pass the input file path as an argument
```
python3 run_etl.py filepath
```
4 changes: 4 additions & 0 deletions elt_homework/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
import logging

# Configure package-wide logging at import time.
# NOTE(review): logging.basicConfig() here configures the ROOT logger as a side
# effect of importing this package, which affects any application that imports
# it — confirm this is intended rather than configuring in the entry script.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
27 changes: 27 additions & 0 deletions elt_homework/csv_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import logging
import pandas as pd

logger = logging.getLogger(__name__)

class CSVReader:
    """Thin wrapper around pandas CSV loading that logs (and re-raises) read errors."""

    def read_csv(self, file_path, columns):
        """
        Reads a CSV file using pandas.

        Args:
        - file_path (str): Path to the CSV file.
        - columns (list): Only these columns are loaded (passed to ``usecols``).

        Returns:
        - DataFrame: The DataFrame containing the CSV data.

        Raises:
        - Exception: any pandas read error, re-raised after being logged so
          callers fail fast instead of receiving ``None``.
        """
        try:
            return pd.read_csv(file_path, usecols=columns, engine='python')
        except Exception:
            # Bug fix: the original ``logger.exception("An error occurred:", e)``
            # passed ``e`` as a %-format argument with no placeholder (a logging
            # formatting error), and then swallowed the exception by implicitly
            # returning None — crashing callers later with a confusing message.
            logger.exception("An error occurred while reading %s", file_path)
            raise



82 changes: 82 additions & 0 deletions elt_homework/elt_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@

import logging

import country_converter as coco
import numpy as np
import pandas as pd

from elt_homework.csv_reader import CSVReader
from elt_homework.mapping import columns_mapping, output_columns
from elt_homework.schema import ProductSchema

logger = logging.getLogger(__name__)

cc = coco.CountryConverter()

class ETL:
    """Reads a supplier CSV, normalizes its columns, validates it, and writes a CSV."""

    def __init__(self):
        self._csv_reader = CSVReader()

    def _read_data(self, file_path):
        """
        Reads a CSV file, loading only the columns named in ``columns_mapping``.

        Args:
        - file_path (str): Path to the CSV file.

        Returns:
        - DataFrame: raw data restricted to the useful columns.
        """
        logger.info('Reading %s file', file_path)
        return self._csv_reader.read_csv(file_path, columns_mapping.keys())

    def _transformations(self, df):
        """Coerce each column to its target type in place and return ``df``."""

        def yes_no_to_bool(v):
            # Only the string 'yes' (any case) is True; NaN / numbers / other
            # strings all map to False.
            if isinstance(v, str):
                return v.lower() == 'yes'
            return False

        # to bool
        df['attrib__outdoor_safe'] = df['attrib__outdoor_safe'].apply(yes_no_to_bool)
        df['attrib__kit'] = df['attrib__kit'].apply(yes_no_to_bool)
        df['attrib__bulb_included'] = df['attrib__bulb_included'].apply(yes_no_to_bool)
        # to int (missing values become 0)
        df['attrib__number_bulbs'] = df['attrib__number_bulbs'].fillna(0).astype(int)
        df['product__multipack_quantity'] = df['product__multipack_quantity'].fillna(0).astype(int)
        # to str
        # NOTE(review): astype(str) turns NaN into the literal string 'nan' — confirm desired.
        df['ean13'] = df['ean13'].astype(str)
        # country -> ISO3 alpha-3 code; unrecognized names become NaN.
        # Bug fix: np.NaN was removed in NumPy 2.0 — use np.nan.
        df['product__country_of_origin__alpha_3'] = cc.pandas_convert(
            series=df['product__country_of_origin__alpha_3'], to='ISO3',
            not_found=np.nan)
        # currency: strip '$' and thousands separators, keep 2 decimals.
        # Bug fix: raw string — '[\$,]' in a plain string is an invalid escape
        # sequence (SyntaxWarning today, an error in future Python versions).
        df['cost_price'] = df['cost_price'].replace(r'[\$,]', '', regex=True).astype(float).round(2)

        return df

    def process(self, file_path, output_path="output.csv"):
        """
        Runs the full ETL pipeline for one input file.

        Args:
        - file_path (str): Path to the input CSV file.
        - output_path (str): Where the result is written (default: ``output.csv``,
          preserving the original behavior).

        Raises:
        - Whatever ``ProductSchema.validate`` raises when the data does not
          match the expected schema.
        """
        # Read csv from file_path
        df = self._read_data(file_path)

        # Rename the columns according to the mapping
        df.rename(columns=columns_mapping, inplace=True)

        # Apply the transformations
        df = self._transformations(df)

        try:
            ProductSchema.validate(df)
        except Exception:
            # TODO alert system monitoring:
            # the supplier/partner may have changed the file format.
            raise  # bare raise preserves the original traceback (vs `raise ex`)

        # Start from an empty frame holding every expected output column, then
        # fill in whichever columns exist in the transformed data; columns with
        # no source data stay empty in the output.
        output_df = pd.DataFrame(columns=output_columns)
        for column in df.columns:
            output_df[column] = df[column]

        # Write csv
        output_df.to_csv(output_path, index=False)

        logger.info('ETL finished, %s rows, %s cols saved',
                    output_df.shape[0], output_df.shape[1])


Loading