Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions .github/workflows/prod.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Build and publish the application Docker image on pushes to main/dev.
name: Construction image Docker

on:
  push:
    branches:
      - main
      - dev


jobs:
  docker:
    runs-on: ubuntu-latest
    steps:
      -
        name: Set up QEMU
        uses: docker/setup-qemu-action@v3
      -
        name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      -
        name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      -
        name: Build and push
        uses: docker/build-push-action@v5
        with:
          push: true
          # NOTE(review): placeholder account name — replace with the real
          # Docker Hub namespace before this workflow can push successfully.
          tags: votre_compte_docker_hub/application:latest
31 changes: 31 additions & 0 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Lint and smoke-test the Python package on pushes to main/dev.
name: Python package

on:
  push:
    branches:
      - main
      - dev


jobs:
  build:

    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          # latest python minor
          python-version: '3.x'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install pylint
      - name: Lint
        run: |
          pylint src --fail-under=6
      - name: Test Python code
        run: python main.py
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
config.yaml
__pycache__/
data/**/*.csv
titanic/
14 changes: 14 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
FROM ubuntu:22.04

# Install Python; drop the apt package lists afterwards so they are not
# baked into the image layer.
RUN apt-get -y update && \
    apt-get install -y --no-install-recommends python3-pip && \
    rm -rf /var/lib/apt/lists/*

# Install project dependencies (skip pip's download cache to keep the image small)
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY src ./src
COPY main.py .

CMD ["python3", "main.py"]
16 changes: 16 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Probabilité de survie sur le Titanic

Pour pouvoir utiliser ce projet, il
est recommandé de créer un fichier `config.yaml`
ayant la structure suivante:

```yaml
jeton_api: ####
data_path: https://minio.lab.sspcloud.fr/lgaliana/ensae-reproductibilite/data/raw/data.csv
```

Pour installer les dépendances :

```bash
pip install -r requirements.txt
```
892 changes: 0 additions & 892 deletions data.csv

This file was deleted.

17 changes: 17 additions & 0 deletions install.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash

# Abort on the first failing command: without this, a failed apt or venv
# step was silently ignored and pip installed into the system Python.
set -e

# Install Python

apt-get -y update

apt-get install -y python3-pip python3-venv

# Create empty virtual environment

python3 -m venv titanic

# Activation only affects this script's shell; callers must re-activate
# (`source titanic/bin/activate`) to use the environment afterwards.
source titanic/bin/activate

# Install project dependencies

pip install -r requirements.txt
83 changes: 83 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
"""
Prediction de la survie d'un individu sur le Titanic
"""

import os
from dotenv import load_dotenv
import argparse
from loguru import logger

import pathlib
import pandas as pd
from sklearn.model_selection import train_test_split

from src.pipeline.build_pipeline import create_pipeline
from src.models.train_evaluate import evaluate_model


# ENVIRONMENT CONFIGURATION ---------------------------

logger.add("recording.log", rotation="500 MB")
load_dotenv()

parser = argparse.ArgumentParser(description="Paramètres du random forest")
parser.add_argument(
"--n_trees", type=int, default=20, help="Nombre d'arbres"
)
args = parser.parse_args()

URL_RAW = "https://minio.lab.sspcloud.fr/lgaliana/ensae-reproductibilite/data/raw/data.csv"

n_trees = args.n_trees
jeton_api = os.environ.get("JETON_API", "")
data_path = os.environ.get("data_path", URL_RAW)
data_train_path = os.environ.get("train_path", "data/derived/train.parquet")
data_test_path = os.environ.get("test_path", "data/derived/test.parquet")
MAX_DEPTH = None
MAX_FEATURES = "sqrt"

if jeton_api.startswith("$"):
logger.info("API token has been configured properly")
else:
logger.warning("API token has not been configured")


# IMPORT ET STRUCTURATION DONNEES --------------------------------

p = pathlib.Path("data/derived/")
p.mkdir(parents=True, exist_ok=True)

TrainingData = pd.read_csv(data_path)

y = TrainingData["Survived"]
X = TrainingData.drop("Survived", axis="columns")

X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.1
)
pd.concat([X_train, y_train], axis = 1).to_parquet(data_train_path)
pd.concat([X_test, y_test], axis = 1).to_parquet(data_test_path)



# PIPELINE ----------------------------


# Create the pipeline
pipe = create_pipeline(
n_trees, max_depth=MAX_DEPTH, max_features=MAX_FEATURES
)


# ESTIMATION ET EVALUATION ----------------------

pipe.fit(X_train, y_train)


# Evaluate the model
score, matrix = evaluate_model(pipe, X_test, y_test)

logger.success(f"{score:.1%} de bonnes réponses sur les données de test pour validation")
logger.debug(20 * "-")
logger.info("Matrice de confusion")
logger.debug(matrix)
4 changes: 2 additions & 2 deletions titanic.ipynb → notebooks/titanic.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1181,8 +1181,8 @@
"y = TrainingData[\"Survived\"]\n",
"X = TrainingData.drop(\"Survived\", axis = 'columns')\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)\n",
"pd.concat([X_train, y_train]).to_csv(\"train.csv\")\n",
"pd.concat([X_test, y_test]).to_csv(\"test.csv\")"
"pd.concat([X_train, y_train], axis = 1).to_csv(\"train.csv\")\n",
"pd.concat([X_test, y_test], axis = 1).to_csv(\"test.csv\")"
]
},
{
Expand Down
6 changes: 6 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pandas
PyYAML
scikit-learn
python-dotenv
loguru
pyarrow
19 changes: 19 additions & 0 deletions src/models/train_evaluate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from loguru import logger
from sklearn.metrics import confusion_matrix

@logger.catch
def evaluate_model(pipe, X_test, y_test):
    """
    Score a fitted pipeline on held-out data.

    Args:
        pipe (sklearn.pipeline.Pipeline): The trained pipeline object.
        X_test (pandas.DataFrame): The test data.
        y_test (pandas.Series): The true labels for the test data.

    Returns:
        tuple: ``(score, matrix)`` — the pipeline's accuracy on the test
        set and the corresponding confusion matrix.
    """
    predictions = pipe.predict(X_test)
    accuracy = pipe.score(X_test, y_test)
    return accuracy, confusion_matrix(y_test, predictions)
108 changes: 108 additions & 0 deletions src/pipeline/build_pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
from loguru import logger

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

import pandas as pd

@logger.catch
def split_train_test(data, test_size, train_path="train.parquet", test_path="test.parquet"):
    """
    Split the data into training and testing sets based on the specified test size.
    Optionally, save the split datasets to Parquet files.

    The writer is ``DataFrame.to_parquet``, so the default file names now use
    the ``.parquet`` extension (previously they misleadingly ended in ``.csv``
    while still containing Parquet data).

    Args:
        data (pandas.DataFrame): The input data to split; must contain a
            "Survived" column used as the target.
        test_size (float): The proportion of the dataset to include in the test split.
        train_path (str, optional): The file path to save the training dataset.
            Pass a falsy value to skip writing. Defaults to "train.parquet".
        test_path (str, optional): The file path to save the testing dataset.
            Pass a falsy value to skip writing. Defaults to "test.parquet".

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.
    """
    y = data["Survived"]
    X = data.drop("Survived", axis="columns")

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

    # Persist the recombined (features + target) splits when a path is given
    if train_path:
        pd.concat([X_train, y_train], axis=1).to_parquet(train_path)
    if test_path:
        pd.concat([X_test, y_test], axis=1).to_parquet(test_path)

    return X_train, X_test, y_train, y_test

@logger.catch
def create_pipeline(
    n_trees,
    numeric_features=None,
    categorical_features=None,
    max_depth=None,
    max_features="sqrt",
):
    """
    Create a pipeline for preprocessing and model definition.

    Args:
        n_trees (int): The number of trees in the random forest.
        numeric_features (list, optional): The numeric features to be included in the pipeline.
            Defaults to ["Age", "Fare"].
        categorical_features (list, optional): The categorical features to be included
            in the pipeline.
            Defaults to ["Embarked", "Sex"].
        max_depth (int, optional): The maximum depth of the random forest. Defaults to None.
        max_features (str, optional): The maximum number of features to consider
            when looking for the best split.
            Defaults to "sqrt".

    Returns:
        sklearn.pipeline.Pipeline: The pipeline object.
    """
    # None-sentinel instead of mutable list defaults (shared across calls)
    if numeric_features is None:
        numeric_features = ["Age", "Fare"]
    if categorical_features is None:
        categorical_features = ["Embarked", "Sex"]

    # Numeric variables: impute with the median, then scale to [0, 1]
    numeric_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", MinMaxScaler()),
        ]
    )

    # Categorical variables: impute with the mode, then one-hot encode.
    # NOTE(review): OneHotEncoder() with default settings raises at predict
    # time on categories unseen during fit — consider handle_unknown="ignore".
    categorical_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder()),
        ]
    )

    # Column-wise preprocessing dispatch
    preprocessor = ColumnTransformer(
        transformers=[
            ("Preprocessing numerical", numeric_transformer, numeric_features),
            (
                "Preprocessing categorical",
                categorical_transformer,
                categorical_features,
            ),
        ]
    )

    # Full pipeline: preprocessing followed by the random-forest classifier
    pipe = Pipeline(
        [
            ("preprocessor", preprocessor),
            (
                "classifier",
                RandomForestClassifier(
                    n_estimators=n_trees, max_depth=max_depth, max_features=max_features
                ),
            ),
        ]
    )

    return pipe
Loading