Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions .github/workflows/prod.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Build and publish the application Docker image on pushes to main/dev.
name: Construction image Docker

on:
  push:
    branches:
      - main
      - dev


jobs:
  docker:
    runs-on: ubuntu-latest
    steps:
      -
        name: Set up QEMU
        uses: docker/setup-qemu-action@v3
      -
        name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      -
        name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      -
        name: Build and push
        uses: docker/build-push-action@v5
        with:
          push: true
          # NOTE(review): placeholder account name — replace with the real
          # Docker Hub namespace before this workflow can push successfully.
          tags: votre_compte_docker_hub/application:latest
31 changes: 31 additions & 0 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Lint and smoke-test the Python package on pushes to main/dev.
name: Python package

on:
  push:
    branches:
      - main
      - dev


jobs:
  build:

    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          # latest python minor
          python-version: '3.x'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install pylint
      - name: Lint
        run: |
          pylint src --fail-under=6
      - name: Test Python code
        run: python main.py
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
config.yaml
__pycache__/
data/**/*.csv
titanic/
14 changes: 14 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
FROM ubuntu:22.04

# Install Python; drop the apt package lists afterwards so they are not
# baked into the image layer.
RUN apt-get -y update && \
    apt-get install -y --no-install-recommends python3-pip && \
    rm -rf /var/lib/apt/lists/*

# Install project dependencies (skip pip's download cache to keep the image small)
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY src ./src
COPY main.py .

CMD ["python3", "main.py"]
16 changes: 16 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Probabilité de survie sur le Titanic

Pour pouvoir utiliser ce projet, il
est recommandé de créer un fichier `config.yaml`
ayant la structure suivante:

```yaml
jeton_api: ####
data_path: https://minio.lab.sspcloud.fr/lgaliana/ensae-reproductibilite/data/raw/data.csv
```

Pour installer les dépendances :

```bash
pip install -r requirements.txt
```
892 changes: 0 additions & 892 deletions data.csv

This file was deleted.

17 changes: 17 additions & 0 deletions install.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash

# Abort on the first failing command: without this, a failed apt or venv
# step was silently ignored and pip installed into the system Python.
set -e

# Install Python

apt-get -y update

apt-get install -y python3-pip python3-venv

# Create empty virtual environment

python3 -m venv titanic

# Activation only affects this script's shell; callers must re-activate
# (`source titanic/bin/activate`) to use the environment afterwards.
source titanic/bin/activate

# Install project dependencies

pip install -r requirements.txt
83 changes: 83 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
"""
Prediction de la survie d'un individu sur le Titanic
"""

import os
from dotenv import load_dotenv
import argparse
from loguru import logger

import pathlib
import pandas as pd
from sklearn.model_selection import train_test_split

from src.pipeline.build_pipeline import create_pipeline
from src.models.train_evaluate import evaluate_model


# ENVIRONMENT CONFIGURATION ---------------------------

logger.add("recording.log", rotation="500 MB")
load_dotenv()

parser = argparse.ArgumentParser(description="Paramètres du random forest")
parser.add_argument(
"--n_trees", type=int, default=20, help="Nombre d'arbres"
)
args = parser.parse_args()

URL_RAW = "https://minio.lab.sspcloud.fr/lgaliana/ensae-reproductibilite/data/raw/data.csv"

n_trees = args.n_trees
jeton_api = os.environ.get("JETON_API", "")
data_path = os.environ.get("data_path", URL_RAW)
data_train_path = os.environ.get("train_path", "data/derived/train.parquet")
data_test_path = os.environ.get("test_path", "data/derived/test.parquet")
MAX_DEPTH = None
MAX_FEATURES = "sqrt"

if jeton_api.startswith("$"):
logger.info("API token has been configured properly")
else:
logger.warning("API token has not been configured")


# IMPORT ET STRUCTURATION DONNEES --------------------------------

p = pathlib.Path("data/derived/")
p.mkdir(parents=True, exist_ok=True)

TrainingData = pd.read_csv(data_path)

y = TrainingData["Survived"]
X = TrainingData.drop("Survived", axis="columns")

X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.1
)
pd.concat([X_train, y_train], axis = 1).to_parquet(data_train_path)
pd.concat([X_test, y_test], axis = 1).to_parquet(data_test_path)



# PIPELINE ----------------------------


# Create the pipeline
pipe = create_pipeline(
n_trees, max_depth=MAX_DEPTH, max_features=MAX_FEATURES
)


# ESTIMATION ET EVALUATION ----------------------

pipe.fit(X_train, y_train)


# Evaluate the model
score, matrix = evaluate_model(pipe, X_test, y_test)

logger.success(f"{score:.1%} de bonnes réponses sur les données de test pour validation")
logger.debug(20 * "-")
logger.info("Matrice de confusion")
logger.debug(matrix)
4 changes: 2 additions & 2 deletions titanic.ipynb → notebooks/titanic.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1181,8 +1181,8 @@
"y = TrainingData[\"Survived\"]\n",
"X = TrainingData.drop(\"Survived\", axis = 'columns')\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)\n",
"pd.concat([X_train, y_train]).to_csv(\"train.csv\")\n",
"pd.concat([X_test, y_test]).to_csv(\"test.csv\")"
"pd.concat([X_train, y_train], axis = 1).to_csv(\"train.csv\")\n",
"pd.concat([X_test, y_test], axis = 1).to_csv(\"test.csv\")"
]
},
{
Expand Down
6 changes: 6 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pandas
PyYAML
scikit-learn
python-dotenv
loguru
pyarrow
19 changes: 19 additions & 0 deletions src/models/train_evaluate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from loguru import logger
from sklearn.metrics import confusion_matrix

@logger.catch
def evaluate_model(pipe, X_test, y_test):
    """
    Score a fitted pipeline on held-out data.

    Args:
        pipe (sklearn.pipeline.Pipeline): The trained pipeline object.
        X_test (pandas.DataFrame): The test data.
        y_test (pandas.Series): The true labels for the test data.

    Returns:
        tuple: ``(score, matrix)`` — the pipeline's accuracy on the test
        set and the corresponding confusion matrix.
    """
    predictions = pipe.predict(X_test)
    accuracy = pipe.score(X_test, y_test)
    return accuracy, confusion_matrix(y_test, predictions)
108 changes: 108 additions & 0 deletions src/pipeline/build_pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
from loguru import logger

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

import pandas as pd

@logger.catch
def split_train_test(data, test_size, train_path="train.parquet", test_path="test.parquet"):
    """
    Split the data into training and testing sets based on the specified test size.
    Optionally, save the split datasets to Parquet files.

    The writer is ``DataFrame.to_parquet``, so the default file names now use
    the ``.parquet`` extension (previously they misleadingly ended in ``.csv``
    while still containing Parquet data).

    Args:
        data (pandas.DataFrame): The input data to split; must contain a
            "Survived" column used as the target.
        test_size (float): The proportion of the dataset to include in the test split.
        train_path (str, optional): The file path to save the training dataset.
            Pass a falsy value to skip writing. Defaults to "train.parquet".
        test_path (str, optional): The file path to save the testing dataset.
            Pass a falsy value to skip writing. Defaults to "test.parquet".

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.
    """
    y = data["Survived"]
    X = data.drop("Survived", axis="columns")

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

    # Persist the recombined (features + target) splits when a path is given
    if train_path:
        pd.concat([X_train, y_train], axis=1).to_parquet(train_path)
    if test_path:
        pd.concat([X_test, y_test], axis=1).to_parquet(test_path)

    return X_train, X_test, y_train, y_test

@logger.catch
def create_pipeline(
    n_trees,
    numeric_features=None,
    categorical_features=None,
    max_depth=None,
    max_features="sqrt",
):
    """
    Create a pipeline for preprocessing and model definition.

    Args:
        n_trees (int): The number of trees in the random forest.
        numeric_features (list, optional): The numeric features to be included in the pipeline.
            Defaults to ["Age", "Fare"].
        categorical_features (list, optional): The categorical features to be included
            in the pipeline.
            Defaults to ["Embarked", "Sex"].
        max_depth (int, optional): The maximum depth of the random forest. Defaults to None.
        max_features (str, optional): The maximum number of features to consider
            when looking for the best split.
            Defaults to "sqrt".

    Returns:
        sklearn.pipeline.Pipeline: The pipeline object.
    """
    # None-sentinel instead of mutable list defaults (shared across calls)
    if numeric_features is None:
        numeric_features = ["Age", "Fare"]
    if categorical_features is None:
        categorical_features = ["Embarked", "Sex"]

    # Numeric variables: impute with the median, then scale to [0, 1]
    numeric_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", MinMaxScaler()),
        ]
    )

    # Categorical variables: impute with the mode, then one-hot encode.
    # NOTE(review): OneHotEncoder() with default settings raises at predict
    # time on categories unseen during fit — consider handle_unknown="ignore".
    categorical_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder()),
        ]
    )

    # Column-wise preprocessing dispatch
    preprocessor = ColumnTransformer(
        transformers=[
            ("Preprocessing numerical", numeric_transformer, numeric_features),
            (
                "Preprocessing categorical",
                categorical_transformer,
                categorical_features,
            ),
        ]
    )

    # Full pipeline: preprocessing followed by the random-forest classifier
    pipe = Pipeline(
        [
            ("preprocessor", preprocessor),
            (
                "classifier",
                RandomForestClassifier(
                    n_estimators=n_trees, max_depth=max_depth, max_features=max_features
                ),
            ),
        ]
    )

    return pipe
Loading