Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions .github/workflows/prod.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# CI workflow: build the application Docker image and push it to Docker Hub.
name: Construction image Docker

on:
  # NOTE(review): this triggers on every push to every branch, so any branch
  # overwrites the :latest tag on Docker Hub — confirm this is intended.
  push:

jobs:
  docker:
    runs-on: ubuntu-latest
    steps:
      -
        # Emulation support so multi-arch builds are possible with Buildx.
        name: Set up QEMU
        uses: docker/setup-qemu-action@v3
      -
        name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      -
        name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          # Credentials come from repository variables/secrets, never from the YAML.
          username: ${{ vars.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      -
        # No checkout step: with no explicit context, build-push-action
        # builds from the Git context of the triggering commit.
        name: Build and push
        uses: docker/build-push-action@v5
        with:
          push: true
          tags: kellybrbn/application:latest
31 changes: 31 additions & 0 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# CI workflow: lint and smoke-test the Python code on pushes to main and dev.
name: Python package

on:
  push:
    branches:
      - main
      - dev


jobs:
  build:

    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          # latest python minor
          python-version: '3.x'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install pylint
      # Fails the job if the pylint score drops below 6/10.
      - name: Lint
        run: |
          pylint src --fail-under=6
      # NOTE(review): this runs the training script as a smoke test, not a
      # real test suite — consider adding pytest-based tests later.
      - name: Test Python code
        run: python train.py
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Local configuration holding the API token (see README) — never commit it.
config.yaml
# Python bytecode caches.
__pycache__/
# Raw/derived data files.
data/**/*.csv
data/**/*.parquet
# Local virtual environment created by install.sh.
titanic/
# Trained model artifact loaded by app/api.py.
model.joblib
15 changes: 15 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
FROM ubuntu:22.04

# Install Python, then drop the apt package lists so they do not bloat the
# image layer.
RUN apt-get -y update && \
    apt-get install -y python3-pip && \
    rm -rf /var/lib/apt/lists/*

# Install project dependencies before copying sources so this layer stays
# cached as long as requirements.txt is unchanged. --no-cache-dir keeps
# pip's download cache out of the image.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY src ./src
COPY train.py .
COPY app ./app

# run.sh trains the model, then serves it with uvicorn (see app/run.sh).
CMD ["bash", "-c", "./app/run.sh"]
16 changes: 16 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Probabilité de survie sur le Titanic

Pour pouvoir utiliser ce projet, il
est recommandé de créer un fichier `config.yaml`
ayant la structure suivante:

```yaml
jeton_api: ####
data_path: https://minio.lab.sspcloud.fr/lgaliana/ensae-reproductibilite/data/raw/data.csv
```

Pour installer les dépendances

```bash
pip install -r requirements.txt
```
51 changes: 51 additions & 0 deletions app/api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""A simple API to expose our trained RandomForest model for Titanic survival."""
from fastapi import FastAPI
from joblib import load

import pandas as pd

# Deserialize the model trained by train.py; run.sh guarantees training
# happens before the API starts, so the file exists at import time.
model = load('model.joblib')

app = FastAPI(
    title="Prédiction de survie sur le Titanic",
    description=
    "Application de prédiction de survie sur le Titanic 🚢 <br>Une version par API pour faciliter la réutilisation du modèle 🚀" +\
    "<br><br><img src=\"https://media.vogue.fr/photos/5faac06d39c5194ff9752ec9/1:1/w_2404,h_2404,c_limit/076_CHL_126884.jpg\" width=\"200\">"
)


@app.get("/", tags=["Welcome"])
def show_welcome_page():
    """Show welcome page with model name and version."""
    welcome = {
        "Message": "API de prédiction de survie sur le Titanic",
        "Model_name": "Titanic ML",
        "Model_version": "0.1",
    }
    return welcome


@app.get("/predict", tags=["Predict"])
async def predict(
    sex: str = "female",
    age: float = 29.0,
    fare: float = 16.5,
    embarked: str = "S"
) -> str:
    """
    Predict survival on the Titanic for a single passenger.

    Args:
        sex: Passenger sex ("female" or "male").
        age: Passenger age, in years.
        fare: Ticket fare paid.
        embarked: Port of embarkation code ("S", "C" or "Q" — TODO confirm
            against the training data's Embarked values).

    Returns:
        "Survived 🎉" if the model predicts survival, "Dead ⚰️" otherwise.
    """
    # Column names must match the features the pipeline was trained on.
    df = pd.DataFrame(
        {
            "Sex": [sex],
            "Age": [age],
            "Fare": [fare],
            "Embarked": [embarked],
        }
    )

    # predict() returns an array of one element; index it explicitly instead
    # of calling int() on the whole ndarray, which NumPy deprecates for
    # arrays of dimension > 0.
    prediction = "Survived 🎉" if int(model.predict(df)[0]) == 1 else "Dead ⚰️"

    return prediction
3 changes: 3 additions & 0 deletions app/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash
# Bug fix: the shebang was "#/bin/bash" (missing "!"), so the kernel could
# not use it as an interpreter directive.
# Train the model, then serve it over HTTP with the FastAPI app.
python3 train.py
uvicorn app.api:app --host "0.0.0.0"
892 changes: 0 additions & 892 deletions data.csv

This file was deleted.

21 changes: 21 additions & 0 deletions deployment/deployment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Deployment running the API image built by .github/workflows/prod.yaml.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: titanic-deployment
  labels:
    app: titanic
spec:
  replicas: 1
  selector:
    matchLabels:
      app: titanic
  template:
    metadata:
      labels:
        app: titanic
    spec:
      containers:
        - name: titanic
          image: kellybrbn/application:latest
          ports:
            # Port the service targetPort points at (see service.yaml);
            # presumably uvicorn's default port — confirm against run.sh.
            - containerPort: 8000
20 changes: 20 additions & 0 deletions deployment/ingress.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Ingress exposing the API on a public SSPCloud hostname over TLS.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: titanic-ingress
spec:
  ingressClassName: nginx
  tls:
    - hosts:
        - titanic-kelly.lab.sspcloud.fr
  rules:
    - host: titanic-kelly.lab.sspcloud.fr
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                # Routes to the ClusterIP service defined in service.yaml.
                name: titanic-service
                port:
                  number: 80
11 changes: 11 additions & 0 deletions deployment/service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Service fronting the deployment's pods (selected by the app=titanic label).
apiVersion: v1
kind: Service
metadata:
  name: titanic-service
spec:
  selector:
    app: titanic
  ports:
    - protocol: TCP
      # Exposed on port 80 inside the cluster, forwarded to the container's
      # port 8000 (see deployment.yaml containerPort).
      port: 80
      targetPort: 8000
17 changes: 17 additions & 0 deletions install.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash
# Set up a local development environment: system Python, a virtualenv,
# and the project dependencies.

# Abort on the first failing command so a partial install is not silently
# mistaken for success.
set -e

# Install Python

apt-get -y update

apt-get install -y python3-pip python3-venv

# Create empty virtual environment

python3 -m venv titanic

# Activation only affects this script's shell; callers must re-activate
# ("source titanic/bin/activate") in their own session afterwards.
source titanic/bin/activate

# Install project dependencies

pip install -r requirements.txt
4 changes: 2 additions & 2 deletions titanic.ipynb → notebooks/titanic.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1181,8 +1181,8 @@
"y = TrainingData[\"Survived\"]\n",
"X = TrainingData.drop(\"Survived\", axis = 'columns')\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)\n",
"pd.concat([X_train, y_train]).to_csv(\"train.csv\")\n",
"pd.concat([X_test, y_test]).to_csv(\"test.csv\")"
"pd.concat([X_train, y_train], axis = 1).to_csv(\"train.csv\")\n",
"pd.concat([X_test, y_test], axis = 1).to_csv(\"test.csv\")"
]
},
{
Expand Down
8 changes: 8 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
pandas
PyYAML
scikit-learn
python-dotenv
loguru
pyarrow
uvicorn
fastapi
19 changes: 19 additions & 0 deletions src/models/train_evaluate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from loguru import logger
from sklearn.metrics import confusion_matrix

@logger.catch
def evaluate_model(pipe, X_test, y_test):
    """
    Evaluate the model by calculating the score and confusion matrix.

    Args:
        pipe (sklearn.pipeline.Pipeline): The trained pipeline object.
        X_test (pandas.DataFrame): The test data.
        y_test (pandas.Series): The true labels for the test data.

    Returns:
        tuple: A tuple containing the score and confusion matrix.
    """
    predictions = pipe.predict(X_test)
    accuracy = pipe.score(X_test, y_test)
    return accuracy, confusion_matrix(y_test, predictions)
108 changes: 108 additions & 0 deletions src/pipeline/build_pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
from loguru import logger

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

import pandas as pd

@logger.catch
def split_train_test(data, test_size, train_path="train.csv", test_path="test.csv"):
    """
    Split the data into training and testing sets based on the specified test size.
    Optionally, save the split datasets to Parquet files.

    Args:
        data (pandas.DataFrame): The input data to split. Must contain a
            "Survived" column, which is used as the target.
        test_size (float): The proportion of the dataset to include in the test split.
        train_path (str, optional): The file path to save the training dataset.
            Written with ``to_parquet`` despite the ``.csv`` default extension.
            Pass a falsy value to skip saving. Defaults to "train.csv".
        test_path (str, optional): The file path to save the testing dataset.
            Written with ``to_parquet`` despite the ``.csv`` default extension.
            Pass a falsy value to skip saving. Defaults to "test.csv".

    Returns:
        tuple: (X_train, X_test, y_train, y_test).
    """
    y = data["Survived"]
    X = data.drop("Survived", axis="columns")

    # NOTE(review): no random_state is set, so the split differs on every
    # call — confirm reproducibility is not required here.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

    if train_path:
        # axis=1 keeps features and target side by side as columns.
        pd.concat([X_train, y_train], axis=1).to_parquet(train_path)
    if test_path:
        pd.concat([X_test, y_test], axis=1).to_parquet(test_path)

    return X_train, X_test, y_train, y_test

@logger.catch
def create_pipeline(
    n_trees,
    numeric_features=None,
    categorical_features=None,
    max_depth=None,
    max_features="sqrt",
):
    """
    Create a pipeline for preprocessing and model definition.

    Args:
        n_trees (int): The number of trees in the random forest.
        numeric_features (list, optional): The numeric features to be included in the pipeline.
            Defaults to ["Age", "Fare"].
        categorical_features (list, optional): The categorical features to be included
            in the pipeline.
            Defaults to ["Embarked", "Sex"].
        max_depth (int, optional): The maximum depth of the random forest. Defaults to None.
        max_features (str, optional): The maximum number of features to consider
            when looking for the best split.
            Defaults to "sqrt".

    Returns:
        sklearn.pipeline.Pipeline: The pipeline object.
    """
    # Mutable (list) default arguments are shared across calls; use None as
    # the sentinel and build fresh lists here instead.
    if numeric_features is None:
        numeric_features = ["Age", "Fare"]
    if categorical_features is None:
        categorical_features = ["Embarked", "Sex"]

    # Numeric variables: impute missing values with the median, then scale to [0, 1].
    numeric_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", MinMaxScaler()),
        ]
    )

    # Categorical variables: impute with the mode, then one-hot encode.
    # NOTE(review): OneHotEncoder uses its default handle_unknown behavior;
    # unseen categories at predict time would raise — confirm acceptable.
    categorical_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder()),
        ]
    )

    # Preprocessing: apply each transformer to its own column subset.
    preprocessor = ColumnTransformer(
        transformers=[
            ("Preprocessing numerical", numeric_transformer, numeric_features),
            (
                "Preprocessing categorical",
                categorical_transformer,
                categorical_features,
            ),
        ]
    )

    # Full pipeline: preprocessing followed by the random-forest classifier.
    pipe = Pipeline(
        [
            ("preprocessor", preprocessor),
            (
                "classifier",
                RandomForestClassifier(
                    n_estimators=n_trees, max_depth=max_depth, max_features=max_features
                ),
            ),
        ]
    )

    return pipe
Loading