Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions .github/workflows/prod.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# CI workflow: build the application Docker image and push it to Docker Hub.
name: Construction image Docker

on:
  # NOTE(review): this triggers on every push to every branch, so any branch
  # overwrites the :latest tag on Docker Hub — confirm this is intended.
  push:

jobs:
  docker:
    runs-on: ubuntu-latest
    steps:
      -
        # Emulation support so multi-arch builds are possible with Buildx.
        name: Set up QEMU
        uses: docker/setup-qemu-action@v3
      -
        name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      -
        name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          # Credentials come from repository variables/secrets, never from the YAML.
          username: ${{ vars.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      -
        # No checkout step: with no explicit context, build-push-action
        # builds from the Git context of the triggering commit.
        name: Build and push
        uses: docker/build-push-action@v5
        with:
          push: true
          tags: kellybrbn/application:latest
31 changes: 31 additions & 0 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# CI workflow: lint and smoke-test the Python code on pushes to main and dev.
name: Python package

on:
  push:
    branches:
      - main
      - dev


jobs:
  build:

    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          # latest python minor
          python-version: '3.x'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install pylint
      # Fails the job if the pylint score drops below 6/10.
      - name: Lint
        run: |
          pylint src --fail-under=6
      # NOTE(review): this runs the training script as a smoke test, not a
      # real test suite — consider adding pytest-based tests later.
      - name: Test Python code
        run: python train.py
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Local configuration holding the API token (see README) — never commit it.
config.yaml
# Python bytecode caches.
__pycache__/
# Raw/derived data files.
data/**/*.csv
data/**/*.parquet
# Local virtual environment created by install.sh.
titanic/
# Trained model artifact loaded by app/api.py.
model.joblib
15 changes: 15 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
FROM ubuntu:22.04

# Install Python, then drop the apt package lists so they do not bloat the
# image layer.
RUN apt-get -y update && \
    apt-get install -y python3-pip && \
    rm -rf /var/lib/apt/lists/*

# Install project dependencies before copying sources so this layer stays
# cached as long as requirements.txt is unchanged. --no-cache-dir keeps
# pip's download cache out of the image.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY src ./src
COPY train.py .
COPY app ./app

# run.sh trains the model, then serves it with uvicorn (see app/run.sh).
CMD ["bash", "-c", "./app/run.sh"]
16 changes: 16 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Probabilité de survie sur le Titanic

Pour pouvoir utiliser ce projet, il
est recommandé de créer un fichier `config.yaml`
ayant la structure suivante:

```yaml
jeton_api: ####
data_path: https://minio.lab.sspcloud.fr/lgaliana/ensae-reproductibilite/data/raw/data.csv
```

Pour installer les dépendances

```bash
pip install -r requirements.txt
```
51 changes: 51 additions & 0 deletions app/api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""A simple API to expose our trained RandomForest model for Titanic survival."""
from fastapi import FastAPI
from joblib import load

import pandas as pd

# Deserialize the model trained by train.py; run.sh guarantees training
# happens before the API starts, so the file exists at import time.
model = load('model.joblib')

app = FastAPI(
    title="Prédiction de survie sur le Titanic",
    description=
    "Application de prédiction de survie sur le Titanic 🚢 <br>Une version par API pour faciliter la réutilisation du modèle 🚀" +\
    "<br><br><img src=\"https://media.vogue.fr/photos/5faac06d39c5194ff9752ec9/1:1/w_2404,h_2404,c_limit/076_CHL_126884.jpg\" width=\"200\">"
)


@app.get("/", tags=["Welcome"])
def show_welcome_page():
    """Show welcome page with model name and version."""
    welcome = {
        "Message": "API de prédiction de survie sur le Titanic",
        "Model_name": "Titanic ML",
        "Model_version": "0.1",
    }
    return welcome


@app.get("/predict", tags=["Predict"])
async def predict(
    sex: str = "female",
    age: float = 29.0,
    fare: float = 16.5,
    embarked: str = "S"
) -> str:
    """
    Predict survival on the Titanic for a single passenger.

    Args:
        sex: Passenger sex ("female" or "male").
        age: Passenger age, in years.
        fare: Ticket fare paid.
        embarked: Port of embarkation code ("S", "C" or "Q" — TODO confirm
            against the training data's Embarked values).

    Returns:
        "Survived 🎉" if the model predicts survival, "Dead ⚰️" otherwise.
    """
    # Column names must match the features the pipeline was trained on.
    df = pd.DataFrame(
        {
            "Sex": [sex],
            "Age": [age],
            "Fare": [fare],
            "Embarked": [embarked],
        }
    )

    # predict() returns an array of one element; index it explicitly instead
    # of calling int() on the whole ndarray, which NumPy deprecates for
    # arrays of dimension > 0.
    prediction = "Survived 🎉" if int(model.predict(df)[0]) == 1 else "Dead ⚰️"

    return prediction
3 changes: 3 additions & 0 deletions app/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash
# Bug fix: the shebang was "#/bin/bash" (missing "!"), so the kernel could
# not use it as an interpreter directive.
# Train the model, then serve it over HTTP with the FastAPI app.
python3 train.py
uvicorn app.api:app --host "0.0.0.0"
892 changes: 0 additions & 892 deletions data.csv

This file was deleted.

21 changes: 21 additions & 0 deletions deployment/deployment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Deployment running the API image built by .github/workflows/prod.yaml.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: titanic-deployment
  labels:
    app: titanic
spec:
  replicas: 1
  selector:
    matchLabels:
      app: titanic
  template:
    metadata:
      labels:
        app: titanic
    spec:
      containers:
        - name: titanic
          image: kellybrbn/application:latest
          ports:
            # Port the service targetPort points at (see service.yaml);
            # presumably uvicorn's default port — confirm against run.sh.
            - containerPort: 8000
20 changes: 20 additions & 0 deletions deployment/ingress.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Ingress exposing the API on a public SSPCloud hostname over TLS.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: titanic-ingress
spec:
  ingressClassName: nginx
  tls:
    - hosts:
        - titanic-kelly.lab.sspcloud.fr
  rules:
    - host: titanic-kelly.lab.sspcloud.fr
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                # Routes to the ClusterIP service defined in service.yaml.
                name: titanic-service
                port:
                  number: 80
11 changes: 11 additions & 0 deletions deployment/service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Service fronting the deployment's pods (selected by the app=titanic label).
apiVersion: v1
kind: Service
metadata:
  name: titanic-service
spec:
  selector:
    app: titanic
  ports:
    - protocol: TCP
      # Exposed on port 80 inside the cluster, forwarded to the container's
      # port 8000 (see deployment.yaml containerPort).
      port: 80
      targetPort: 8000
17 changes: 17 additions & 0 deletions install.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash
# Set up a local development environment: system Python, a virtualenv,
# and the project dependencies.

# Abort on the first failing command so a partial install is not silently
# mistaken for success.
set -e

# Install Python

apt-get -y update

apt-get install -y python3-pip python3-venv

# Create empty virtual environment

python3 -m venv titanic

# Activation only affects this script's shell; callers must re-activate
# ("source titanic/bin/activate") in their own session afterwards.
source titanic/bin/activate

# Install project dependencies

pip install -r requirements.txt
4 changes: 2 additions & 2 deletions titanic.ipynb → notebooks/titanic.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1181,8 +1181,8 @@
"y = TrainingData[\"Survived\"]\n",
"X = TrainingData.drop(\"Survived\", axis = 'columns')\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)\n",
"pd.concat([X_train, y_train]).to_csv(\"train.csv\")\n",
"pd.concat([X_test, y_test]).to_csv(\"test.csv\")"
"pd.concat([X_train, y_train], axis = 1).to_csv(\"train.csv\")\n",
"pd.concat([X_test, y_test], axis = 1).to_csv(\"test.csv\")"
]
},
{
Expand Down
8 changes: 8 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
pandas
PyYAML
scikit-learn
python-dotenv
loguru
pyarrow
uvicorn
fastapi
19 changes: 19 additions & 0 deletions src/models/train_evaluate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from loguru import logger
from sklearn.metrics import confusion_matrix

@logger.catch
def evaluate_model(pipe, X_test, y_test):
    """
    Evaluate the model by calculating the score and confusion matrix.

    Args:
        pipe (sklearn.pipeline.Pipeline): The trained pipeline object.
        X_test (pandas.DataFrame): The test data.
        y_test (pandas.Series): The true labels for the test data.

    Returns:
        tuple: A tuple containing the score and confusion matrix.
    """
    predictions = pipe.predict(X_test)
    accuracy = pipe.score(X_test, y_test)
    return accuracy, confusion_matrix(y_test, predictions)
108 changes: 108 additions & 0 deletions src/pipeline/build_pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
from loguru import logger

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

import pandas as pd

@logger.catch
def split_train_test(data, test_size, train_path="train.csv", test_path="test.csv"):
    """
    Split the data into training and testing sets based on the specified test size.
    Optionally, save the split datasets to Parquet files.

    Args:
        data (pandas.DataFrame): The input data to split. Must contain a
            "Survived" column, which is used as the target.
        test_size (float): The proportion of the dataset to include in the test split.
        train_path (str, optional): The file path to save the training dataset.
            Written with ``to_parquet`` despite the ``.csv`` default extension.
            Pass a falsy value to skip saving. Defaults to "train.csv".
        test_path (str, optional): The file path to save the testing dataset.
            Written with ``to_parquet`` despite the ``.csv`` default extension.
            Pass a falsy value to skip saving. Defaults to "test.csv".

    Returns:
        tuple: (X_train, X_test, y_train, y_test).
    """
    y = data["Survived"]
    X = data.drop("Survived", axis="columns")

    # NOTE(review): no random_state is set, so the split differs on every
    # call — confirm reproducibility is not required here.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

    if train_path:
        # axis=1 keeps features and target side by side as columns.
        pd.concat([X_train, y_train], axis=1).to_parquet(train_path)
    if test_path:
        pd.concat([X_test, y_test], axis=1).to_parquet(test_path)

    return X_train, X_test, y_train, y_test

@logger.catch
def create_pipeline(
    n_trees,
    numeric_features=None,
    categorical_features=None,
    max_depth=None,
    max_features="sqrt",
):
    """
    Create a pipeline for preprocessing and model definition.

    Args:
        n_trees (int): The number of trees in the random forest.
        numeric_features (list, optional): The numeric features to be included in the pipeline.
            Defaults to ["Age", "Fare"].
        categorical_features (list, optional): The categorical features to be included
            in the pipeline.
            Defaults to ["Embarked", "Sex"].
        max_depth (int, optional): The maximum depth of the random forest. Defaults to None.
        max_features (str, optional): The maximum number of features to consider
            when looking for the best split.
            Defaults to "sqrt".

    Returns:
        sklearn.pipeline.Pipeline: The pipeline object.
    """
    # Mutable (list) default arguments are shared across calls; use None as
    # the sentinel and build fresh lists here instead.
    if numeric_features is None:
        numeric_features = ["Age", "Fare"]
    if categorical_features is None:
        categorical_features = ["Embarked", "Sex"]

    # Numeric variables: impute missing values with the median, then scale to [0, 1].
    numeric_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", MinMaxScaler()),
        ]
    )

    # Categorical variables: impute with the mode, then one-hot encode.
    # NOTE(review): OneHotEncoder uses its default handle_unknown behavior;
    # unseen categories at predict time would raise — confirm acceptable.
    categorical_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder()),
        ]
    )

    # Preprocessing: apply each transformer to its own column subset.
    preprocessor = ColumnTransformer(
        transformers=[
            ("Preprocessing numerical", numeric_transformer, numeric_features),
            (
                "Preprocessing categorical",
                categorical_transformer,
                categorical_features,
            ),
        ]
    )

    # Full pipeline: preprocessing followed by the random-forest classifier.
    pipe = Pipeline(
        [
            ("preprocessor", preprocessor),
            (
                "classifier",
                RandomForestClassifier(
                    n_estimators=n_trees, max_depth=max_depth, max_features=max_features
                ),
            ),
        ]
    )

    return pipe
Loading