forked from ensae-reproductibilite/application
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
83 lines (56 loc) · 2.11 KB
/
main.py
File metadata and controls
83 lines (56 loc) · 2.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
"""
Prediction de la survie d'un individu sur le Titanic
"""
import os
from dotenv import load_dotenv
import argparse
from loguru import logger
import pathlib
import pandas as pd
from sklearn.model_selection import train_test_split
from src.pipeline.build_pipeline import create_pipeline
from src.models.train_evaluate import evaluate_model
# ENVIRONMENT CONFIGURATION ---------------------------
logger.add("recording.log", rotation="500 MB")
load_dotenv()

parser = argparse.ArgumentParser(description="Paramètres du random forest")
parser.add_argument(
    "--n_trees", type=int, default=20, help="Nombre d'arbres"
)
args = parser.parse_args()

# Default public data source; can be overridden via the environment.
URL_RAW = "https://minio.lab.sspcloud.fr/lgaliana/ensae-reproductibilite/data/raw/data.csv"

n_trees = args.n_trees
jeton_api = os.environ.get("JETON_API", "")
data_path = os.environ.get("data_path", URL_RAW)
data_train_path = os.environ.get("train_path", "data/derived/train.parquet")
data_test_path = os.environ.get("test_path", "data/derived/test.parquet")

# Random-forest hyperparameters kept fixed for this run.
MAX_DEPTH = None
MAX_FEATURES = "sqrt"

# FIX: the original condition was inverted — it reported "configured
# properly" when the token started with "$", i.e. precisely when the shell
# variable was NOT expanded. A usable token is non-empty and is not a
# leftover "$VAR" placeholder.
if jeton_api and not jeton_api.startswith("$"):
    logger.info("API token has been configured properly")
else:
    logger.warning("API token has not been configured")

# IMPORT ET STRUCTURATION DONNEES --------------------------------
# Make sure the output directory exists before writing the parquet files.
pathlib.Path("data/derived/").mkdir(parents=True, exist_ok=True)

TrainingData = pd.read_csv(data_path)

# "Survived" is the label; everything else is a feature.
y = TrainingData["Survived"]
X = TrainingData.drop("Survived", axis="columns")

# FIX: fixed seed so the train/test split — and thus the persisted parquet
# files and the reported score — are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)
pd.concat([X_train, y_train], axis=1).to_parquet(data_train_path)
pd.concat([X_test, y_test], axis=1).to_parquet(data_test_path)

# PIPELINE ----------------------------
# Create the pipeline (preprocessing + random forest).
pipe = create_pipeline(
    n_trees, max_depth=MAX_DEPTH, max_features=MAX_FEATURES
)

# ESTIMATION ET EVALUATION ----------------------
pipe.fit(X_train, y_train)

# Evaluate the model: accuracy score and confusion matrix on held-out data.
score, matrix = evaluate_model(pipe, X_test, y_test)
logger.success(f"{score:.1%} de bonnes réponses sur les données de test pour validation")
logger.debug(20 * "-")
logger.info("Matrice de confusion")
logger.debug(matrix)