glucose/data_acquisition.py at master · Livia-Zaharia/glucose · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
"""
Data acquisition module that imports the data from the csv
"""

from pathlib import Path

import pandas as pd

# Base directory against which input file names are resolved.
# Captured once when this module is imported, so a later change of the
# process working directory does not affect it.
CURRENT_PATH_CWD = Path.cwd()


def read_csv_file(file_name: str) -> pd.DataFrame:
    """
    Read the input CSV file and return the DataFrame.

    The file name is joined onto ``CURRENT_PATH_CWD``; an absolute
    ``file_name`` therefore replaces the base directory (pathlib join
    semantics). The first CSV column is used as the DataFrame index.

    Args:
        file_name: The name of the input CSV file.

    Returns:
        A Pandas DataFrame containing the CSV data.

    Raises:
        FileNotFoundError: If no regular file exists at the resolved path.
            It is a subclass of ``Exception``, so callers that catch
            ``Exception`` keep working. Errors raised by ``pd.read_csv``
            while parsing are propagated unchanged.
    """
    # Combine the base directory with the file name
    file_path = CURRENT_PATH_CWD / file_name

    # Fail early with a specific exception type. ``is_file`` also rejects a
    # directory that happens to carry the expected name, which ``exists``
    # would have let through to ``pd.read_csv``.
    if not file_path.is_file():
        raise FileNotFoundError(f'The expected input file name does not exist at path: {file_path}')

    # Read the CSV file, using its first column as the index
    return pd.read_csv(filepath_or_buffer=file_path, index_col=0)


def get_insulin_data(file_name: str) -> pd.DataFrame:
    """
    Load a CSV file and return its insulin records as a cleaned DataFrame.

    This is a two-step pipeline: the raw file is loaded with
    ``read_csv_file`` and the result is handed to
    ``_preprocess_insulin_data``.

    Args:
        file_name: The name of the input CSV file.

    Returns:
        A preprocessed Pandas DataFrame containing insulin data.

    Raises:
        Exception: Propagated from ``read_csv_file`` when the input file
            cannot be found.
    """
    # Load first, then clean; no intermediate state is kept here
    return _preprocess_insulin_data(df=read_csv_file(file_name=file_name))


def get_glucose_data(file_name: str) -> pd.DataFrame:
    """
    Load a CSV file and return its glucose readings as a cleaned DataFrame.

    This is a two-step pipeline: the raw file is loaded with
    ``read_csv_file`` and the result is handed to
    ``_preprocess_glucose_data``.

    Args:
        file_name: The name of the input CSV file.

    Returns:
        A preprocessed Pandas DataFrame containing glucose data.

    Raises:
        Exception: Propagated from ``read_csv_file`` when the input file
            cannot be found.
    """
    # Load first, then clean; no intermediate state is kept here
    return _preprocess_glucose_data(df=read_csv_file(file_name=file_name))


def _preprocess_glucose_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Preprocess the DataFrame containing glucose data by removing nulls and renaming columns.

    Args:
        df: A Pandas DataFrame containing the glucose data.

    Returns:
        A preprocessed Pandas DataFrame containing glucose data.
    """
    # Select the relevant columns from the input DataFrame
    glucose = df[['Timestamp (YYYY-MM-DDThh:mm:ss)', 'Glucose Value (mg/dL)']]

    # Convert the timestamp column to datetime format
    glucose['Timestamp (YYYY-MM-DDThh:mm:ss)'] = pd.to_datetime(glucose['Timestamp (YYYY-MM-DDThh:mm:ss)'])

    # Convert the glucose value column to numeric, setting any errors to NaN
    glucose['Glucose Value (mg/dL)'] = pd.to_numeric(glucose['Glucose Value (mg/dL)'], errors='coerce')

    # Remove rows with missing values
    glucose.dropna(inplace=True)

    # Convert the glucose value column to float data type
    glucose['Glucose Value (mg/dL)'] = glucose['Glucose Value (mg/dL)'].astype(float)

    # Rename the timestamp column for better readability
    glucose.rename(columns={'Timestamp (YYYY-MM-DDThh:mm:ss)': 'Timestamp'}, inplace=True)

    return glucose


def _preprocess_insulin_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Preprocess the DataFrame containing insulin data by removing nulls and renaming columns.

    Args:
        df: A Pandas DataFrame containing the insulin data.

    Returns:
        A preprocessed Pandas DataFrame containing insulin data.
    """
    # Select the relevant columns from the input DataFrame
    insulin = df[['Timestamp (YYYY-MM-DDThh:mm:ss)', 'Event Subtype', 'Insulin Value (u)']]

    # Convert the timestamp column to datetime format
    insulin['Timestamp (YYYY-MM-DDThh:mm:ss)'] = pd.to_datetime(insulin['Timestamp (YYYY-MM-DDThh:mm:ss)'])

    # Convert the insulin value column to numeric, setting any errors to NaN
    insulin['Insulin Value (u)'] = pd.to_numeric(insulin['Insulin Value (u)'], errors='coerce')

    # Remove rows with missing values
    insulin.dropna(inplace=True)

    # Convert the insulin value column to float data type
    insulin['Insulin Value (u)'] = insulin['Insulin Value (u)'].astype(float)

    # Rename the columns for better readability
    insulin.rename(
        columns={'Timestamp (YYYY-MM-DDThh:mm:ss)': 'Timestamp', 'Event Subtype': 'Type', 'Insulin Value (u)': 'Value'},
        inplace=True)

    return insulin