-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_acquisition.py
More file actions
142 lines (99 loc) · 4.56 KB
/
data_acquisition.py
File metadata and controls
142 lines (99 loc) · 4.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
"""
Data acquisition module that loads glucose and insulin data from CSV files.
"""
from pathlib import Path
import pandas as pd
# Working directory captured once, when this module is imported; file names
# passed to read_csv_file are joined onto this path.
CURRENT_PATH_CWD = Path.cwd()
def read_csv_file(file_name: str) -> pd.DataFrame:
    """
    Read the input CSV file and return the DataFrame.

    The file name is joined onto CURRENT_PATH_CWD, and the first CSV column
    is used as the DataFrame index.

    Args:
        file_name: The name of the input CSV file.
    Returns:
        A Pandas DataFrame containing the CSV data.
    Raises:
        FileNotFoundError: If no file exists at the resolved path. This is a
            subclass of Exception, so callers that catch Exception still work.
        Exception: Any error raised by pandas.read_csv (for example on a
            corrupted file) propagates unchanged.
    """
    file_path = CURRENT_PATH_CWD / file_name

    # Fail early with a specific exception type and a message that shows the
    # fully resolved path, instead of a generic Exception.
    if not file_path.exists():
        raise FileNotFoundError(f'The expected input file name does not exist at path: {file_path}')

    return pd.read_csv(filepath_or_buffer=file_path, index_col=0)
def get_insulin_data(file_name: str) -> pd.DataFrame:
    """
    Load insulin records from a CSV file and return them in cleaned form.

    The file is read with read_csv_file and the result is passed through
    _preprocess_insulin_data.

    Args:
        file_name: The name of the input CSV file.
    Returns:
        The preprocessed Pandas DataFrame of insulin data.
    Raises:
        Exception: If the input CSV file is missing or cannot be read.
    """
    # Load the raw CSV, then hand it straight to the insulin preprocessing step
    return _preprocess_insulin_data(df=read_csv_file(file_name=file_name))
def get_glucose_data(file_name: str) -> pd.DataFrame:
    """
    Load glucose records from a CSV file and return them in cleaned form.

    The file is read with read_csv_file and the result is passed through
    _preprocess_glucose_data.

    Args:
        file_name: The name of the input CSV file.
    Returns:
        The preprocessed Pandas DataFrame of glucose data.
    Raises:
        Exception: If the input CSV file is missing or cannot be read.
    """
    # Load the raw CSV, then hand it straight to the glucose preprocessing step
    return _preprocess_glucose_data(df=read_csv_file(file_name=file_name))
def _preprocess_glucose_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Preprocess the DataFrame containing glucose data by removing nulls and renaming columns.

    The input frame is left untouched: the two relevant columns are copied
    before any conversion is applied.

    Args:
        df: A Pandas DataFrame containing the glucose data. It must have the
            columns 'Timestamp (YYYY-MM-DDThh:mm:ss)' and 'Glucose Value (mg/dL)'.
    Returns:
        A new Pandas DataFrame with the columns 'Timestamp' (datetime) and
        'Glucose Value (mg/dL)' (float). Rows whose timestamp is missing or
        whose glucose value is missing or non-numeric are dropped; the index
        labels of the remaining rows are preserved.
    Raises:
        KeyError: If one of the required columns is absent from df.
    """
    timestamp_col = 'Timestamp (YYYY-MM-DDThh:mm:ss)'
    value_col = 'Glucose Value (mg/dL)'

    # Take an explicit copy of the selection. Assigning columns on the bare
    # result of df[[...]] triggers pandas' SettingWithCopyWarning and depends
    # on ambiguous view-versus-copy semantics.
    glucose = df[[timestamp_col, value_col]].copy()

    glucose[timestamp_col] = pd.to_datetime(glucose[timestamp_col])

    # Non-numeric readings become NaN so that dropna() below removes them;
    # the float cast normalises an all-integer column to float as well.
    glucose[value_col] = pd.to_numeric(glucose[value_col], errors='coerce').astype(float)

    # dropna/rename return new frames, so no in-place mutation is needed.
    return glucose.dropna().rename(columns={timestamp_col: 'Timestamp'})
def _preprocess_insulin_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Preprocess the DataFrame containing insulin data by removing nulls and renaming columns.

    The input frame is left untouched: the three relevant columns are copied
    before any conversion is applied.

    Args:
        df: A Pandas DataFrame containing the insulin data. It must have the
            columns 'Timestamp (YYYY-MM-DDThh:mm:ss)', 'Event Subtype' and
            'Insulin Value (u)'.
    Returns:
        A new Pandas DataFrame with the columns 'Timestamp' (datetime),
        'Type' and 'Value' (float). Rows with a missing timestamp, a missing
        event subtype, or a missing or non-numeric insulin value are dropped;
        the index labels of the remaining rows are preserved.
    Raises:
        KeyError: If one of the required columns is absent from df.
    """
    timestamp_col = 'Timestamp (YYYY-MM-DDThh:mm:ss)'
    subtype_col = 'Event Subtype'
    value_col = 'Insulin Value (u)'

    # Take an explicit copy of the selection. Assigning columns on the bare
    # result of df[[...]] triggers pandas' SettingWithCopyWarning and depends
    # on ambiguous view-versus-copy semantics.
    insulin = df[[timestamp_col, subtype_col, value_col]].copy()

    insulin[timestamp_col] = pd.to_datetime(insulin[timestamp_col])

    # Non-numeric doses become NaN so that dropna() below removes them;
    # the float cast normalises an all-integer column to float as well.
    insulin[value_col] = pd.to_numeric(insulin[value_col], errors='coerce').astype(float)

    # dropna/rename return new frames, so no in-place mutation is needed.
    return insulin.dropna().rename(
        columns={timestamp_col: 'Timestamp', subtype_col: 'Type', value_col: 'Value'},
    )