diff --git a/.gitignore b/.gitignore
index fa30cb2fb..748573f3e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,4 @@
 index.py
-env/
\ No newline at end of file
+env/
+__pycache__/
+results/
\ No newline at end of file
diff --git a/src/clean_data.py b/src/clean_data.py
index da613640a..12e5a5fb8 100644
--- a/src/clean_data.py
+++ b/src/clean_data.py
@@ -1,3 +1,6 @@
+import pandas as pd
+
+
 def clean_sensor_data(df: pd.DataFrame) -> pd.DataFrame:
     """
     Clean sensor data by handling missing or invalid values.
@@ -5,3 +8,19 @@ def clean_sensor_data(df: pd.DataFrame) -> pd.DataFrame:
     Returns:
         pd.DataFrame: Cleaned data.
     """
+
+    cleaned = df.copy()
+
+    # Remove duplicate rows that may come from repeated exports.
+    cleaned = cleaned.drop_duplicates().reset_index(drop=True)
+
+    text_columns = cleaned.select_dtypes(include="object").columns
+    cleaned[text_columns] = cleaned[text_columns].apply(lambda col: col.str.strip())
+
+    # Make sure numeric columns are floats so we can compare them.
+    numeric_columns = ["pH", "turbidity", "dissolved_oxygen", "temperature"]
+    for column in numeric_columns:
+        if column in cleaned.columns:
+            cleaned[column] = pd.to_numeric(cleaned[column], errors="coerce")
+
+    return cleaned
diff --git a/src/evaluate.py b/src/evaluate.py
index 006256224..6e4188b46 100644
--- a/src/evaluate.py
+++ b/src/evaluate.py
@@ -1,3 +1,5 @@
+import pandas as pd
+
 class WaterQualityEvaluator:
     def __init__(self, ph_range=(6.5, 8.5), turbidity_threshold=1.0):
         self.ph_range = ph_range
@@ -7,3 +9,32 @@ def is_safe(self, row: pd.Series) -> bool:
         """
         Determine if a row of water data is safe.
         """
+        ph_min, ph_max = self.ph_range
+        ph_value = row.get("pH")
+        turbidity_value = row.get("turbidity")
+
+        # A missing pH or turbidity reading is treated as unsafe.
+        if pd.isna(ph_value) or pd.isna(turbidity_value):
+            return False
+        if not (ph_min <= ph_value <= ph_max):
+            return False
+        if turbidity_value > self.turbidity_threshold:
+            return False
+        return True
+
+    def evaluate(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Evaluate every row of a DataFrame with `is_safe`.
+
+        Args:
+            df (pd.DataFrame): Readings to evaluate; not modified.
+
+        Returns:
+            pd.DataFrame: Copy of `df` with an added boolean `is_safe` column.
+        """
+        evaluated = df.copy()
+        # A list comprehension (rather than DataFrame.apply) keeps the
+        # result a flat list of booleans even when `df` has no rows.
+        flags = [self.is_safe(row) for _, row in df.iterrows()]
+        evaluated["is_safe"] = pd.Series(flags, index=df.index, dtype=bool)
+        return evaluated
diff --git a/src/load_data.py b/src/load_data.py
index c0126703a..32a6be265 100644
--- a/src/load_data.py
+++ b/src/load_data.py
@@ -1,3 +1,6 @@
+import pandas as pd
+
+
 def load_csv(filepath: str) -> pd.DataFrame:
     """
     Load sensor data from a CSV file.
@@ -8,3 +11,5 @@ def load_csv(filepath: str) -> pd.DataFrame:
     Returns:
         pd.DataFrame: Loaded data as a pandas DataFrame.
     """
+
+    return pd.read_csv(filepath, parse_dates=["timestamp"])
diff --git a/src/main.py b/src/main.py
index e69de29bb..f84c7a35b 100644
--- a/src/main.py
+++ b/src/main.py
@@ -0,0 +1,42 @@
+import os
+
+from clean_data import clean_sensor_data
+from evaluate import WaterQualityEvaluator
+from load_data import load_csv
+
+INPUT_PATH = "data/sensor_data.csv"
+OUTPUT_PATH = "results/results.csv"
+ROWS_TO_SHOW = 5
+
+
+def main() -> None:
+    data = load_csv(INPUT_PATH)
+    cleaned = clean_sensor_data(data)
+
+    evaluator = WaterQualityEvaluator()
+    evaluated = evaluator.evaluate(cleaned)
+
+    os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
+    evaluated.to_csv(OUTPUT_PATH, index=False)
+
+    rows_to_show = evaluated.head(ROWS_TO_SHOW)
+    for _, row in rows_to_show.iterrows():
+        sensor = row.get("sensor_id", "Unknown sensor")
+        timestamp = row.get("timestamp", "Unknown time")
+        status = "✅ Safe" if row.get("is_safe") else "❌ Unsafe"
+        if status.startswith("❌") and row.get("issues"):
+            status = f"{status} ({row['issues']})"
+        print(f"Sensor {sensor} at {timestamp}: {status}")
+
+    total = len(evaluated)
+    safe = int(evaluated["is_safe"].sum())
+    unsafe = total - safe
+    print("\nSummary:")
+    print(f" Total readings analysed: {total}")
+    print(f" Safe readings: {safe}")
+    print(f" Unsafe readings: {unsafe}")
+    print(f"\nResults saved to {OUTPUT_PATH}")
+
+
+if __name__ == "__main__":
+    main()