Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
index.py
env/
env/
__pycache__/
results/
19 changes: 19 additions & 0 deletions src/clean_data.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,26 @@
import pandas as pd


def clean_sensor_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean sensor data by handling missing or invalid values.

    The input frame is left untouched. On a copy, this function:

    1. strips leading/trailing whitespace from string cells,
    2. converts the known measurement columns to numbers, turning anything
       unparseable into NaN,
    3. drops rows that are exact duplicates and renumbers the index.

    Args:
        df (pd.DataFrame): Raw sensor readings.

    Returns:
        pd.DataFrame: Cleaned data.
    """
    cleaned = df.copy()

    # Strip only real strings. Using the ``.str`` accessor here would turn
    # every non-string cell of a mixed-type column into NaN.
    text_columns = cleaned.select_dtypes(include=["object", "string"]).columns
    for column in text_columns:
        cleaned[column] = cleaned[column].map(
            lambda value: value.strip() if isinstance(value, str) else value
        )

    # Make sure numeric columns are floats so we can compare them.
    numeric_columns = ["pH", "turbidity", "dissolved_oxygen", "temperature"]
    for column in numeric_columns:
        if column in cleaned.columns:
            cleaned[column] = pd.to_numeric(cleaned[column], errors="coerce")

    # Deduplicate last: rows that differ only by padding or by how a number
    # was written ("7.0" vs "7.00") only become identical after the
    # normalisation above, e.g. when the same export was saved twice.
    return cleaned.drop_duplicates().reset_index(drop=True)
13 changes: 13 additions & 0 deletions src/evaluate.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import pandas as pd

class WaterQualityEvaluator:
def __init__(self, ph_range=(6.5, 8.5), turbidity_threshold=1.0):
self.ph_range = ph_range
def is_safe(self, row: pd.Series) -> bool:
    """
    Decide whether a single water reading is safe.

    A reading is safe only when both "pH" and "turbidity" are present,
    the pH lies inside ``self.ph_range`` (bounds inclusive), and the
    turbidity does not exceed ``self.turbidity_threshold``.

    Args:
        row (pd.Series): One reading; "pH" and "turbidity" are looked up
            by label.

    Returns:
        bool: True when the reading passes every check, otherwise False.
    """
    lower, upper = self.ph_range
    ph = row.get("pH")
    turbidity = row.get("turbidity")

    # A reading with either measurement missing is never considered safe.
    if pd.isna(ph) or pd.isna(turbidity):
        return False

    # Turbidity is only examined once the pH has been accepted.
    if lower <= ph <= upper:
        return not turbidity > self.turbidity_threshold
    return False
5 changes: 5 additions & 0 deletions src/load_data.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import pandas as pd


def load_csv(filepath: str) -> pd.DataFrame:
    """
    Read sensor readings from a CSV file into a DataFrame.

    The "timestamp" column is handed to pandas for date parsing while the
    file is read.

    Args:
        filepath (str): Location of the CSV file to read.

    Returns:
        pd.DataFrame: The file's contents as a pandas DataFrame.
    """
    date_columns = ["timestamp"]
    readings = pd.read_csv(filepath, parse_dates=date_columns)
    return readings
42 changes: 42 additions & 0 deletions src/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import os

from clean_data import clean_sensor_data
from evaluate import WaterQualityEvaluator
from load_data import load_csv

# CSV of raw sensor readings that main() loads.
INPUT_PATH = "data/sensor_data.csv"
# CSV that main() writes the evaluated readings to.
OUTPUT_PATH = "results/results.csv"
# How many of the first evaluated rows main() prints to the console.
ROWS_TO_SHOW = 5


def main() -> None:
    """
    Run the water-quality pipeline: load, clean, evaluate, save, report.

    Reads INPUT_PATH, writes the evaluated readings to OUTPUT_PATH, prints
    the first ROWS_TO_SHOW readings with their status, and finishes with a
    summary of safe and unsafe counts.
    """
    data = load_csv(INPUT_PATH)
    cleaned = clean_sensor_data(data)

    evaluator = WaterQualityEvaluator()
    # NOTE(review): the evaluator is only known to provide a per-row
    # is_safe(); confirm whether a DataFrame-level evaluate() exists. Use it
    # when present, otherwise build the "is_safe" column row by row so the
    # script does not die with AttributeError.
    evaluate = getattr(evaluator, "evaluate", None)
    if callable(evaluate):
        evaluated = evaluate(cleaned)
    else:
        evaluated = cleaned.copy()
        evaluated["is_safe"] = [
            evaluator.is_safe(row) for _, row in cleaned.iterrows()
        ]

    # os.makedirs("") raises, so only create a directory when the output
    # path actually has a directory component.
    output_dir = os.path.dirname(OUTPUT_PATH)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    evaluated.to_csv(OUTPUT_PATH, index=False)

    for _, row in evaluated.head(ROWS_TO_SHOW).iterrows():
        sensor = row.get("sensor_id", "Unknown sensor")
        timestamp = row.get("timestamp", "Unknown time")
        if row.get("is_safe"):
            status = "✅ Safe"
        else:
            status = "❌ Unsafe"
            issues = row.get("issues")
            # pandas stores a missing cell as float NaN, which is truthy;
            # treat it as "no issues recorded" instead of printing "(nan)".
            if isinstance(issues, float) and issues != issues:
                issues = None
            if issues:
                status = f"{status} ({issues})"
        print(f"Sensor {sensor} at {timestamp}: {status}")

    total = len(evaluated)
    safe = int(evaluated["is_safe"].sum())
    unsafe = total - safe
    print("\nSummary:")
    print(f" Total readings analysed: {total}")
    print(f" Safe readings: {safe}")
    print(f" Unsafe readings: {unsafe}")
    print(f"\nResults saved to {OUTPUT_PATH}")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()