diff --git a/scripts/_business_etl.py b/scripts/_business_etl.py
index eaa8032..cb4a3bd 100644
--- a/scripts/_business_etl.py
+++ b/scripts/_business_etl.py
@@ -1,245 +1,12 @@
-# SPDX-License-Identifier: MIT
+"""Backwards-compatible shim for Track D ETL helpers.
 
-"""ETL helpers for Track D (Business).
+The repo uses ``scripts/_business_etl.py`` in Business Track chapters.
 
-Chapter 7: preparing accounting data for analysis.
-
-This module provides small, testable transformations that turn the synthetic
-general ledger into analysis-friendly (“tidy”) tables.
-
-Core idea:
-- A raw GL export typically has *two amount columns* (debit, credit).
-- Many analytic workflows prefer a *single signed amount* column.
-- "Signed" here means "positive when the account increases on its normal side".
-  (Assets/expenses normally increase with debits; liabilities/equity/revenue
-  normally increase with credits.)
+The installed package exposes the canonical implementation at
+``pystatsv1.trackd.etl``. This shim keeps existing imports working for students
+running scripts directly from the repo.
 """
 
 from __future__ import annotations
 
-from dataclasses import dataclass
-from typing import Any
-
-import numpy as np
-import pandas as pd
-
-
-@dataclass(frozen=True)
-class GLPrepOutputs:
-    """Outputs for Chapter 7 ETL."""
-
-    gl_tidy: pd.DataFrame
-    gl_monthly_summary: pd.DataFrame
-    summary: dict[str, Any]
-
-
-def _to_float(series: pd.Series) -> pd.Series:
-    return pd.to_numeric(series, errors="coerce").fillna(0.0).astype(float)
-
-
-def prepare_gl_tidy(gl_journal: pd.DataFrame, chart_of_accounts: pd.DataFrame) -> pd.DataFrame:
-    """Return a line-level tidy GL dataset.
-
-    Parameters
-    ----------
-    gl_journal:
-        Raw journal export with debit/credit columns.
-    chart_of_accounts:
-        COA mapping account_id -> account_name/account_type/normal_side.
-
-    Returns
-    -------
-    pd.DataFrame
-        A normalized table with one row per GL line, plus:
-        - joined account labels
-        - parsed dates + a month key
-        - `raw_amount = debit - credit` (debit-positive convention)
-        - `amount` where sign is aligned to the account's normal side
-    """
-
-    gl = gl_journal.copy()
-    coa = chart_of_accounts.copy()
-
-    gl["account_id"] = gl["account_id"].astype(str)
-    coa["account_id"] = coa["account_id"].astype(str)
-
-    coa_cols = ["account_id", "account_name", "account_type", "normal_side"]
-    out = gl.merge(
-        coa[coa_cols],
-        on="account_id",
-        how="left",
-        validate="many_to_one",
-        suffixes=("", "_coa"),
-    )
-
-    # If the GL already carried labels, keep them; otherwise fill from the COA.
-    for col in ("account_name", "account_type", "normal_side"):
-        rhs = f"{col}_coa"
-        if rhs in out.columns:
-            if col in out.columns:
-                out[col] = out[col].where(out[col].notna() & (out[col].astype(str) != ""), out[rhs])
-            else:
-                out[col] = out[rhs]
-            out = out.drop(columns=[rhs])
-
-
-    out["doc_id"] = out["doc_id"].astype(str)
-    out["description"] = out["description"].astype(str)
-
-    out["date"] = pd.to_datetime(out["date"], errors="coerce")
-    out["month"] = out["date"].dt.strftime("%Y-%m")
-
-    out["debit"] = _to_float(out.get("debit", 0.0))
-    out["credit"] = _to_float(out.get("credit", 0.0))
-
-    out["dc"] = np.where(out["debit"] > 0, "D", np.where(out["credit"] > 0, "C", ""))
-
-    # Debit-positive convention
-    out["raw_amount"] = out["debit"] - out["credit"]
-
-    # Signed-by-normal-side: positive means "account increased"
-    normal = out["normal_side"].astype(str).str.lower()
-    out["amount"] = np.where(normal.eq("credit"), -out["raw_amount"], out["raw_amount"])
-
-    # Stable row ids (helpful for downstream joins)
-    out = out.sort_values(["date", "txn_id", "account_id"], kind="mergesort").reset_index(drop=True)
-    out["line_no"] = out.groupby("txn_id").cumcount() + 1
-    out["gl_line_id"] = out["txn_id"].astype(str) + "-" + out["line_no"].astype(str)
-
-    cols = [
-        "gl_line_id",
-        "txn_id",
-        "line_no",
-        "date",
-        "month",
-        "doc_id",
-        "description",
-        "account_id",
-        "account_name",
-        "account_type",
-        "normal_side",
-        "dc",
-        "debit",
-        "credit",
-        "raw_amount",
-        "amount",
-    ]
-
-    # Keep any extra columns at the end (future-proof)
-    extra = [c for c in out.columns if c not in cols]
-    return out[cols + extra]
-
-
-def build_gl_tidy_dataset(gl: pd.DataFrame, coa: pd.DataFrame) -> pd.DataFrame:
-    """Backward-compatible alias for :func:`prepare_gl_tidy`.
-
-    Chapter 8 imports ``build_gl_tidy_dataset``.
-    Chapter 7 uses the canonical name ``prepare_gl_tidy``.
-    """
-
-    return prepare_gl_tidy(gl, coa)
-
-
-def prepare_gl_monthly_summary(gl_tidy: pd.DataFrame) -> pd.DataFrame:
-    """Monthly rollup of tidy GL.
-
-    Produces one row per (month, account) with debit/credit totals and a
-    signed net change (`net_change`) aligned to the account's normal side.
-    """
-
-    g = gl_tidy.copy()
-
-    group_cols = ["month", "account_id", "account_name", "account_type", "normal_side"]
-    out = (
-        g.groupby(group_cols, dropna=False)
-        .agg(
-            n_lines=("gl_line_id", "count"),
-            debit=("debit", "sum"),
-            credit=("credit", "sum"),
-            net_change=("amount", "sum"),
-        )
-        .reset_index()
-    )
-
-    out["debit"] = out["debit"].astype(float)
-    out["credit"] = out["credit"].astype(float)
-    out["net_change"] = out["net_change"].astype(float)
-
-    return out.sort_values(["month", "account_id"], kind="mergesort").reset_index(drop=True)
-
-
-
-
-def build_data_dictionary() -> dict[str, str]:
-    """A lightweight data dictionary for the Chapter 7 output tables.
-
-    This is intentionally small and human-readable (useful for docs + downstream
-    notebooks). It is *not* intended to be a formal metadata standard.
-    """
-
-    return {
-        # Keys used in gl_tidy.csv
-        "gl_line_id": "Stable line identifier (txn_id-line_no).",
-        "txn_id": "Journal transaction id (groups debit/credit lines for one event).",
-        "line_no": "Line number within txn_id (1..k).",
-        "date": "Journal posting date (YYYY-MM-DD).",
-        "month": "Month key derived from date (YYYY-MM).",
-        "doc_id": "Source document id (invoice, payroll run, bank transfer, etc.).",
-        "description": "Text description from the journal.",
-        "account_id": "Chart-of-accounts id.",
-        "account_name": "Chart-of-accounts account name.",
-        "account_type": "High-level account class (Asset, Liability, Equity, Revenue, Expense).",
-        "normal_side": "Normal balance side for the account (debit or credit).",
-        "debit": "Debit amount for the line (0 if none).",
-        "credit": "Credit amount for the line (0 if none).",
-        "dc": "D if debit>0, C if credit>0, blank if both are 0.",
-        "raw_amount": "Single-column amount in debit-positive convention: debit - credit.",
-        "amount": "Signed amount aligned to the account's normal side (positive means the account increased).",
-        # Keys used in gl_monthly_summary.csv
-        "n_lines": "Number of GL lines aggregated into the month/account group.",
-        "net_change": "Sum of `amount` in the month/account group.",
-    }
-
-
-def analyze_gl_preparation(gl_journal: pd.DataFrame, chart_of_accounts: pd.DataFrame) -> GLPrepOutputs:
-    """Compute Chapter 7 outputs + a small QC summary."""
-
-    gl_tidy = prepare_gl_tidy(gl_journal, chart_of_accounts)
-    monthly = prepare_gl_monthly_summary(gl_tidy)
-
-    n_lines = int(len(gl_tidy))
-    n_txns = int(gl_tidy["txn_id"].nunique()) if n_lines else 0
-    n_missing_accounts = int(gl_tidy["account_name"].isna().sum())
-    n_bad_dates = int(gl_tidy["date"].isna().sum())
-
-    # Basic accounting invariant: sum of raw debit-positive amounts should be ~0
-    raw_total = float(gl_tidy["raw_amount"].sum()) if n_lines else 0.0
-    gl_balances = bool(abs(raw_total) < 1e-6)
-
-    summary: dict[str, Any] = {
-        "checks": {
-            "gl_balances_raw_amount_sum_zero": gl_balances,
-            "coa_join_coverage_ok": n_missing_accounts == 0,
-            "no_missing_coa_mappings": n_missing_accounts == 0,
-            "all_gl_dates_parse": n_bad_dates == 0,
-            "no_unparseable_dates": n_bad_dates == 0,
-        },
-        "metrics": {
-            "n_gl_lines": n_lines,
-            "n_txns": n_txns,
-            "n_accounts": int(gl_tidy["account_id"].nunique()) if n_lines else 0,
-            "n_months": int(gl_tidy["month"].nunique()) if n_lines else 0,
-            "n_missing_coa_mappings": n_missing_accounts,
-            "n_bad_dates": n_bad_dates,
-            "raw_amount_sum": raw_total,
-        },
-        "data_dictionary": build_data_dictionary(),
-        "notes": {
-            "amount_definition": (
-                "amount is signed so positive means the account increased on its normal side; "
-                "raw_amount uses debit-positive convention (debit - credit)."
-            )
-        },
-    }
-
-    return GLPrepOutputs(gl_tidy=gl_tidy, gl_monthly_summary=monthly, summary=summary)
+from pystatsv1.trackd.etl import *  # noqa: F401,F403
diff --git a/src/pystatsv1/trackd/__init__.py b/src/pystatsv1/trackd/__init__.py
index e38722f..e1e9b3c 100644
--- a/src/pystatsv1/trackd/__init__.py
+++ b/src/pystatsv1/trackd/__init__.py
@@ -18,6 +18,15 @@
     validate_schema,
 )
 
+from .etl import (
+    GLPrepOutputs,
+    analyze_gl_preparation,
+    build_data_dictionary,
+    build_gl_tidy_dataset,
+    prepare_gl_monthly_summary,
+    prepare_gl_tidy,
+)
+
 __all__ = [
     "DataFrame",
     "DataFrames",
@@ -30,4 +39,10 @@
     "TrackDSchemaError",
     "read_csv_required",
     "validate_schema",
+    "GLPrepOutputs",
+    "prepare_gl_tidy",
+    "build_gl_tidy_dataset",
+    "prepare_gl_monthly_summary",
+    "build_data_dictionary",
+    "analyze_gl_preparation",
 ]
diff --git a/src/pystatsv1/trackd/etl.py b/src/pystatsv1/trackd/etl.py
new file mode 100644
index 0000000..ec6d73b
--- /dev/null
+++ b/src/pystatsv1/trackd/etl.py
@@ -0,0 +1,341 @@
+"""Track D ETL helpers.
+
+This module mirrors the public API of ``scripts/_business_etl.py``.
+
+Rationale
+---------
+We want chapter/workbook code to be able to import a stable implementation from
+the installed package (``pystatsv1.trackd``) while keeping the repo-local
+``scripts/_business_etl.py`` as a thin, backwards-compatible shim.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any
+
+import numpy as np
+import pandas as pd
+
+
+@dataclass(frozen=True)
+class GLPrepOutputs:
+    """Frozen container for the Chapter 7 ETL outputs: tidy GL, monthly rollup, summary dict."""
+
+    gl_tidy: pd.DataFrame
+    gl_monthly_summary: pd.DataFrame
+    summary: dict[str, Any]
+
+
+def _to_float(series: pd.Series) -> pd.Series:
+    """Coerce *series* to float; values that fail numeric parsing or are missing become 0.0."""
+    return pd.to_numeric(series, errors="coerce").fillna(0.0).astype(float)
+
+
+def _infer_normal_side(account_type: pd.Series) -> pd.Series:
+    """Infer a normal side (debit/credit) from an account type label.
+
+    This is intentionally forgiving: simplified student datasets often only include
+    ``account_type`` (e.g., Asset, Liability, Revenue, Expense, Equity).
+
+    If a label contains ``contra``, we flip the inferred side (e.g., contra-asset
+    is normally credit). Unknown/blank types yield an empty string.
+    """
+
+    s = account_type.astype(str).str.strip().str.lower()
+    is_contra = s.str.contains("contra")
+    base = s.str.replace("contra", "", regex=False).str.strip()
+
+    # Broad, case-insensitive buckets
+    side = pd.Series("", index=account_type.index, dtype="object")
+    side = side.mask(base.str.contains("asset"), "debit")
+    side = side.mask(base.str.contains("expense"), "debit")
+    side = side.mask(base.str.contains("liabil"), "credit")
+    side = side.mask(base.str.contains("equity"), "credit")
+    side = side.mask(base.str.contains("revenue") | base.str.contains("income"), "credit")
+
+    # Flip contra accounts in one step, reading only the pre-flip sides; two chained
+    # masks would turn debit -> credit and then immediately back to debit.
+    flipped = side.map({"debit": "credit", "credit": "debit", "": ""})
+    return side.mask(is_contra, flipped)
+
+
+def prepare_gl_tidy(gl_journal: pd.DataFrame, chart_of_accounts: pd.DataFrame) -> pd.DataFrame:
+    """Return a line-level tidy GL dataset.
+
+    Parameters
+    ----------
+    gl_journal:
+        Raw journal export with debit/credit columns, or the minimal (dc, amount) pair.
+    chart_of_accounts:
+        COA mapping account_id -> account_name/account_type/normal_side; only account_id is required.
+
+    Returns
+    -------
+    pd.DataFrame
+        A normalized table with one row per GL line, plus:
+        - joined account labels
+        - parsed dates + a month key
+        - `raw_amount = debit - credit` (debit-positive convention)
+        - `amount` where sign is aligned to the account's normal side
+    """
+
+    gl = gl_journal.copy()
+    coa = chart_of_accounts.copy()
+
+    gl["account_id"] = gl["account_id"].astype(str)
+    coa["account_id"] = coa["account_id"].astype(str)
+
+    # `normal_side` is optional in simplified COA exports; infer it from account_type when absent.
+    if "normal_side" not in coa.columns:
+        coa["normal_side"] = _infer_normal_side(
+            coa.get("account_type", pd.Series("", index=coa.index))
+        )
+    else:
+        mask = coa["normal_side"].isna() | (coa["normal_side"].astype(str).str.strip() == "")
+        if mask.any():
+            inferred = _infer_normal_side(coa.get("account_type", pd.Series("", index=coa.index)))
+            coa.loc[mask, "normal_side"] = inferred.loc[mask]
+
+    # Be forgiving: some templates may omit label columns; fill with empty strings.
+    for col in ("account_name", "account_type"):
+        if col not in coa.columns:
+            coa[col] = ""
+
+    coa_cols = ["account_id", "account_name", "account_type", "normal_side"]
+    # many_to_one: a COA with duplicate account_id fails loudly instead of duplicating GL lines.
+    out = gl.merge(
+        coa[coa_cols],
+        on="account_id",
+        how="left",
+        validate="many_to_one",
+        suffixes=("", "_coa"),
+    )
+
+    # If the GL already carried labels, keep them; otherwise fill from the COA.
+    for col in ("account_name", "account_type", "normal_side"):
+        rhs = f"{col}_coa"
+        if rhs in out.columns:
+            if col in out.columns:
+                out[col] = out[col].where(out[col].notna() & (out[col].astype(str) != ""), out[rhs])
+            else:
+                out[col] = out[rhs]
+            out = out.drop(columns=[rhs])
+
+    # If still missing, infer from account_type (useful for BYOD or minimal COA inputs).
+    mask = out["normal_side"].isna() | (out["normal_side"].astype(str).str.strip() == "")
+    if mask.any():
+        out.loc[mask, "normal_side"] = _infer_normal_side(
+            out.get("account_type", pd.Series("", index=out.index))
+        ).loc[mask]
+
+
+    # Be forgiving for simplified inputs: allow missing txn_id/doc_id/description.
+    if "txn_id" not in out.columns:
+        if "doc_id" in out.columns:
+            out["txn_id"] = out["doc_id"].astype(str)
+        else:
+            out["txn_id"] = pd.Series(range(1, len(out) + 1), index=out.index).astype(str)
+    else:
+        out["txn_id"] = out["txn_id"].astype(str)
+
+    if "doc_id" not in out.columns:
+        out["doc_id"] = out["txn_id"].astype(str)
+    out["doc_id"] = out["doc_id"].astype(str)
+
+    if "description" not in out.columns:
+        out["description"] = ""
+    out["description"] = out["description"].astype(str)
+
+    out["date"] = pd.to_datetime(out["date"], errors="coerce")
+    out["month"] = out["date"].dt.strftime("%Y-%m")
+
+    had_debit = "debit" in out.columns
+    had_credit = "credit" in out.columns
+    had_dc = "dc" in out.columns
+    had_amount = "amount" in out.columns
+
+    dc_input = out["dc"].astype(str).str.strip().str.upper() if had_dc else pd.Series("", index=out.index)
+    amount_input = _to_float(out["amount"]) if had_amount else pd.Series(0.0, index=out.index)
+
+    if "debit" not in out.columns:
+        out["debit"] = 0.0
+    if "credit" not in out.columns:
+        out["credit"] = 0.0
+
+    out["debit"] = _to_float(out["debit"])
+    out["credit"] = _to_float(out["credit"])
+
+    # Support the minimal (dc, amount) input format by materializing debit/credit.
+    if (not had_debit and not had_credit) and had_dc and had_amount:
+        mask_d = dc_input.eq("D")
+        mask_c = dc_input.eq("C")
+        out.loc[mask_d, "debit"] = amount_input.loc[mask_d]
+        out.loc[mask_c, "credit"] = amount_input.loc[mask_c]
+
+    # Prefer provided dc, else infer from debit/credit. Build the inferred labels as an
+    # index-aligned Series: assigning a full-length array through a partial boolean
+    # mask raises ValueError when only some rows lack a usable dc.
+    inferred_dc = pd.Series(
+        np.where(out["debit"] > 0, "D", np.where(out["credit"] > 0, "C", "")), index=out.index
+    )
+    out["dc"] = dc_input.where(dc_input.isin(["D", "C"]), inferred_dc)
+
+    # Debit-positive convention
+    out["raw_amount"] = out["debit"] - out["credit"]
+
+    # Signed-by-normal-side: positive means "account increased"
+    normal = out["normal_side"].astype(str).str.lower()
+    out["amount"] = np.where(normal.eq("credit"), -out["raw_amount"], out["raw_amount"])
+
+    # Stable row ids (helpful for downstream joins)
+    out = out.sort_values(["date", "txn_id", "account_id"], kind="mergesort").reset_index(drop=True)
+    out["line_no"] = out.groupby("txn_id").cumcount() + 1
+    out["gl_line_id"] = out["txn_id"].astype(str) + "-" + out["line_no"].astype(str)
+
+    cols = [
+        "gl_line_id",
+        "txn_id",
+        "line_no",
+        "date",
+        "month",
+        "doc_id",
+        "description",
+        "account_id",
+        "account_name",
+        "account_type",
+        "normal_side",
+        "dc",
+        "debit",
+        "credit",
+        "raw_amount",
+        "amount",
+    ]
+
+    # Keep any extra columns at the end (future-proof)
+    extra = [c for c in out.columns if c not in cols]
+    return out[cols + extra]
+
+
+def build_gl_tidy_dataset(gl: pd.DataFrame, coa: pd.DataFrame) -> pd.DataFrame:
+    """Delegate to :func:`prepare_gl_tidy` under its legacy name.
+
+    Kept so Chapter 8 code importing ``build_gl_tidy_dataset`` keeps working;
+    ``prepare_gl_tidy`` (used by Chapter 7) is the canonical entry point.
+    """
+    tidy = prepare_gl_tidy(gl_journal=gl, chart_of_accounts=coa)
+    return tidy
+
+
+def prepare_gl_monthly_summary(gl_tidy: pd.DataFrame) -> pd.DataFrame:
+    """Aggregate the tidy GL to one row per (month, account).
+
+    Each row carries the line count, the debit and credit totals, and
+    ``net_change`` (the sum of the normal-side-signed ``amount``).
+    """
+
+    keys = ["month", "account_id", "account_name", "account_type", "normal_side"]
+    aggregations = {
+        "n_lines": ("gl_line_id", "count"),
+        "debit": ("debit", "sum"),
+        "credit": ("credit", "sum"),
+        "net_change": ("amount", "sum"),
+    }
+
+    # dropna=False keeps lines whose month or labels are missing in the rollup.
+    grouped = gl_tidy.copy().groupby(keys, dropna=False)
+    rollup = grouped.agg(**aggregations).reset_index()
+
+    # Totals are always reported as floats, even for integer inputs.
+    for total in ("debit", "credit", "net_change"):
+        rollup[total] = rollup[total].astype(float)
+
+    # Stable sort, then renumber rows from zero.
+    ordered = rollup.sort_values(["month", "account_id"], kind="mergesort")
+    return ordered.reset_index(drop=True)
+
+
+
+
+def build_data_dictionary() -> dict[str, str]:
+    """A lightweight data dictionary for the Chapter 7 output tables.
+
+    This is intentionally small and human-readable (useful for docs + downstream
+    notebooks). It is *not* intended to be a formal metadata standard.
+    """
+
+    return {
+        # Keys used in gl_tidy.csv
+        "gl_line_id": "Stable line identifier (txn_id-line_no).",
+        "txn_id": "Journal transaction id (groups debit/credit lines for one event).",
+        "line_no": "Line number within txn_id (1..k).",
+        "date": "Journal posting date (YYYY-MM-DD).",
+        "month": "Month key derived from date (YYYY-MM).",
+        "doc_id": "Source document id (invoice, payroll run, bank transfer, etc.).",
+        "description": "Text description from the journal.",
+        "account_id": "Chart-of-accounts id.",
+        "account_name": "Chart-of-accounts account name.",
+        "account_type": "High-level account class (Asset, Liability, Equity, Revenue, Expense).",
+        "normal_side": "Normal balance side for the account (debit or credit).",
+        "debit": "Debit amount for the line (0 if none).",
+        "credit": "Credit amount for the line (0 if none).",
+        "dc": "D or C from the journal when provided; otherwise D if debit>0, C if credit>0, blank if both are 0.",
+        "raw_amount": "Single-column amount in debit-positive convention: debit - credit.",
+        "amount": "Signed amount aligned to the account's normal side (positive means the account increased).",
+        # Keys used in gl_monthly_summary.csv
+        "n_lines": "Number of GL lines aggregated into the month/account group.",
+        "net_change": "Sum of `amount` in the month/account group.",
+    }
+
+
+def analyze_gl_preparation(gl_journal: pd.DataFrame, chart_of_accounts: pd.DataFrame) -> GLPrepOutputs:
+    """Run the Chapter 7 ETL and bundle its tables with a small QC summary."""
+
+    tidy = prepare_gl_tidy(gl_journal, chart_of_accounts)
+    rollup = prepare_gl_monthly_summary(tidy)
+
+    line_count = int(len(tidy))
+    unmapped = int(tidy["account_name"].isna().sum())
+    unparsed = int(tidy["date"].isna().sum())
+    # Double-entry invariant: debit-positive raw amounts should net to ~0.
+    raw_sum = float(tidy["raw_amount"].sum()) if line_count else 0.0
+
+    # Several checks are published under two keys; both spellings are kept.
+    checks = {
+        "gl_balances_raw_amount_sum_zero": bool(abs(raw_sum) < 1e-6),
+        "coa_join_coverage_ok": unmapped == 0,
+        "no_missing_coa_mappings": unmapped == 0,
+        "all_gl_dates_parse": unparsed == 0,
+        "no_unparseable_dates": unparsed == 0,
+    }
+    metrics = {
+        "n_gl_lines": line_count,
+        "n_txns": int(tidy["txn_id"].nunique()) if line_count else 0,
+        "n_accounts": int(tidy["account_id"].nunique()) if line_count else 0,
+        "n_months": int(tidy["month"].nunique()) if line_count else 0,
+        "n_missing_coa_mappings": unmapped,
+        "n_bad_dates": unparsed,
+        "raw_amount_sum": raw_sum,
+    }
+    amount_note = (
+        "amount is signed so positive means the account increased on its normal side; "
+        "raw_amount uses debit-positive convention (debit - credit)."
+    )
+
+    summary: dict[str, Any] = {
+        "checks": checks,
+        "metrics": metrics,
+        "data_dictionary": build_data_dictionary(),
+        "notes": {"amount_definition": amount_note},
+    }
+
+    return GLPrepOutputs(gl_tidy=tidy, gl_monthly_summary=rollup, summary=summary)
+
+__all__ = [
+    "GLPrepOutputs",
+    "prepare_gl_tidy",
+    "build_gl_tidy_dataset",
+    "prepare_gl_monthly_summary",
+    "build_data_dictionary",
+    "analyze_gl_preparation",
+]
diff --git a/tests/test_trackd_etl_dedupe_and_keys.py b/tests/test_trackd_etl_dedupe_and_keys.py
new file mode 100644
index 0000000..7c01be7
--- /dev/null
+++ b/tests/test_trackd_etl_dedupe_and_keys.py
@@ -0,0 +1,118 @@
+from __future__ import annotations
+
+import pandas as pd
+
+from pystatsv1.trackd.etl import analyze_gl_preparation, prepare_gl_monthly_summary, prepare_gl_tidy
+
+
+def _sample_inputs() -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Smallest useful GL + COA inputs.
+
+    - GL journal uses (txn_id, date, description, account_id, dc, amount)
+    - Chart of accounts omits normal_side, which ETL should infer from account_type
+    - doc_id/line_no are also omitted and should be synthesized safely
+    """
+
+    gl_journal = pd.DataFrame(
+        [
+            {
+                "txn_id": "t1",
+                "date": "2025-01-05",
+                "description": "Sale (cash)",
+                "account_id": "1000",
+                "dc": "D",
+                "amount": 100.0,
+            },
+            {
+                "txn_id": "t1",
+                "date": "2025-01-05",
+                "description": "Sale (cash)",
+                "account_id": "4000",
+                "dc": "C",
+                "amount": 100.0,
+            },
+        ]
+    )
+
+    chart_of_accounts = pd.DataFrame(
+        [
+            {"account_id": "1000", "account_name": "Cash", "account_type": "Asset"},
+            {
+                "account_id": "4000",
+                "account_name": "Sales Revenue",
+                "account_type": "Revenue",
+            },
+        ]
+    )
+
+    return gl_journal, chart_of_accounts
+
+
+def test_prepare_gl_tidy_builds_stable_gl_line_id_and_signs() -> None:
+    gl_journal, chart_of_accounts = _sample_inputs()
+
+    gl_tidy = prepare_gl_tidy(gl_journal=gl_journal, chart_of_accounts=chart_of_accounts)
+
+    # stable deterministic line IDs (txn_id + within-txn line number)
+    assert set(gl_tidy["gl_line_id"]) == {"t1-1", "t1-2"}
+
+    # raw_amount is debit - credit; amount is sign-aligned to account normal_side
+    assert list(gl_tidy["raw_amount"]) == [100.0, -100.0]
+    assert list(gl_tidy["amount"]) == [100.0, 100.0]
+
+    # debit/credit columns should be populated consistently
+    assert list(gl_tidy["debit"]) == [100.0, 0.0]
+    assert list(gl_tidy["credit"]) == [0.0, 100.0]
+
+    # ETL should infer normal_side from account_type when missing
+    assert list(gl_tidy["normal_side"]) == ["debit", "credit"]
+
+    # doc_id should be synthesized when missing (defaults to txn_id)
+    assert list(gl_tidy["doc_id"]) == ["t1", "t1"]
+
+
+def test_prepare_gl_monthly_summary_keys_unique_and_expected_totals() -> None:
+    gl_journal, chart_of_accounts = _sample_inputs()
+    gl_tidy = prepare_gl_tidy(gl_journal=gl_journal, chart_of_accounts=chart_of_accounts)
+
+    monthly = prepare_gl_monthly_summary(gl_tidy)
+
+    # expected contract (subset)
+    for col in [
+        "month",
+        "account_id",
+        "account_name",
+        "account_type",
+        "normal_side",
+        "debit",
+        "credit",
+        "net_change",
+        "n_lines",
+    ]:
+        assert col in monthly.columns
+
+    # unique grouping keys
+    assert (
+        monthly[["month", "account_id", "normal_side"]].duplicated().sum() == 0
+    )
+
+    # one row per account in the month
+    assert set(monthly["account_id"]) == {"1000", "4000"}
+
+    # net_change is positive for both accounts after normal-side alignment
+    m = monthly.set_index("account_id")
+    assert float(m.loc["1000", "net_change"]) == 100.0
+    assert float(m.loc["4000", "net_change"]) == 100.0
+
+
+def test_analyze_gl_preparation_returns_outputs_and_summary_counts() -> None:
+    gl_journal, chart_of_accounts = _sample_inputs()
+
+    outputs = analyze_gl_preparation(gl_journal=gl_journal, chart_of_accounts=chart_of_accounts)
+
+    assert outputs.gl_tidy.shape[0] == 2
+    assert outputs.gl_monthly_summary.shape[0] == 2
+
+    assert outputs.summary["metrics"]["n_gl_lines"] == 2
+    # raw amounts should balance to zero (a valid double-entry transaction)
+    assert outputs.summary["checks"]["gl_balances_raw_amount_sum_zero"] is True