area-estimator/python/area.py at main · bbl-dres/area-estimator · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
#!/usr/bin/env python3
"""
Step 4 — GWR Enrichment + Floor Area Estimation

Two responsibilities, both only used after volume calculation:

1. Enrich buildings with GWR (Federal Register of Buildings) classification
   — gkat, gklas, gbauj, gastw — via either a bulk CSV download (preferred,
   no network) or the swisstopo `find` REST endpoint as a single-call
   per-EGID fallback.

2. Convert building volume to gross floor area using building-type-specific
   floor heights, capped at the GWR `gastw` floor count when present.

Based on the Canton Zurich methodology (Seiler & Seiler GmbH, Dec 2020).

Uses height_minimal_m (volume / footprint) rather than height_mean_m for
floor count estimation, as it represents the equivalent uniform box height
and handles complex roof shapes more consistently.
"""

import json
import logging
import math
import time
import urllib.error
import urllib.parse
import urllib.request
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any, Optional

import pandas as pd

from volume import (
    STATUS_NO_FOOTPRINT,
    STATUS_NO_VOLUME,
    STATUS_SUCCESS,
    BuildingResult,
    append_warning,
)

# Module-level logger; all progress and diagnostic output in this file goes
# through it.
log = logging.getLogger(__name__)

# ── Constants ───────────────────────────────────────────────────────────────

# Refuse to estimate floors above this height (metres) — Roche Tower 1 (the
# tallest building in Switzerland) is 178 m, so anything taller is almost
# certainly bad data (a tower next to a much shorter building, or vegetation
# noise). Compared against the equivalent box height in estimate_floor_area.
# NOTE(review): Roche Tower 2 (Bau 2) is reportedly ~205 m — confirm whether
# the "tallest building" rationale and the 200 m cap are still appropriate.
HEIGHT_SANITY_CAP_M = 200

# Hard ceiling for floor count when GWR `gastw` is unavailable.
MAX_FLOORS_FALLBACK = 200

# GWR `find` REST endpoint tuning
#
# The swisstopo API has no documented bulk endpoint for attribute lookup by
# EGID — `/MapServer/<layer>/<id1,id2,...>` exists but requires the exact
# featureId-with-suffix format which we don't know without first calling
# `find`. So we parallelise the find calls instead. Benchmarked at
# ~3× speedup over sequential at 10 workers, with diminishing returns past
# 20. Bump GWR_API_MAX_WORKERS for very large runs if the API tolerates it.
GWR_API_MAX_WORKERS = 10         # thread-pool size used by enrich_with_gwr
GWR_API_TIMEOUT_S = 10           # per-request timeout passed to urlopen
GWR_API_WARN_THRESHOLD = 100     # warn the user above this many API calls

# Floor height lookup table.
# Format: code -> (GF_min, GF_max, UF_min, UF_max, schema, description)
# GF = ground floor, UF = upper floors. Today the four numbers are averaged
# down to a single representative floor height: get_floor_height averages
# GF/UF into a (min, max) pair and estimate_floor_area takes the midpoint.
#
# `schema` records which GWR field the code belongs to ('GKAT' or 'GKLAS');
# get_floor_height only accepts a row when the schema matches the field it
# was looked up from, so a GKAT code passed as gklas never matches.
#
# NOTE(review): GKAT 1020 has no row here although _ACCURACY_BY_GKAT lists
# it. A building with gkat=1020 and no usable gklas therefore falls through
# to DEFAULT_FLOOR_HEIGHT — confirm whether that omission is intentional.
FLOOR_HEIGHT_LOOKUP: "dict[int, tuple[float, float, float, float, str, str]]" = {
    # GKAT-based (category)
    1010: (2.70, 3.30, 2.70, 3.30, 'GKAT', 'Provisional shelter'),
    1030: (2.70, 3.30, 2.70, 3.30, 'GKAT', 'Residential with secondary use'),
    1040: (3.30, 3.70, 2.70, 3.70, 'GKAT', 'Partially residential'),
    1060: (3.30, 5.00, 3.00, 5.00, 'GKAT', 'Non-residential'),
    1080: (3.00, 4.00, 3.00, 4.00, 'GKAT', 'Special-purpose'),

    # GKLAS-based (class) — Residential
    1110: (2.70, 3.30, 2.70, 3.30, 'GKLAS', 'Single-family house'),
    1121: (2.70, 3.30, 2.70, 3.30, 'GKLAS', 'Two-family house'),
    1122: (2.70, 3.30, 2.70, 3.30, 'GKLAS', 'Multi-family house'),
    1130: (2.70, 3.30, 2.70, 3.30, 'GKLAS', 'Community residential'),

    # GKLAS — Hotels and Tourism
    1211: (3.30, 3.70, 3.00, 3.50, 'GKLAS', 'Hotel'),
    1212: (3.00, 3.50, 3.00, 3.50, 'GKLAS', 'Short-term accommodation'),

    # GKLAS — Commercial and Industrial
    1220: (3.40, 4.20, 3.40, 4.20, 'GKLAS', 'Office building'),
    1230: (3.40, 5.00, 3.40, 5.00, 'GKLAS', 'Wholesale and retail'),
    1231: (3.30, 4.00, 3.30, 4.00, 'GKLAS', 'Restaurants and bars'),
    1241: (4.00, 6.00, 4.00, 6.00, 'GKLAS', 'Stations and terminals'),
    1242: (2.80, 3.20, 2.80, 3.20, 'GKLAS', 'Parking garages'),
    1251: (4.00, 7.00, 4.00, 7.00, 'GKLAS', 'Industrial building'),
    1252: (3.50, 6.00, 3.50, 6.00, 'GKLAS', 'Tanks, silos, warehouses'),
    1261: (3.50, 5.00, 3.50, 5.00, 'GKLAS', 'Culture and leisure'),
    1262: (3.50, 5.00, 3.50, 5.00, 'GKLAS', 'Museums and libraries'),
    1263: (3.30, 4.00, 3.30, 4.00, 'GKLAS', 'Schools and universities'),
    1264: (3.30, 4.00, 3.30, 4.00, 'GKLAS', 'Hospitals and clinics'),
    1265: (3.00, 6.00, 3.00, 6.00, 'GKLAS', 'Sports halls'),
    1271: (3.50, 5.00, 3.50, 5.00, 'GKLAS', 'Agricultural buildings'),
    1272: (3.00, 6.00, 3.00, 6.00, 'GKLAS', 'Churches and religious buildings'),
    1273: (3.00, 4.00, 3.00, 4.00, 'GKLAS', 'Monuments and protected buildings'),
    1274: (3.00, 4.00, 3.00, 4.00, 'GKLAS', 'Other structures'),
}

# Fallback row (same layout as the table above) used when neither gklas nor
# gkat matches; its values equal the residential rows.
DEFAULT_FLOOR_HEIGHT = (2.70, 3.30, 2.70, 3.30, 'DEFAULT', 'Unknown/Fallback')

# Accuracy buckets written to `area_accuracy` (see determine_accuracy).
ACCURACY_HIGH = 'high'       # ±10-15% — residential
ACCURACY_MEDIUM = 'medium'   # ±15-25% — commercial/office
ACCURACY_LOW = 'low'         # ±25-40% — industrial, special, missing

# Step 4-specific status, written to `status_step4` when the equivalent box
# height exceeds HEIGHT_SANITY_CAP_M. The string embeds the cap value so it
# always tracks the constant. The other STATUS_* values are imported from
# volume.py at the top of this module, which also keeps
# `from area import STATUS_SUCCESS` working as a re-export for the tests.
STATUS_HEIGHT_EXCEEDS_CAP = f'height_exceeds_{HEIGHT_SANITY_CAP_M}m'


def _to_gwr_code(value: Any) -> Optional[int]:
    """
    Normalise a GWR code (gkat / gklas / gbauj / gastw) to ``int | None``.

    Accepts ints, floats (including NaN from pandas), strings, and None.
    Returns None for any value that can't be coerced to a clean int —
    including ``inf`` / ``-inf`` (which crash ``int()`` with OverflowError)
    and strings that parse to inf via ``float()``.
    """
    if value is None:
        return None
    if isinstance(value, float) and math.isnan(value):
        return None
    try:
        as_float = float(value)
        if not math.isfinite(as_float):
            return None
        return int(as_float)
    except (TypeError, ValueError, OverflowError):
        return None


def get_floor_height(
    gkat: Any, gklas: Any,
) -> tuple[float, float, str, str]:
    """
    Resolve the floor-height range for a building from its GWR codes.

    Candidates are tried from most to least specific: the GKLAS class code,
    then the GKAT category code, then ``DEFAULT_FLOOR_HEIGHT``. A table row
    only counts as a hit when its schema tag matches the field the code came
    from, so a GKAT code supplied as ``gklas`` is ignored.

    Returns:
        ``(floor_height_min, floor_height_max, source, description)``. The
        min/max values are the averages of the ground-floor and upper-floor
        bounds of the matched row; ``source`` is ``'GKLAS'``, ``'GKAT'`` or
        ``'DEFAULT'``.
    """
    for raw_code, schema in ((gklas, 'GKLAS'), (gkat, 'GKAT')):
        code = _to_gwr_code(raw_code)
        if code is None:
            continue
        row = FLOOR_HEIGHT_LOOKUP.get(code)
        if row is None or row[4] != schema:
            continue
        gf_min, gf_max, uf_min, uf_max, _, label = row
        return ((gf_min + uf_min) / 2, (gf_max + uf_max) / 2, schema, label)

    # Nothing matched — fall back to the default row.
    gf_min, gf_max, uf_min, uf_max, _, label = DEFAULT_FLOOR_HEIGHT
    return ((gf_min + uf_min) / 2, (gf_max + uf_max) / 2, 'DEFAULT', label)


# ── Per-GWR-code accuracy buckets ───────────────────────────────────────────
#
# Derived from the validation study at docs/Height Assumptions.md, which
# assigns a 5-level qualitative confidence (High / Medium-High / Medium /
# Low-Medium / Low) to each GWR code based on regulatory anchors (ArGV4,
# cantonal building codes), normative standards (SIA 2024, SIA 380/1),
# and construction practice. We collapse the 5 levels into our 3-level
# bucket *conservatively* — only the study's "High" maps to our `high`,
# and any "Low" or "Low-Medium" maps to `low`. Both half-steps therefore
# round *down*: "Medium-High" becomes `medium` and "Low-Medium" becomes
# `low`. This makes `high` trustworthy and `low` inclusive.
#
# Every code in FLOOR_HEIGHT_LOOKUP has an entry below — verified by
# tests/test_area.py::test_accuracy_dicts_cover_every_lookup_code.
# Codes outside this list (e.g. a future GWR revision) fall through to
# the catch-all in determine_accuracy().
#
# **If you change these mappings, also update the per-code table in
# python/README.md ("How `area_accuracy` is computed" section).** The
# README is one-time-written documentation and silently drifts otherwise.

# GKLAS (building class) code -> accuracy bucket. determine_accuracy consults
# this mapping first; the trailing comment on each row gives the building
# type and the study's original confidence level before collapsing.
_ACCURACY_BY_GKLAS = {
    # Residential 11xx — High confidence (best-supported in the study)
    1110: ACCURACY_HIGH,    # Single-family house        — High
    1121: ACCURACY_HIGH,    # Two-family house           — High
    1122: ACCURACY_HIGH,    # Multi-family house         — High
    1130: ACCURACY_MEDIUM,  # Community residential      — Medium-High

    # Hotels / Tourism
    1211: ACCURACY_MEDIUM,  # Hotel                      — Medium
    1212: ACCURACY_MEDIUM,  # Short-term accommodation   — Medium

    # Commercial / Office
    1220: ACCURACY_MEDIUM,  # Office building            — Medium-High
    1230: ACCURACY_MEDIUM,  # Wholesale and retail       — Medium
    1231: ACCURACY_MEDIUM,  # Restaurants and bars       — Medium
    1241: ACCURACY_LOW,     # Stations and terminals     — Low-Medium
    1242: ACCURACY_MEDIUM,  # Parking garages            — Medium

    # Industrial
    1251: ACCURACY_MEDIUM,  # Industrial building        — Medium-High (ArGV4 anchors lower bound)
    1252: ACCURACY_LOW,     # Tanks, silos, warehouses   — Low-Medium

    # Cultural / Public
    1261: ACCURACY_LOW,     # Culture and leisure        — Low-Medium
    1262: ACCURACY_LOW,     # Museums and libraries      — Low-Medium
    1263: ACCURACY_HIGH,    # Schools and universities   — High (well-supported)
    1264: ACCURACY_MEDIUM,  # Hospitals and clinics      — Medium-High
    1265: ACCURACY_MEDIUM,  # Sports halls               — Medium

    # Special / Heritage
    1271: ACCURACY_LOW,     # Agricultural buildings     — Low-Medium
    1272: ACCURACY_LOW,     # Churches and religious     — Low (range too narrow for naves)
    1273: ACCURACY_LOW,     # Monuments and protected    — Low (heterogeneous by definition)
    1274: ACCURACY_LOW,     # Other structures           — Low (catch-all)
}

# GKAT (building category) code -> accuracy bucket. determine_accuracy falls
# back to this mapping when the GKLAS code is absent from _ACCURACY_BY_GKLAS.
#
# 1020 is listed here but has no row in FLOOR_HEIGHT_LOOKUP. When
# determine_accuracy is called with floor_height_source='DEFAULT' (as
# estimate_floor_area does for an unmatched code), it returns LOW before this
# mapping is read, so the 1020 entry only takes effect for callers that do
# not pass that source.
_ACCURACY_BY_GKAT = {
    1010: ACCURACY_MEDIUM,  # Provisional shelter            — Medium
    1020: ACCURACY_HIGH,    # Residential single-house parent — High (residential)
    1030: ACCURACY_HIGH,    # Residential w/ secondary use   — High (well-anchored in regulation)
    1040: ACCURACY_MEDIUM,  # Partially residential          — Medium-High
    1060: ACCURACY_MEDIUM,  # Non-residential                — Medium
    1080: ACCURACY_LOW,     # Special-purpose                — Low-Medium
}


def determine_accuracy(
    gkat: Any,
    gklas: Any,
    has_volume: bool,
    has_footprint: bool,
    floor_height_source: Optional[str] = None,
) -> str:
    """
    Classify an area estimate as ``'high'``, ``'medium'`` or ``'low'`` accuracy.

    Three short-circuits return LOW before any per-code lookup:

    - volume or footprint is missing (nothing to be confident about);
    - ``floor_height_source == 'DEFAULT'``, i.e. the floor-height lookup
      fell through to the residential default because the GWR code wasn't
      in ``FLOOR_HEIGHT_LOOKUP`` — this keeps the bucket consistent with
      the warning ``estimate_floor_area`` appends on the same path;
    - neither ``gkat`` nor ``gklas`` is a usable code.

    Otherwise the lookup order mirrors ``get_floor_height``: the specific
    GKLAS class first (``_ACCURACY_BY_GKLAS``), then the broader GKAT
    category (``_ACCURACY_BY_GKAT``). Both mappings are derived from the
    validation study at ``docs/Height Assumptions.md``. Codes found in
    neither mapping get MEDIUM.
    """
    if not (has_volume and has_footprint):
        return ACCURACY_LOW
    if floor_height_source == 'DEFAULT':
        return ACCURACY_LOW

    class_code = _to_gwr_code(gklas)
    category_code = _to_gwr_code(gkat)
    if class_code is None and category_code is None:
        return ACCURACY_LOW

    # Most specific mapping first; a None code is simply never a key.
    candidates = (
        (class_code, _ACCURACY_BY_GKLAS),
        (category_code, _ACCURACY_BY_GKAT),
    )
    for code, mapping in candidates:
        if code in mapping:
            return mapping[code]

    # Codes outside the validated mappings (e.g. a future GWR revision):
    # medium neither over- nor under-promises on something uncharacterised.
    return ACCURACY_MEDIUM


def _is_missing(value: Any) -> bool:
    """True if value is None or NaN — used to detect upstream gaps."""
    if value is None:
        return True
    if isinstance(value, float) and math.isnan(value):
        return True
    return False


def estimate_floor_area(volume_result: dict) -> BuildingResult:
    """
    Estimate gross floor area for a single building from its volume result.

    Uses ``height_minimal_m`` (volume / footprint) for floor count estimation
    — it represents the equivalent uniform box height and is more robust for
    complex building shapes than ``height_mean_m``.

    Args:
        volume_result: Dict from ``volume.calculate_building_volume()``,
            optionally enriched with ``gkat``, ``gklas``, ``gastw`` from
            ``area.enrich_with_gwr()``. It is not mutated.

    Returns:
        A new dict with the original keys plus the area-estimation fields
        (``area_floor_total_m2``, ``area_accuracy``, ``floors_estimated``,
        ``floor_height_used_m``, ``floor_height_source``, ``building_type``,
        ``status_step4``). On a skip path only ``status_step4`` is set
        (``STATUS_NO_FOOTPRINT``, ``STATUS_NO_VOLUME`` or
        ``STATUS_HEIGHT_EXCEEDS_CAP``) and the other fields stay ``None``.
    """
    result = dict(volume_result)

    # Initialise area fields up front so the schema is consistent across
    # success and skip paths.
    result.update({
        'area_floor_total_m2': None,
        'area_accuracy': None,
        'floors_estimated': None,
        'floor_height_used_m': None,
        'floor_height_source': None,
        'building_type': None,
        'status_step4': None,
    })

    footprint = result.get('area_footprint_m2')
    volume = result.get('volume_above_ground_m3')
    height_minimal = result.get('height_minimal_m')

    # Distinguish "no footprint" from "no volume" so downstream filtering
    # can tell them apart.
    if _is_missing(footprint) or footprint <= 0:
        result['status_step4'] = STATUS_NO_FOOTPRINT
        return result
    if _is_missing(volume) or volume <= 0:
        result['status_step4'] = STATUS_NO_VOLUME
        return result

    # Fall back to volume/footprint if upstream didn't compute height_minimal.
    if _is_missing(height_minimal) or height_minimal <= 0:
        height_minimal = volume / footprint

    if height_minimal > HEIGHT_SANITY_CAP_M:
        result['status_step4'] = STATUS_HEIGHT_EXCEEDS_CAP
        return result

    # Floor height — collapse the min/max range to a single representative value.
    gkat = result.get('gkat')
    gklas = result.get('gklas')
    fh_min, fh_max, source, description = get_floor_height(gkat, gklas)
    floor_height = (fh_min + fh_max) / 2
    if source == 'DEFAULT':
        append_warning(result, 'no GWR class match — using default floor height')

    # Floor count = height_minimal ÷ floor_height, at least one floor, capped
    # at GWR gastw if available. Only a *positive* gastw is a usable cap:
    # None, 0 and negative values all mean "no cap available". (A negative
    # value is truthy, so a plain `gastw_int or fallback` would let it through
    # and produce a negative floor count and area.)
    floors_estimate = max(1.0, height_minimal / floor_height)
    gastw_int = _to_gwr_code(result.get('gastw'))
    if gastw_int is not None and gastw_int > 0:
        max_floors = gastw_int
    else:
        max_floors = MAX_FLOORS_FALLBACK
    floors_estimate = min(floors_estimate, float(max_floors))

    # Gross floor area uses the unrounded estimate so it stays consistent
    # with footprint × (height ÷ floor_height).
    area_estimate = footprint * floors_estimate

    accuracy = determine_accuracy(
        gkat, gklas,
        has_volume=True, has_footprint=True,
        floor_height_source=source,
    )

    result['area_floor_total_m2'] = round(area_estimate, 2)
    result['area_accuracy'] = accuracy
    # Round half-away-from-zero (matches the JS web app's Math.round and
    # avoids Python's banker's-rounding surprise where round(2.5) == 2).
    result['floors_estimated'] = int(floors_estimate + 0.5)
    result['floor_height_used_m'] = round(floor_height, 2)
    result['floor_height_source'] = source
    result['building_type'] = description
    result['status_step4'] = STATUS_SUCCESS

    return result


# ── GWR Enrichment ──────────────────────────────────────────────────────────
#
# Two data access methods, in order of cost:
#   1. Bulk CSV download from housing-stat.ch — zero network calls per building
#   2. swisstopo `find` REST endpoint — one network call per building (fallback)


# Columns we need from the GWR CSV (source name → internal name).
# load_gwr_from_csv requires EGID and treats the other columns as optional.
GWR_COLUMNS = {
    'EGID': 'egid',
    'GKAT': 'gkat',
    'GKLAS': 'gklas',
    'GBAUJ': 'gbauj',
    'GASTW': 'gastw',
}

# The four attribute fields we want from any GWR source (CSV or API).
# Used by both query_gwr_api and enrich_with_gwr.
_GWR_OUTPUT_COLS = ('gkat', 'gklas', 'gbauj', 'gastw')

# Base URL of the swisstopo `find` endpoint; query_gwr_api appends the
# URL-encoded query string to it.
GWR_FIND_URL = "https://api3.geo.admin.ch/rest/services/ech/MapServer/find"


def load_gwr_from_csv(csv_path: str) -> pd.DataFrame:
    """
    Load GWR building data from a bulk CSV download.

    Source: https://www.housing-stat.ch/de/data/supply/public.html

    The file is read as ``;``-separated text. Only the columns named in
    ``GWR_COLUMNS`` are kept and renamed to their internal names; each is
    coerced to numeric, with unparseable cells becoming NaN. Rows without a
    usable EGID are dropped. If an EGID occurs more than once, the first row
    is kept and a warning is logged, so the returned index is always unique
    and safe for ``.reindex()`` lookups.

    Args:
        csv_path: Path to the GWR CSV file.

    Returns:
        DataFrame indexed by EGID (int, unique) with whichever of the
        columns gkat, gklas, gbauj, gastw are present in the CSV.

    Raises:
        ValueError: If the CSV has no ``EGID`` column.
    """
    log.info(f"Loading GWR data from {csv_path}...")

    df = pd.read_csv(csv_path, sep=';', dtype=str, low_memory=False)

    available = {src: dst for src, dst in GWR_COLUMNS.items() if src in df.columns}
    if 'EGID' not in available:
        raise ValueError(
            f"CSV does not contain EGID column. Found: {list(df.columns[:20])}"
        )

    df = df[list(available.keys())].rename(columns=available)

    # Everything was read as str; convert to numbers and let junk become NaN.
    for col in ['egid', 'gkat', 'gklas', 'gbauj', 'gastw']:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    df = df.dropna(subset=['egid'])
    df['egid'] = df['egid'].astype(int)
    df = df.set_index('egid')

    # A non-unique index makes DataFrame.reindex raise in enrich_with_gwr,
    # so collapse repeated EGIDs here (first occurrence wins).
    duplicate_rows = df.index.duplicated(keep='first')
    n_duplicates = int(duplicate_rows.sum())
    if n_duplicates:
        log.warning(
            f"  GWR CSV contains {n_duplicates} duplicate EGID rows — "
            f"keeping the first occurrence of each"
        )
        df = df[~duplicate_rows]

    log.info(f"  Loaded {len(df)} buildings from GWR CSV")
    return df


def query_gwr_api(egid: Any) -> dict[str, Optional[int]]:
    """
    Fetch a single building's GWR attributes via swisstopo `find` (one HTTP call).

    Uses the ``ch.bfs.gebaeude_wohnungs_register`` layer with
    ``searchField=egid``, which returns full feature attributes in a single
    request — replacing the older Search → Detail two-call pattern.

    This is a best-effort lookup: it never raises for network, HTTP, decoding
    or response-shape problems. Any such failure is logged at DEBUG level
    where applicable and reported as "no data".

    Args:
        egid: Building identifier; anything ``_to_gwr_code`` can coerce to int.

    Returns:
        Dict with the keys gkat, gklas, gbauj, gastw. Values are copied as-is
        from the first returned feature's attributes; a key is ``None`` when
        the attribute is absent, the EGID is unusable, nothing was found, or
        the request failed.
    """
    # Local import: only needed for the exception type caught below.
    import http.client

    result = {col: None for col in _GWR_OUTPUT_COLS}

    egid_int = _to_gwr_code(egid)
    if egid_int is None:
        return result

    # Canonical request — matches the curl in the swisstopo docs.
    # `sr` is intentionally omitted because returnGeometry=false means
    # there is no geometry to project.
    query = urllib.parse.urlencode({
        'layer': 'ch.bfs.gebaeude_wohnungs_register',
        'searchText': str(egid_int),
        'searchField': 'egid',
        'returnGeometry': 'false',
        'contains': 'false',
    })
    url = f"{GWR_FIND_URL}?{query}"

    # OSError covers URLError/HTTPError plus timeouts and connection resets
    # raised while reading the body (those are not wrapped in URLError).
    # ValueError covers JSONDecodeError and UnicodeDecodeError.
    # HTTPException covers protocol-level failures such as IncompleteRead.
    try:
        req = urllib.request.Request(url)
        with urllib.request.urlopen(req, timeout=GWR_API_TIMEOUT_S) as resp:
            data = json.loads(resp.read().decode('utf-8'))
    except (OSError, ValueError, http.client.HTTPException) as e:
        log.debug("GWR API query failed for EGID %s: %s", egid, e)
        return result

    # Valid JSON of an unexpected shape (e.g. a bare list) counts as "no data".
    results = data.get('results') if isinstance(data, dict) else None
    if not results or not isinstance(results, list):
        return result

    feature = results[0]
    if not isinstance(feature, dict):
        return result

    # `find` returns features with attributes either under 'properties'
    # or 'attributes' depending on the geometry/SR settings. Pick whichever
    # key is *present* (not just truthy) so an empty 'properties' dict
    # doesn't silently fall through and mask a populated 'attributes'.
    if 'properties' in feature:
        attrs = feature['properties'] or {}
    else:
        attrs = feature.get('attributes') or {}
    if not isinstance(attrs, dict):
        return result

    for col in _GWR_OUTPUT_COLS:
        if col in attrs:
            result[col] = attrs[col]

    return result


def enrich_with_gwr(
    buildings_df: pd.DataFrame,
    gwr_csv_path: Optional[str] = None,
) -> pd.DataFrame:
    """
    Add GWR classification columns (gkat, gklas, gbauj, gastw) to a DataFrame.

    Uses the bulk CSV when ``gwr_csv_path`` is provided (zero network calls).
    Otherwise falls back to one API call per building with an EGID, run on a
    thread pool of ``GWR_API_MAX_WORKERS`` workers.

    The EGID column is chosen once for the whole frame: ``av_egid`` if it has
    any non-null value, otherwise ``egid``. Rows whose chosen column is null
    are not looked up. If neither column has values, a warning is logged and
    the frame is returned with the (empty) GWR columns only.

    The two paths write results differently. The CSV path assigns every
    available GWR column for every row with an EGID, so EGIDs absent from the
    CSV end up NaN. The API path only writes attributes that came back
    non-None.

    Args:
        buildings_df: Input buildings; not mutated.
        gwr_csv_path: Optional path to a GWR bulk CSV.

    Returns:
        A copy of ``buildings_df`` with the four GWR columns present.
    """
    df = buildings_df.copy()

    for col in _GWR_OUTPUT_COLS:
        if col not in df.columns:
            df[col] = None

    # Prefer av_egid (from cadastral survey), fall back to user-supplied egid.
    if 'av_egid' in df.columns and df['av_egid'].notna().any():
        egid_col = 'av_egid'
    elif 'egid' in df.columns and df['egid'].notna().any():
        egid_col = 'egid'
    else:
        log.warning("No av_egid or egid column with values — skipping GWR enrichment")
        return df

    egids_available = df[egid_col].notna()
    n_with_egid = int(egids_available.sum())

    if gwr_csv_path:
        gwr_df = load_gwr_from_csv(gwr_csv_path)
        gwr_cols = [c for c in _GWR_OUTPUT_COLS if c in gwr_df.columns]

        # reindex aligns gwr_df rows to our EGIDs (NaN-filled where missing),
        # producing a frame with the same row order as our masked subset.
        keys = df.loc[egids_available, egid_col].astype(int)
        looked_up = gwr_df[gwr_cols].reindex(keys.values)
        looked_up.index = keys.index  # align back to df's row labels

        for col in gwr_cols:
            df.loc[egids_available, col] = looked_up[col].values

        # Match counted as "any GWR column populated" — same rule as the
        # API path below.
        matched_subset = df.loc[egids_available, list(gwr_cols)]
        matched = int(matched_subset.notna().any(axis=1).sum())
        log.info(f"  GWR CSV: matched {matched}/{n_with_egid} buildings")

    else:
        if n_with_egid > GWR_API_WARN_THRESHOLD:
            log.warning(
                f"Querying {n_with_egid} buildings via API "
                f"(parallel ×{GWR_API_MAX_WORKERS}). "
                f"Consider --gwr-csv for very large runs."
            )

        # Build (df_index, egid) pairs so we can write results back to the
        # right rows after the parallel pool returns out of order.
        targets = [
            (idx, int(df.at[idx, egid_col]))
            for idx in df.index[egids_available]
        ]

        log.info(
            f"  GWR API: parallel fetch ×{GWR_API_MAX_WORKERS} "
            f"for {n_with_egid} EGIDs"
        )
        t0 = time.monotonic()
        matched = 0
        completed = 0
        # Aim for ~20 progress lines: one every n/20 requests, but never more
        # often than every 5 requests nor less often than every 50. The final
        # request always logs regardless.
        progress_step = max(5, min(50, n_with_egid // 20 or 1))

        with ThreadPoolExecutor(max_workers=GWR_API_MAX_WORKERS) as ex:
            future_to_idx = {
                ex.submit(query_gwr_api, egid): (idx, egid)
                for idx, egid in targets
            }
            for fut in as_completed(future_to_idx):
                idx, egid = future_to_idx[fut]
                try:
                    attrs = fut.result()
                except Exception as e:  # noqa: BLE001 — log and continue
                    log.debug("GWR fetch raised for EGID %s: %s", egid, e)
                    attrs = {c: None for c in _GWR_OUTPUT_COLS}

                for col in _GWR_OUTPUT_COLS:
                    if attrs[col] is not None:
                        df.at[idx, col] = attrs[col]
                # Count as matched if ANY of the output cols came back —
                # not just gkat. A building can legitimately have gkat=None
                # but gklas/gbauj/gastw populated; we want to credit those
                # as successful lookups too.
                if any(attrs[c] is not None for c in _GWR_OUTPUT_COLS):
                    matched += 1

                completed += 1
                if completed % progress_step == 0 or completed == n_with_egid:
                    elapsed = time.monotonic() - t0
                    rate = completed / elapsed if elapsed > 0 else 0
                    log.info(
                        f"  GWR API: [{completed}/{n_with_egid}] "
                        f"{rate:.1f} req/s"
                    )

        elapsed = time.monotonic() - t0
        # Same zero-guard as the progress line: on a coarse monotonic clock
        # (or with a stubbed fetch) elapsed can be exactly 0.0, and an
        # unguarded division would raise ZeroDivisionError after all the
        # work is already done.
        overall_rate = n_with_egid / elapsed if elapsed > 0 else 0.0
        log.info(
            f"  GWR API: matched {matched}/{n_with_egid} in {elapsed:.0f}s "
            f"({overall_rate:.0f} req/s)"
        )

    return df