|
# app/train_mode.py
# Training mode: build hierarchical KNN model (floor classifier + per-floor X/Y regressors)
# Adds verbose dataset statistics useful for large training runs.
-
from __future__ import annotations

import glob
import math
import os
import time
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any, Callable, Dict, List, Optional, Tuple

import joblib
import numpy as np
import pandas as pd
import sklearn
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

# NOTE: these are already present in the project
from .csv_config import load_gateway_features_csv
from .logger_utils import log_msg as log
-
-
- @dataclass
- class GatewayStats:
- mac: str
- total_samples: int = 0 # total rows processed (per sample point)
- non_missing: int = 0 # non-missing rssi count
- missing: int = 0 # missing (nan) count
- sum_: float = 0.0
- sumsq: float = 0.0
- min_: float = float("inf")
- max_: float = float("-inf")
-
- def add(self, v: float, is_missing: bool) -> None:
- self.total_samples += 1
- if is_missing:
- self.missing += 1
- return
- self.non_missing += 1
- self.sum_ += v
- self.sumsq += v * v
- if v < self.min_:
- self.min_ = v
- if v > self.max_:
- self.max_ = v
-
- def mean(self) -> float:
- return self.sum_ / self.non_missing if self.non_missing else float("nan")
-
- def std(self) -> float:
- if self.non_missing <= 1:
- return float("nan")
- mu = self.mean()
- var = max(0.0, (self.sumsq / self.non_missing) - (mu * mu))
- return math.sqrt(var)
-
- def missing_pct(self) -> float:
- return (self.missing / self.total_samples) * 100.0 if self.total_samples else 0.0
-
-
- def _get(d: Dict[str, Any], key: str, default: Any = None) -> Any:
- return d.get(key, default) if isinstance(d, dict) else default
-
-
- def _as_bool(v: Any, default: bool = False) -> bool:
- if v is None:
- return default
- if isinstance(v, bool):
- return v
- if isinstance(v, (int, float)):
- return bool(v)
- s = str(v).strip().lower()
- return s in ("1", "true", "yes", "y", "on")
-
-
- def _safe_float(v: Any) -> Optional[float]:
- try:
- if v is None:
- return None
- if isinstance(v, float) and math.isnan(v):
- return None
- return float(v)
- except Exception:
- return None
-
-
- def _collect_dataset(
- sample_files: List[str],
- gateways_order: List[str],
- nan_fill: float,
- log: Callable[[str], None],
- verbose: bool,
- ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, Dict[str, GatewayStats], Dict[str, Any]]:
- """
- Build dataset from per-point sample csv files.
-
- Each sample file is expected to contain:
- header: mac;x;y;z;<GW1>;<GW2>...
- 1 row: beacon_mac; x; y; z; rssi_gw1; rssi_gw2; ...
-
- Returns:
- X (N, G), y_floor (N,), y_xy (N,2), meta_xy (N,2),
- gw_stats, global_stats
- """
- X_rows: List[List[float]] = []
- y_floor: List[int] = []
- y_xy: List[List[float]] = []
- meta_xy: List[List[float]] = []
-
- gw_stats: Dict[str, GatewayStats] = {gw: GatewayStats(mac=gw) for gw in gateways_order}
-
- floors_counter: Dict[int, int] = {}
- bad_files: int = 0
- missing_cols_files: int = 0
-
- expected_cols: Optional[List[str]] = None
-
- for fp in sample_files:
- try:
- df = pd.read_csv(fp, sep=";", dtype=str)
- except Exception as e:
- bad_files += 1
- if verbose:
- log(f"TRAIN WARN: cannot read sample file {fp}: {type(e).__name__}: {e}")
- continue
-
- if df.shape[0] < 1:
- bad_files += 1
- if verbose:
- log(f"TRAIN WARN: empty sample file {fp}")
- continue
-
- row = df.iloc[0].to_dict()
-
- if verbose:
- cols = list(df.columns)
- if expected_cols is None:
- expected_cols = cols
- elif cols != expected_cols:
- missing_cols_files += 1
- if missing_cols_files <= 5:
- log(f"TRAIN WARN: columns mismatch in {os.path.basename(fp)} (expected {len(expected_cols)} cols, got {len(cols)})")
-
- x = _safe_float(row.get("x"))
- y = _safe_float(row.get("y"))
- z = _safe_float(row.get("z"))
- if x is None or y is None or z is None:
- bad_files += 1
- if verbose:
- log(f"TRAIN WARN: missing x/y/z in {fp}")
- continue
-
- z_i = int(round(z))
- floors_counter[z_i] = floors_counter.get(z_i, 0) + 1
-
- feats: List[float] = []
- for gw in gateways_order:
- v = row.get(gw)
- fv = _safe_float(v)
- if fv is None:
- feats.append(nan_fill)
- gw_stats[gw].add(nan_fill, is_missing=True)
- else:
- feats.append(fv)
- gw_stats[gw].add(fv, is_missing=False)
-
- X_rows.append(feats)
- y_floor.append(z_i)
- y_xy.append([x, y])
- meta_xy.append([x, y])
-
- if not X_rows:
- raise RuntimeError("No valid samples found in samples_dir (dataset empty).")
-
- X = np.asarray(X_rows, dtype=np.float32)
- y_floor_arr = np.asarray(y_floor, dtype=np.int32)
- y_xy_arr = np.asarray(y_xy, dtype=np.float32)
- meta_xy_arr = np.asarray(meta_xy, dtype=np.float32)
-
- global_stats: Dict[str, Any] = {
- "samples_total_files": len(sample_files),
- "samples_used": int(X.shape[0]),
- "samples_bad": int(bad_files),
- "floors_counts": dict(sorted(floors_counter.items(), key=lambda kv: kv[0])),
- "missing_cols_files": int(missing_cols_files),
- "gateways": int(len(gateways_order)),
- "nan_fill": float(nan_fill),
- }
-
- return X, y_floor_arr, y_xy_arr, meta_xy_arr, gw_stats, global_stats
-
-
- def _log_train_stats(
- log: Callable[[str], None],
- X: np.ndarray,
- y_floor: np.ndarray,
- y_xy: np.ndarray,
- gateways_order: List[str],
- nan_fill: float,
- gw_stats: Dict[str, GatewayStats],
- global_stats: Dict[str, Any],
- top_k: int = 8,
- ) -> None:
- """Human-friendly statistics for training runs."""
- log(
- "TRAIN stats: "
- f"samples_used={global_stats.get('samples_used')} "
- f"samples_bad={global_stats.get('samples_bad')} "
- f"files_total={global_stats.get('samples_total_files')} "
- f"gateways={len(gateways_order)} "
- f"floors={list(global_stats.get('floors_counts', {}).keys())}"
- )
- if global_stats.get("missing_cols_files", 0):
- log(f"TRAIN stats: files_with_column_mismatch={global_stats['missing_cols_files']} (see earlier WARN lines)")
-
- xs = y_xy[:, 0]
- ys = y_xy[:, 1]
- log(
- "TRAIN stats: XY range "
- f"X[min,max]=[{float(np.min(xs)):.2f},{float(np.max(xs)):.2f}] "
- f"Y[min,max]=[{float(np.min(ys)):.2f},{float(np.max(ys)):.2f}]"
- )
-
- miss = int((X == nan_fill).sum())
- total = int(X.size)
- miss_pct = (miss / total) * 100.0 if total else 0.0
- log(f"TRAIN stats: feature sparsity missing={miss}/{total} ({miss_pct:.1f}%) using nan_fill={nan_fill}")
-
- gw_list = list(gw_stats.values())
- gw_list_sorted = sorted(gw_list, key=lambda s: (s.missing_pct(), -s.non_missing), reverse=True)
-
- worst = gw_list_sorted[: max(1, min(top_k, len(gw_list_sorted)))]
- worst_str = " | ".join(
- f"{g.mac}: miss={g.missing_pct():.1f}% (seen={g.non_missing}) mean={g.mean():.1f} std={g.std():.1f}"
- for g in worst
- )
- log(f"TRAIN stats: gateways with highest missing%: {worst_str}")
-
- best = list(reversed(gw_list_sorted))[: max(1, min(top_k, len(gw_list_sorted)))]
- best_str = " | ".join(
- f"{g.mac}: miss={g.missing_pct():.1f}% (seen={g.non_missing}) mean={g.mean():.1f} std={g.std():.1f}"
- for g in best
- )
- log(f"TRAIN stats: gateways with lowest missing%: {best_str}")
-
- floors = global_stats.get("floors_counts", {})
- if floors:
- floor_str = ", ".join(f"z={k}:{v}" for k, v in floors.items())
- log(f"TRAIN stats: floor distribution: {floor_str}")
-
-
- def run_train(settings: Dict[str, Any], log: Optional[Callable[[str], None]] = None) -> None:
- """
- Train hierarchical KNN:
- - KNeighborsClassifier for floor (Z)
- - For each floor, a KNeighborsRegressor for (X,Y) as multioutput
-
- Model saved with joblib to paths.model (or train.model_path).
- """
- if log is None:
- def log(msg: str) -> None:
- print(msg, flush=True)
-
- # Build stamp for this module (helps verifying which file is running)
- try:
- import hashlib
- from pathlib import Path
- _b = Path(__file__).read_bytes()
- log(f"TRAIN_MODE build sha256={hashlib.sha256(_b).hexdigest()[:12]} size={len(_b)}")
- except Exception:
- pass
-
- train_cfg = _get(settings, "train", {})
- paths = _get(settings, "paths", {})
- debug = _get(settings, "debug", {})
-
- samples_dir = _get(train_cfg, "samples_dir", _get(paths, "samples_dir", "/data/train/samples"))
- gateways_csv = _get(train_cfg, "gateways_csv", _get(paths, "gateways_csv", "/data/config/gateway.csv"))
- model_path = _get(train_cfg, "model_path", _get(paths, "model", "/data/model/model.joblib"))
-
- nan_fill = float(_get(train_cfg, "nan_fill", -110.0))
-
- k_floor = int(_get(train_cfg, "k_floor", _get(_get(settings, "ml", {}), "k", 7)))
- k_xy = int(_get(train_cfg, "k_xy", _get(_get(settings, "ml", {}), "k", 7)))
-
- weights = str(_get(train_cfg, "weights", _get(_get(settings, "ml", {}), "weights", "distance")))
- metric = str(_get(train_cfg, "metric", _get(_get(settings, "ml", {}), "metric", "euclidean")))
-
- verbose = _as_bool(_get(debug, "train_verbose", True), True)
- top_k = int(_get(debug, "train_stats_top_k", 8))
-
- backup_existing_model = _as_bool(_get(train_cfg, "backup_existing_model", True), True)
-
- log(
- "TRAIN config: "
- f"samples_dir={samples_dir} "
- f"gateways_csv={gateways_csv} "
- f"model_path={model_path} "
- f"nan_fill={nan_fill} "
- f"k_floor={k_floor} k_xy={k_xy} "
- f"weights={weights} metric={metric} "
- f"train_verbose={verbose} backup_existing_model={backup_existing_model}"
- )
-
- # 1) Load gateways definition to know feature order
- gws = load_gateway_features_csv(str(gateways_csv))
- gateways_order = [g.mac for g in gws]
- if not gateways_order:
- raise RuntimeError("No gateways found in gateways_csv (feature-set empty).")
-
- if verbose:
- preview = ", ".join(gateways_order[: min(6, len(gateways_order))])
- log(f"TRAIN: gateways(feature-order)={len(gateways_order)} first=[{preview}{'...' if len(gateways_order) > 6 else ''}]")
-
- # 2) Collect sample files
- sample_files = sorted(glob.glob(os.path.join(samples_dir, "*.csv")))
- if not sample_files:
- raise RuntimeError(f"No sample files found in samples_dir={samples_dir}")
-
- X, y_floor, y_xy, meta_xy, gw_stats, global_stats = _collect_dataset(
- sample_files=sample_files,
- gateways_order=gateways_order,
- nan_fill=nan_fill,
- log=log,
- verbose=verbose,
- )
-
- if verbose:
- _log_train_stats(
- log=log,
- X=X,
- y_floor=y_floor,
- y_xy=meta_xy,
- gateways_order=gateways_order,
- nan_fill=nan_fill,
- gw_stats=gw_stats,
- global_stats=global_stats,
- top_k=top_k,
- )
-
- # 3) Fit floor classifier
- floor_clf = KNeighborsClassifier(
- n_neighbors=k_floor,
- weights=weights,
- metric=metric,
- )
- floor_clf.fit(X, y_floor)
-
- # 4) Fit per-floor XY regressors (multioutput)
- models_xy: Dict[int, Any] = {}
- floors = sorted(set(int(z) for z in y_floor.tolist()))
- for z in floors:
- idx = np.where(y_floor == z)[0]
- Xz = X[idx, :]
- yz = y_xy[idx, :] # (N,2)
- reg = KNeighborsRegressor(
- n_neighbors=k_xy,
- weights=weights,
- metric=metric,
- )
- reg.fit(Xz, yz)
- models_xy[int(z)] = reg
-
- if verbose:
- xs = yz[:, 0]
- ys = yz[:, 1]
- log(
- f"TRAIN: floor z={z} samples={int(len(idx))} "
- f"Xrange=[{float(np.min(xs)):.1f},{float(np.max(xs)):.1f}] "
- f"Yrange=[{float(np.min(ys)):.1f},{float(np.max(ys)):.1f}]"
- )
-
- model = {
- "type": "hier_knn_floor_xy",
- "gateways_order": gateways_order,
- "nan_fill": nan_fill,
- "k_floor": k_floor,
- "k_xy": k_xy,
- "weights": weights,
- "metric": metric,
- "floor_clf": floor_clf,
- "xy_by_floor": models_xy,
- "floors": floors,
- }
-
- os.makedirs(os.path.dirname(model_path), exist_ok=True)
-
- # Backup previous model (così inferenza può continuare ad usare una versione nota)
- backup_path = None
- if backup_existing_model and os.path.exists(model_path):
- root, ext = os.path.splitext(model_path)
- ts = int(time.time())
- # evita collisioni se lanci due train nello stesso secondo
- for bump in range(0, 1000):
- cand = f"{root}_{ts + bump}{ext}"
- if not os.path.exists(cand):
- backup_path = cand
- break
- try:
- if backup_path:
- os.replace(model_path, backup_path)
- log(f"TRAIN: previous model moved to {backup_path}")
- except Exception as e:
- log(f"TRAIN WARNING: cannot backup previous model {model_path}: {type(e).__name__}: {e}")
-
- # Metadata utile (tipo 'modinfo' minimale)
- model["created_at_utc"] = datetime.utcnow().replace(microsecond=0).isoformat() + "Z"
- model["sklearn_version"] = getattr(sklearn, "__version__", "unknown")
- model["numpy_version"] = getattr(np, "__version__", "unknown")
-
- joblib.dump(model, model_path)
-
- log(
- f"TRAIN DONE: model saved to {model_path} "
- f"(samples={int(X.shape[0])}, gateways={len(gateways_order)}, floors={len(floors)})"
- )
|