# app/train_mode.py # Training mode: build hierarchical KNN model (floor classifier + per-floor X/Y regressors) # Adds verbose dataset statistics useful for large training runs. from __future__ import annotations import glob import os import time import math from dataclasses import dataclass from typing import Any, Callable, Dict, List, Optional, Tuple import joblib from datetime import datetime import numpy as np import pandas as pd from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor import sklearn # NOTE: these are already present in the project from .csv_config import load_gateway_features_csv from .logger_utils import log_msg as log @dataclass class GatewayStats: mac: str total_samples: int = 0 # total rows processed (per sample point) non_missing: int = 0 # non-missing rssi count missing: int = 0 # missing (nan) count sum_: float = 0.0 sumsq: float = 0.0 min_: float = float("inf") max_: float = float("-inf") def add(self, v: float, is_missing: bool) -> None: self.total_samples += 1 if is_missing: self.missing += 1 return self.non_missing += 1 self.sum_ += v self.sumsq += v * v if v < self.min_: self.min_ = v if v > self.max_: self.max_ = v def mean(self) -> float: return self.sum_ / self.non_missing if self.non_missing else float("nan") def std(self) -> float: if self.non_missing <= 1: return float("nan") mu = self.mean() var = max(0.0, (self.sumsq / self.non_missing) - (mu * mu)) return math.sqrt(var) def missing_pct(self) -> float: return (self.missing / self.total_samples) * 100.0 if self.total_samples else 0.0 def _get(d: Dict[str, Any], key: str, default: Any = None) -> Any: return d.get(key, default) if isinstance(d, dict) else default def _as_bool(v: Any, default: bool = False) -> bool: if v is None: return default if isinstance(v, bool): return v if isinstance(v, (int, float)): return bool(v) s = str(v).strip().lower() return s in ("1", "true", "yes", "y", "on") def _safe_float(v: Any) -> Optional[float]: try: if v is None: return None if isinstance(v, float) and math.isnan(v): return None return float(v) except Exception: return None def _collect_dataset( sample_files: List[str], gateways_order: List[str], nan_fill: float, log: Callable[[str], None], verbose: bool, ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, Dict[str, GatewayStats], Dict[str, Any]]: """ Build dataset from per-point sample csv files. Each sample file is expected to contain: header: mac;x;y;z;;... 1 row: beacon_mac; x; y; z; rssi_gw1; rssi_gw2; ... Returns: X (N, G), y_floor (N,), y_xy (N,2), meta_xy (N,2), gw_stats, global_stats """ X_rows: List[List[float]] = [] y_floor: List[int] = [] y_xy: List[List[float]] = [] meta_xy: List[List[float]] = [] gw_stats: Dict[str, GatewayStats] = {gw: GatewayStats(mac=gw) for gw in gateways_order} floors_counter: Dict[int, int] = {} bad_files: int = 0 missing_cols_files: int = 0 expected_cols: Optional[List[str]] = None for fp in sample_files: try: df = pd.read_csv(fp, sep=";", dtype=str) except Exception as e: bad_files += 1 if verbose: log(f"TRAIN WARN: cannot read sample file {fp}: {type(e).__name__}: {e}") continue if df.shape[0] < 1: bad_files += 1 if verbose: log(f"TRAIN WARN: empty sample file {fp}") continue row = df.iloc[0].to_dict() if verbose: cols = list(df.columns) if expected_cols is None: expected_cols = cols elif cols != expected_cols: missing_cols_files += 1 if missing_cols_files <= 5: log(f"TRAIN WARN: columns mismatch in {os.path.basename(fp)} (expected {len(expected_cols)} cols, got {len(cols)})") x = _safe_float(row.get("x")) y = _safe_float(row.get("y")) z = _safe_float(row.get("z")) if x is None or y is None or z is None: bad_files += 1 if verbose: log(f"TRAIN WARN: missing x/y/z in {fp}") continue z_i = int(round(z)) floors_counter[z_i] = floors_counter.get(z_i, 0) + 1 feats: List[float] = [] for gw in gateways_order: v = row.get(gw) fv = _safe_float(v) if fv is None: feats.append(nan_fill) gw_stats[gw].add(nan_fill, is_missing=True) else: feats.append(fv) gw_stats[gw].add(fv, is_missing=False) X_rows.append(feats) y_floor.append(z_i) y_xy.append([x, y]) meta_xy.append([x, y]) if not X_rows: raise RuntimeError("No valid samples found in samples_dir (dataset empty).") X = np.asarray(X_rows, dtype=np.float32) y_floor_arr = np.asarray(y_floor, dtype=np.int32) y_xy_arr = np.asarray(y_xy, dtype=np.float32) meta_xy_arr = np.asarray(meta_xy, dtype=np.float32) global_stats: Dict[str, Any] = { "samples_total_files": len(sample_files), "samples_used": int(X.shape[0]), "samples_bad": int(bad_files), "floors_counts": dict(sorted(floors_counter.items(), key=lambda kv: kv[0])), "missing_cols_files": int(missing_cols_files), "gateways": int(len(gateways_order)), "nan_fill": float(nan_fill), } return X, y_floor_arr, y_xy_arr, meta_xy_arr, gw_stats, global_stats def _log_train_stats( log: Callable[[str], None], X: np.ndarray, y_floor: np.ndarray, y_xy: np.ndarray, gateways_order: List[str], nan_fill: float, gw_stats: Dict[str, GatewayStats], global_stats: Dict[str, Any], top_k: int = 8, ) -> None: """Human-friendly statistics for training runs.""" log( "TRAIN stats: " f"samples_used={global_stats.get('samples_used')} " f"samples_bad={global_stats.get('samples_bad')} " f"files_total={global_stats.get('samples_total_files')} " f"gateways={len(gateways_order)} " f"floors={list(global_stats.get('floors_counts', {}).keys())}" ) if global_stats.get("missing_cols_files", 0): log(f"TRAIN stats: files_with_column_mismatch={global_stats['missing_cols_files']} (see earlier WARN lines)") xs = y_xy[:, 0] ys = y_xy[:, 1] log( "TRAIN stats: XY range " f"X[min,max]=[{float(np.min(xs)):.2f},{float(np.max(xs)):.2f}] " f"Y[min,max]=[{float(np.min(ys)):.2f},{float(np.max(ys)):.2f}]" ) miss = int((X == nan_fill).sum()) total = int(X.size) miss_pct = (miss / total) * 100.0 if total else 0.0 log(f"TRAIN stats: feature sparsity missing={miss}/{total} ({miss_pct:.1f}%) using nan_fill={nan_fill}") gw_list = list(gw_stats.values()) gw_list_sorted = sorted(gw_list, key=lambda s: (s.missing_pct(), -s.non_missing), reverse=True) worst = gw_list_sorted[: max(1, min(top_k, len(gw_list_sorted)))] worst_str = " | ".join( f"{g.mac}: miss={g.missing_pct():.1f}% (seen={g.non_missing}) mean={g.mean():.1f} std={g.std():.1f}" for g in worst ) log(f"TRAIN stats: gateways with highest missing%: {worst_str}") best = list(reversed(gw_list_sorted))[: max(1, min(top_k, len(gw_list_sorted)))] best_str = " | ".join( f"{g.mac}: miss={g.missing_pct():.1f}% (seen={g.non_missing}) mean={g.mean():.1f} std={g.std():.1f}" for g in best ) log(f"TRAIN stats: gateways with lowest missing%: {best_str}") floors = global_stats.get("floors_counts", {}) if floors: floor_str = ", ".join(f"z={k}:{v}" for k, v in floors.items()) log(f"TRAIN stats: floor distribution: {floor_str}") def run_train(settings: Dict[str, Any], log: Optional[Callable[[str], None]] = None) -> None: """ Train hierarchical KNN: - KNeighborsClassifier for floor (Z) - For each floor, a KNeighborsRegressor for (X,Y) as multioutput Model saved with joblib to paths.model (or train.model_path). """ if log is None: def log(msg: str) -> None: print(msg, flush=True) # Build stamp for this module (helps verifying which file is running) try: import hashlib from pathlib import Path _b = Path(__file__).read_bytes() log(f"TRAIN_MODE build sha256={hashlib.sha256(_b).hexdigest()[:12]} size={len(_b)}") except Exception: pass train_cfg = _get(settings, "train", {}) paths = _get(settings, "paths", {}) debug = _get(settings, "debug", {}) samples_dir = _get(train_cfg, "samples_dir", _get(paths, "samples_dir", "/data/train/samples")) gateways_csv = _get(train_cfg, "gateways_csv", _get(paths, "gateways_csv", "/data/config/gateway.csv")) model_path = _get(train_cfg, "model_path", _get(paths, "model", "/data/model/model.joblib")) nan_fill = float(_get(train_cfg, "nan_fill", -110.0)) k_floor = int(_get(train_cfg, "k_floor", _get(_get(settings, "ml", {}), "k", 7))) k_xy = int(_get(train_cfg, "k_xy", _get(_get(settings, "ml", {}), "k", 7))) weights = str(_get(train_cfg, "weights", _get(_get(settings, "ml", {}), "weights", "distance"))) metric = str(_get(train_cfg, "metric", _get(_get(settings, "ml", {}), "metric", "euclidean"))) verbose = _as_bool(_get(debug, "train_verbose", True), True) top_k = int(_get(debug, "train_stats_top_k", 8)) backup_existing_model = _as_bool(_get(train_cfg, "backup_existing_model", True), True) log( "TRAIN config: " f"samples_dir={samples_dir} " f"gateways_csv={gateways_csv} " f"model_path={model_path} " f"nan_fill={nan_fill} " f"k_floor={k_floor} k_xy={k_xy} " f"weights={weights} metric={metric} " f"train_verbose={verbose} backup_existing_model={backup_existing_model}" ) # 1) Load gateways definition to know feature order gws = load_gateway_features_csv(str(gateways_csv)) gateways_order = [g.mac for g in gws] if not gateways_order: raise RuntimeError("No gateways found in gateways_csv (feature-set empty).") if verbose: preview = ", ".join(gateways_order[: min(6, len(gateways_order))]) log(f"TRAIN: gateways(feature-order)={len(gateways_order)} first=[{preview}{'...' if len(gateways_order) > 6 else ''}]") # 2) Collect sample files sample_files = sorted(glob.glob(os.path.join(samples_dir, "*.csv"))) if not sample_files: raise RuntimeError(f"No sample files found in samples_dir={samples_dir}") X, y_floor, y_xy, meta_xy, gw_stats, global_stats = _collect_dataset( sample_files=sample_files, gateways_order=gateways_order, nan_fill=nan_fill, log=log, verbose=verbose, ) if verbose: _log_train_stats( log=log, X=X, y_floor=y_floor, y_xy=meta_xy, gateways_order=gateways_order, nan_fill=nan_fill, gw_stats=gw_stats, global_stats=global_stats, top_k=top_k, ) # 3) Fit floor classifier floor_clf = KNeighborsClassifier( n_neighbors=k_floor, weights=weights, metric=metric, ) floor_clf.fit(X, y_floor) # 4) Fit per-floor XY regressors (multioutput) models_xy: Dict[int, Any] = {} floors = sorted(set(int(z) for z in y_floor.tolist())) for z in floors: idx = np.where(y_floor == z)[0] Xz = X[idx, :] yz = y_xy[idx, :] # (N,2) reg = KNeighborsRegressor( n_neighbors=k_xy, weights=weights, metric=metric, ) reg.fit(Xz, yz) models_xy[int(z)] = reg if verbose: xs = yz[:, 0] ys = yz[:, 1] log( f"TRAIN: floor z={z} samples={int(len(idx))} " f"Xrange=[{float(np.min(xs)):.1f},{float(np.max(xs)):.1f}] " f"Yrange=[{float(np.min(ys)):.1f},{float(np.max(ys)):.1f}]" ) model = { "type": "hier_knn_floor_xy", "gateways_order": gateways_order, "nan_fill": nan_fill, "k_floor": k_floor, "k_xy": k_xy, "weights": weights, "metric": metric, "floor_clf": floor_clf, "xy_by_floor": models_xy, "floors": floors, } os.makedirs(os.path.dirname(model_path), exist_ok=True) # Backup previous model (così inferenza può continuare ad usare una versione nota) backup_path = None if backup_existing_model and os.path.exists(model_path): root, ext = os.path.splitext(model_path) ts = int(time.time()) # evita collisioni se lanci due train nello stesso secondo for bump in range(0, 1000): cand = f"{root}_{ts + bump}{ext}" if not os.path.exists(cand): backup_path = cand break try: if backup_path: os.replace(model_path, backup_path) log(f"TRAIN: previous model moved to {backup_path}") except Exception as e: log(f"TRAIN WARNING: cannot backup previous model {model_path}: {type(e).__name__}: {e}") # Metadata utile (tipo 'modinfo' minimale) model["created_at_utc"] = datetime.utcnow().replace(microsecond=0).isoformat() + "Z" model["sklearn_version"] = getattr(sklearn, "__version__", "unknown") model["numpy_version"] = getattr(np, "__version__", "unknown") joblib.dump(model, model_path) log( f"TRAIN DONE: model saved to {model_path} " f"(samples={int(X.shape[0])}, gateways={len(gateways_order)}, floors={len(floors)})" )