|
# app/train_mode.py
# Training mode: build hierarchical KNN model (floor classifier + per-floor X/Y regressors)
# Adds verbose dataset statistics useful for large training runs.
-
from __future__ import annotations

import glob
import math
import os
import time
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any, Callable, Dict, List, Optional, Tuple

import joblib
import numpy as np
import pandas as pd
import sklearn
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

# NOTE: these are already present in the project
from .csv_config import load_gateway_features_csv
from .logger_utils import log_msg as log
-
-
- @dataclass
- class GatewayStats:
- mac: str
- total_samples: int = 0 # total rows processed (per sample point)
- non_missing: int = 0 # non-missing rssi count
- missing: int = 0 # missing (nan) count
- sum_: float = 0.0
- sumsq: float = 0.0
- min_: float = float("inf")
- max_: float = float("-inf")
-
- def add(self, v: float, is_missing: bool) -> None:
- self.total_samples += 1
- if is_missing:
- self.missing += 1
- return
- self.non_missing += 1
- self.sum_ += v
- self.sumsq += v * v
- if v < self.min_:
- self.min_ = v
- if v > self.max_:
- self.max_ = v
-
- def mean(self) -> float:
- return self.sum_ / self.non_missing if self.non_missing else float("nan")
-
- def std(self) -> float:
- if self.non_missing <= 1:
- return float("nan")
- mu = self.mean()
- var = max(0.0, (self.sumsq / self.non_missing) - (mu * mu))
- return math.sqrt(var)
-
- def missing_pct(self) -> float:
- return (self.missing / self.total_samples) * 100.0 if self.total_samples else 0.0
-
-
- def _get(d: Dict[str, Any], key: str, default: Any = None) -> Any:
- return d.get(key, default) if isinstance(d, dict) else default
-
-
- def _as_bool(v: Any, default: bool = False) -> bool:
- if v is None:
- return default
- if isinstance(v, bool):
- return v
- if isinstance(v, (int, float)):
- return bool(v)
- s = str(v).strip().lower()
- return s in ("1", "true", "yes", "y", "on")
-
-
- def _safe_float(v: Any) -> Optional[float]:
- try:
- if v is None:
- return None
- if isinstance(v, float) and math.isnan(v):
- return None
- return float(v)
- except Exception:
- return None
-
-
- def _collect_dataset(
- sample_files: List[str],
- gateways_order: List[str],
- nan_fill: float,
- log: Callable[[str], None],
- verbose: bool,
- ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, Dict[str, GatewayStats], Dict[str, Any]]:
- """
- Build dataset from per-point sample csv files.
-
- Each sample file is expected to contain:
- header: mac;x;y;z;<GW1>;<GW2>...
- 1 row: beacon_mac; x; y; z; rssi_gw1; rssi_gw2; ...
-
- Returns:
- X (N, G), y_floor (N,), y_xy (N,2), meta_xy (N,2),
- gw_stats, global_stats
- """
- X_rows: List[List[float]] = []
- y_floor: List[int] = []
- y_xy: List[List[float]] = []
- meta_xy: List[List[float]] = []
-
- gw_stats: Dict[str, GatewayStats] = {gw: GatewayStats(mac=gw) for gw in gateways_order}
-
- floors_counter: Dict[int, int] = {}
- bad_files: int = 0
- missing_cols_files: int = 0
-
- expected_cols: Optional[List[str]] = None
-
- for fp in sample_files:
- try:
- df = pd.read_csv(fp, sep=";", dtype=str)
- except Exception as e:
- bad_files += 1
- if verbose:
- log(f"TRAIN WARN: cannot read sample file {fp}: {type(e).__name__}: {e}")
- continue
-
- if df.shape[0] < 1:
- bad_files += 1
- if verbose:
- log(f"TRAIN WARN: empty sample file {fp}")
- continue
-
- row = df.iloc[0].to_dict()
-
- if verbose:
- cols = list(df.columns)
- if expected_cols is None:
- expected_cols = cols
- elif cols != expected_cols:
- missing_cols_files += 1
- if missing_cols_files <= 5:
- log(f"TRAIN WARN: columns mismatch in {os.path.basename(fp)} (expected {len(expected_cols)} cols, got {len(cols)})")
-
- x = _safe_float(row.get("x"))
- y = _safe_float(row.get("y"))
- z = _safe_float(row.get("z"))
- if x is None or y is None or z is None:
- bad_files += 1
- if verbose:
- log(f"TRAIN WARN: missing x/y/z in {fp}")
- continue
-
- z_i = int(round(z))
- floors_counter[z_i] = floors_counter.get(z_i, 0) + 1
-
- feats: List[float] = []
- for gw in gateways_order:
- v = row.get(gw)
- fv = _safe_float(v)
- if fv is None:
- feats.append(nan_fill)
- gw_stats[gw].add(nan_fill, is_missing=True)
- else:
- feats.append(fv)
- gw_stats[gw].add(fv, is_missing=False)
-
- X_rows.append(feats)
- y_floor.append(z_i)
- y_xy.append([x, y])
- meta_xy.append([x, y])
-
- if not X_rows:
- raise RuntimeError("No valid samples found in samples_dir (dataset empty).")
-
- X = np.asarray(X_rows, dtype=np.float32)
- y_floor_arr = np.asarray(y_floor, dtype=np.int32)
- y_xy_arr = np.asarray(y_xy, dtype=np.float32)
- meta_xy_arr = np.asarray(meta_xy, dtype=np.float32)
-
- global_stats: Dict[str, Any] = {
- "samples_total_files": len(sample_files),
- "samples_used": int(X.shape[0]),
- "samples_bad": int(bad_files),
- "floors_counts": dict(sorted(floors_counter.items(), key=lambda kv: kv[0])),
- "missing_cols_files": int(missing_cols_files),
- "gateways": int(len(gateways_order)),
- "nan_fill": float(nan_fill),
- }
-
- return X, y_floor_arr, y_xy_arr, meta_xy_arr, gw_stats, global_stats
-
-
- def _log_train_stats(
- log: Callable[[str], None],
- X: np.ndarray,
- y_floor: np.ndarray,
- y_xy: np.ndarray,
- gateways_order: List[str],
- nan_fill: float,
- gw_stats: Dict[str, GatewayStats],
- global_stats: Dict[str, Any],
- top_k: int = 8,
- ) -> None:
- """Human-friendly statistics for training runs."""
- log(
- "TRAIN stats: "
- f"samples_used={global_stats.get('samples_used')} "
- f"samples_bad={global_stats.get('samples_bad')} "
- f"files_total={global_stats.get('samples_total_files')} "
- f"gateways={len(gateways_order)} "
- f"floors={list(global_stats.get('floors_counts', {}).keys())}"
- )
- if global_stats.get("missing_cols_files", 0):
- log(f"TRAIN stats: files_with_column_mismatch={global_stats['missing_cols_files']} (see earlier WARN lines)")
-
- xs = y_xy[:, 0]
- ys = y_xy[:, 1]
- log(
- "TRAIN stats: XY range "
- f"X[min,max]=[{float(np.min(xs)):.2f},{float(np.max(xs)):.2f}] "
- f"Y[min,max]=[{float(np.min(ys)):.2f},{float(np.max(ys)):.2f}]"
- )
-
- miss = int((X == nan_fill).sum())
- total = int(X.size)
- miss_pct = (miss / total) * 100.0 if total else 0.0
- log(f"TRAIN stats: feature sparsity missing={miss}/{total} ({miss_pct:.1f}%) using nan_fill={nan_fill}")
-
- gw_list = list(gw_stats.values())
- gw_list_sorted = sorted(gw_list, key=lambda s: (s.missing_pct(), -s.non_missing), reverse=True)
-
- worst = gw_list_sorted[: max(1, min(top_k, len(gw_list_sorted)))]
- worst_str = " | ".join(
- f"{g.mac}: miss={g.missing_pct():.1f}% (seen={g.non_missing}) mean={g.mean():.1f} std={g.std():.1f}"
- for g in worst
- )
- log(f"TRAIN stats: gateways with highest missing%: {worst_str}")
-
- best = list(reversed(gw_list_sorted))[: max(1, min(top_k, len(gw_list_sorted)))]
- best_str = " | ".join(
- f"{g.mac}: miss={g.missing_pct():.1f}% (seen={g.non_missing}) mean={g.mean():.1f} std={g.std():.1f}"
- for g in best
- )
- log(f"TRAIN stats: gateways with lowest missing%: {best_str}")
-
- floors = global_stats.get("floors_counts", {})
- if floors:
- floor_str = ", ".join(f"z={k}:{v}" for k, v in floors.items())
- log(f"TRAIN stats: floor distribution: {floor_str}")
-
-
- def run_train(settings: Dict[str, Any], log: Optional[Callable[[str], None]] = None) -> None:
- """
- Train hierarchical KNN:
- - KNeighborsClassifier for floor (Z)
- - For each floor, a KNeighborsRegressor for (X,Y) as multioutput
-
- Model saved with joblib to paths.model (or train.model_path).
- """
- if log is None:
- def log(msg: str) -> None:
- print(msg, flush=True)
-
- # Build stamp for this module (helps verifying which file is running)
- try:
- import hashlib
- from pathlib import Path
- _b = Path(__file__).read_bytes()
- log(f"TRAIN_MODE build sha256={hashlib.sha256(_b).hexdigest()[:12]} size={len(_b)}")
- except Exception:
- pass
-
- train_cfg = _get(settings, "train", {})
- paths = _get(settings, "paths", {})
- debug = _get(settings, "debug", {})
-
- samples_dir = _get(train_cfg, "samples_dir", _get(paths, "samples_dir", "/data/train/samples"))
- gateways_csv = _get(train_cfg, "gateways_csv", _get(paths, "gateways_csv", "/data/config/gateway.csv"))
- model_path = _get(train_cfg, "model_path", _get(paths, "model", "/data/model/model.joblib"))
-
- nan_fill = float(_get(train_cfg, "nan_fill", -110.0))
-
- k_floor = int(_get(train_cfg, "k_floor", _get(_get(settings, "ml", {}), "k", 7)))
- k_xy = int(_get(train_cfg, "k_xy", _get(_get(settings, "ml", {}), "k", 7)))
-
- weights = str(_get(train_cfg, "weights", _get(_get(settings, "ml", {}), "weights", "distance")))
- metric = str(_get(train_cfg, "metric", _get(_get(settings, "ml", {}), "metric", "euclidean")))
-
- verbose = _as_bool(_get(debug, "train_verbose", True), True)
- top_k = int(_get(debug, "train_stats_top_k", 8))
-
- backup_existing_model = _as_bool(_get(train_cfg, "backup_existing_model", True), True)
-
- log(
- "TRAIN config: "
- f"samples_dir={samples_dir} "
- f"gateways_csv={gateways_csv} "
- f"model_path={model_path} "
- f"nan_fill={nan_fill} "
- f"k_floor={k_floor} k_xy={k_xy} "
- f"weights={weights} metric={metric} "
- f"train_verbose={verbose} backup_existing_model={backup_existing_model}"
- )
-
- # 1) Load gateways definition to know feature order
- gws = load_gateway_features_csv(str(gateways_csv))
- gateways_order = [g.mac for g in gws]
- if not gateways_order:
- raise RuntimeError("No gateways found in gateways_csv (feature-set empty).")
-
- if verbose:
- preview = ", ".join(gateways_order[: min(6, len(gateways_order))])
- log(f"TRAIN: gateways(feature-order)={len(gateways_order)} first=[{preview}{'...' if len(gateways_order) > 6 else ''}]")
-
- # 2) Collect sample files
- sample_files = sorted(glob.glob(os.path.join(samples_dir, "*.csv")))
- if not sample_files:
- raise RuntimeError(f"No sample files found in samples_dir={samples_dir}")
-
- X, y_floor, y_xy, meta_xy, gw_stats, global_stats = _collect_dataset(
- sample_files=sample_files,
- gateways_order=gateways_order,
- nan_fill=nan_fill,
- log=log,
- verbose=verbose,
- )
-
- if verbose:
- _log_train_stats(
- log=log,
- X=X,
- y_floor=y_floor,
- y_xy=meta_xy,
- gateways_order=gateways_order,
- nan_fill=nan_fill,
- gw_stats=gw_stats,
- global_stats=global_stats,
- top_k=top_k,
- )
-
- # 3) Fit floor classifier
- floor_clf = KNeighborsClassifier(
- n_neighbors=k_floor,
- weights=weights,
- metric=metric,
- )
- floor_clf.fit(X, y_floor)
-
- # 4) Fit per-floor XY regressors (multioutput)
- models_xy: Dict[int, Any] = {}
- floors = sorted(set(int(z) for z in y_floor.tolist()))
- for z in floors:
- idx = np.where(y_floor == z)[0]
- Xz = X[idx, :]
- yz = y_xy[idx, :] # (N,2)
- reg = KNeighborsRegressor(
- n_neighbors=k_xy,
- weights=weights,
- metric=metric,
- )
- reg.fit(Xz, yz)
- models_xy[int(z)] = reg
-
- if verbose:
- xs = yz[:, 0]
- ys = yz[:, 1]
- log(
- f"TRAIN: floor z={z} samples={int(len(idx))} "
- f"Xrange=[{float(np.min(xs)):.1f},{float(np.max(xs)):.1f}] "
- f"Yrange=[{float(np.min(ys)):.1f},{float(np.max(ys)):.1f}]"
- )
-
- model = {
- "type": "hier_knn_floor_xy",
- "gateways_order": gateways_order,
- "nan_fill": nan_fill,
- "k_floor": k_floor,
- "k_xy": k_xy,
- "weights": weights,
- "metric": metric,
- "floor_clf": floor_clf,
- "xy_by_floor": models_xy,
- "floors": floors,
- }
-
- os.makedirs(os.path.dirname(model_path), exist_ok=True)
-
- # Backup previous model (così inferenza può continuare ad usare una versione nota)
- backup_path = None
- if backup_existing_model and os.path.exists(model_path):
- root, ext = os.path.splitext(model_path)
- ts = int(time.time())
- # evita collisioni se lanci due train nello stesso secondo
- for bump in range(0, 1000):
- cand = f"{root}_{ts + bump}{ext}"
- if not os.path.exists(cand):
- backup_path = cand
- break
- try:
- if backup_path:
- os.replace(model_path, backup_path)
- log(f"TRAIN: previous model moved to {backup_path}")
- except Exception as e:
- log(f"TRAIN WARNING: cannot backup previous model {model_path}: {type(e).__name__}: {e}")
-
- # Metadata utile (tipo 'modinfo' minimale)
- model["created_at_utc"] = datetime.utcnow().replace(microsecond=0).isoformat() + "Z"
- model["sklearn_version"] = getattr(sklearn, "__version__", "unknown")
- model["numpy_version"] = getattr(np, "__version__", "unknown")
-
- joblib.dump(model, model_path)
-
- log(
- f"TRAIN DONE: model saved to {model_path} "
- f"(samples={int(X.shape[0])}, gateways={len(gateways_order)}, floors={len(floors)})"
- )
|