Source code for footix.utils.team_name_resolver

"""Robust team name resolver for matching calendar names to training names.

Combines a static YAML mapping (hand-curated, per-league) with rapidfuzz
fuzzy matching (WRatio). When the fuzzy confidence is insufficient and the
resolver is in interactive mode, the user is prompted to confirm or provide the
correct training name. Confirmed mappings are persisted back to the YAML so
subsequent runs require no interaction for the same team names.

Usage:
    >>> from footix.utils.team_name_resolver import TeamNameResolver
    >>> resolver = TeamNameResolver(league="ligue_1", interactive=True)
    >>> mapping = resolver.resolve(calendar_names=[...], training_names=[...])

"""

from __future__ import annotations

import sys
from pathlib import Path

import yaml
from rapidfuzz import fuzz, process

# Default mapping directory: three directory levels above this file, then
# data/team_name_mappings/ (described by the original author as
# <repo_root>/data/team_name_mappings/ — this holds only if the module sits
# exactly two package levels below the repository root).
_DEFAULT_MAPPING_DIR = Path(__file__).parent.parent.parent / "data" / "team_name_mappings"

# Map full competition strings to league YAML keys. The key is used as the
# stem of the per-league mapping file (``<league_key>.yaml``). Strings that are
# not listed here are used unchanged as the league key by TeamNameResolver.
COMPETITION_TO_LEAGUE_KEY: dict[str, str] = {
    "FRA Ligue 1": "ligue_1",
    "FRA Ligue 2": "ligue_2",
    "DEU Bundesliga 1": "bundesliga_1",
}


class UnresolvedTeamNameError(ValueError):
    """Raised when a team name cannot be resolved in non-interactive mode.

    Attributes:
        team_name: The unresolved calendar team name.
        candidates: Top fuzzy candidates as (name, score) tuples.
    """

    def __init__(
        self,
        message: str,
        team_name: str,
        candidates: list[tuple[str, float]],
    ) -> None:
        """Initialize UnresolvedTeamNameError.

        Args:
            message: Human-readable error description.
            team_name: The calendar name that failed to resolve.
            candidates: Top rapidfuzz candidates.
        """
        super().__init__(message)
        self.team_name = team_name
        self.candidates = candidates

    def __reduce__(self) -> tuple[type, tuple[str, str, list[tuple[str, float]]]]:
        """Support ``copy`` and ``pickle`` despite the three-argument ``__init__``.

        The default exception reduction rebuilds the object from ``self.args``,
        which only holds the message, so the reconstruction would fail with a
        ``TypeError`` for the missing ``team_name`` and ``candidates``.

        Returns:
            The ``(callable, args)`` pair used to rebuild this exception.
        """
        message = self.args[0] if self.args else ""
        return (type(self), (message, self.team_name, self.candidates))
class TeamNameResolver:
    """Map calendar-format team names to model-training team names.

    Resolution priority:

    1. Static YAML mapping (case-insensitive exact match).
    2. Exact case-insensitive match in training names.
    3. rapidfuzz WRatio ≥ ``auto_threshold`` → auto-accept.
    4. rapidfuzz WRatio ≥ ``confirm_threshold`` → interactive confirm (if
       ``interactive=True``) or ``UnresolvedTeamNameError`` otherwise.
    5. Below ``confirm_threshold`` → interactive selection from top-5
       candidates or ``UnresolvedTeamNameError`` in non-interactive mode.

    New mappings discovered during a run are persisted to the YAML file so
    subsequent runs skip the interactive step for the same team names. A name
    the user *skips* is returned mapped to itself but is not persisted, so it
    is asked about (or raised on) again next time.

    Args:
        league: League YAML key, e.g. ``"ligue_1"``, ``"ligue_2"``,
            ``"bundesliga_1"``. Can also be a full competition string such as
            ``"FRA Ligue 1"`` — it will be converted automatically.
        mapping_dir: Directory containing ``<league>.yaml`` mapping files.
            Defaults to the module-level ``_DEFAULT_MAPPING_DIR``.
        interactive: When ``True``, ambiguous matches trigger a CLI prompt.
            Set to ``False`` for CI / non-interactive pipelines;
            ``UnresolvedTeamNameError`` is raised instead.
        auto_threshold: rapidfuzz WRatio score (0–100) at or above which a
            match is accepted automatically. Default ``90``.
        confirm_threshold: rapidfuzz WRatio score (0–100) at or above which a
            match is proposed for interactive confirmation. Default ``70``.
    """

    def __init__(
        self,
        league: str,
        mapping_dir: Path | None = None,
        interactive: bool = True,
        auto_threshold: float = 90.0,
        confirm_threshold: float = 70.0,
    ) -> None:
        """Initialize TeamNameResolver.

        Args:
            league: League key or full competition string.
            mapping_dir: Optional override for the YAML directory.
            interactive: Enable interactive CLI prompts for ambiguous names.
            auto_threshold: WRatio score for automatic acceptance.
            confirm_threshold: WRatio score for interactive confirmation.
        """
        # Accept both "FRA Ligue 1" and "ligue_1" as league identifier
        self._league = COMPETITION_TO_LEAGUE_KEY.get(league, league)
        self._mapping_dir = mapping_dir or _DEFAULT_MAPPING_DIR
        self._interactive = interactive
        self._auto_threshold = auto_threshold
        self._confirm_threshold = confirm_threshold
        self._yaml_path = self._mapping_dir / f"{self._league}.yaml"
        self._static_map: dict[str, str] = self._load_static_map()
        # Mappings found during resolve() that are not yet written to disk.
        self._new_mappings: dict[str, str] = {}

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _load_static_map(self) -> dict[str, str]:
        """Load mappings from the YAML file.

        Returns:
            Dict mapping calendar names (stripped) to training names. Empty
            when the file does not exist or has no ``mappings`` section.
        """
        if not self._yaml_path.exists():
            return {}
        with open(self._yaml_path, encoding="utf-8") as f:
            data = yaml.safe_load(f) or {}
        raw_mappings: dict[str, str] = data.get("mappings") or {}
        return {k.strip(): v for k, v in raw_mappings.items()}

    def _check_static(self, name: str) -> str | None:
        """Look up *name* in the static YAML mapping (case-insensitive).

        Args:
            name: Calendar team name.

        Returns:
            Mapped training name, or ``None`` if not found.
        """
        # Exact match first (fastest)
        if name in self._static_map:
            return self._static_map[name]
        # Case-insensitive, whitespace-insensitive fallback
        lower = name.strip().lower()
        for key, value in self._static_map.items():
            if key.strip().lower() == lower:
                return value
        return None

    def _fuzzy_candidates(
        self, name: str, candidates: list[str], limit: int = 5
    ) -> list[tuple[str, float]]:
        """Compute rapidfuzz WRatio scores between *name* and *candidates*.

        Args:
            name: Query team name.
            candidates: Pool of training team names.
            limit: Maximum number of results to return.

        Returns:
            List of (candidate_name, score) pairs in the order returned by
            ``process.extract`` (best match first).
        """
        results = process.extract(
            name,
            candidates,
            scorer=fuzz.WRatio,
            limit=limit,
            score_cutoff=0,
        )
        # Each result is (match, score, index); the index is not needed here.
        return [(match, float(score)) for match, score, _ in results]

    def _confirm_proposal(self, name: str, best_match: str, best_score: float) -> bool:
        """Show the best fuzzy match and ask the user to accept or reject it.

        Args:
            name: Calendar team name being resolved.
            best_match: Highest-scoring training name.
            best_score: WRatio score of ``best_match``.

        Returns:
            ``True`` when the user accepts (empty answer, ``y`` or ``yes``),
            ``False`` otherwise.
        """
        print(
            f"\n{'─' * 60}\n"
            f"🔍 Proposed match:\n"
            f" '{name}' → '{best_match}'\n"
            f" (score {best_score:.0f}/100)",
            file=sys.stderr,
        )
        try:
            ans = input(" Accept? [Y/n]: ").strip().lower()
        except EOFError:
            # Closed stdin behaves like pressing Enter, i.e. the default "yes".
            ans = ""
        if ans in ("", "y", "yes"):
            print(f" ✅ Accepted '{name}' → '{best_match}'", file=sys.stderr)
            return True
        return False

    def _interactive_confirm(self, name: str, candidates: list[tuple[str, float]]) -> str:
        """Prompt the user to resolve an ambiguous team name.

        Displays top candidates with their scores and asks the user to pick
        one, type a custom name, or skip (which keeps the original calendar
        name). A closed stdin at either prompt is treated as a skip.

        Args:
            name: Unresolved calendar team name.
            candidates: Top (candidate, score) tuples from rapidfuzz.

        Returns:
            Resolved training name chosen or typed by the user, or *name*
            itself when the user skips.
        """
        top = candidates[:5]
        print(f"\n{'─' * 60}", file=sys.stderr)
        print(f"⚠️ Cannot auto-resolve: '{name}'", file=sys.stderr)
        print(" Top candidates (rapidfuzz WRatio score / 100):", file=sys.stderr)
        for i, (candidate, score) in enumerate(top, 1):
            print(f" [{i}] {candidate!r:<30} score: {score:.0f}", file=sys.stderr)
        print(" [0] Enter the correct name manually", file=sys.stderr)
        print(
            " [s] Skip — keep original name (will likely cause a model error)",
            file=sys.stderr,
        )

        while True:
            try:
                raw = input(" Your choice: ").strip()
            except EOFError:
                # Non-interactive environment despite interactive=True
                return name

            if raw.lower() == "s":
                print(f" ⚠️ Skipping '{name}' — using original name.", file=sys.stderr)
                return name

            if raw == "0":
                try:
                    manual = input(" Type the correct training name: ").strip()
                except EOFError:
                    # stdin closed mid-dialogue: same outcome as the first prompt.
                    return name
                if manual:
                    print(f" ✅ Mapped '{name}' → '{manual}'", file=sys.stderr)
                    return manual
                continue

            try:
                idx = int(raw)
            except ValueError:
                idx = -1
            if 1 <= idx <= len(top):
                picked = top[idx - 1][0]
                print(f" ✅ Mapped '{name}' → '{picked}'", file=sys.stderr)
                return picked

            # Report the range that is actually on screen, not a fixed 5.
            print(
                f" Please enter a number between 0 and {len(top)}, or 's' to skip.",
                file=sys.stderr,
            )

    def _persist(self) -> None:
        """Append newly discovered mappings to the YAML file.

        Existing mappings are preserved; only new entries are added. The
        in-memory static map is refreshed as well, so later ``resolve`` calls
        on this instance reuse the new entries without prompting again.

        Note:
            YAML comments are removed on rewrite.
        """
        if not self._new_mappings:
            return

        self._yaml_path.parent.mkdir(parents=True, exist_ok=True)

        if self._yaml_path.exists():
            with open(self._yaml_path, encoding="utf-8") as f:
                data = yaml.safe_load(f) or {}
        else:
            data = {}

        existing: dict[str, str] = data.get("mappings") or {}
        existing.update(self._new_mappings)
        data["mappings"] = dict(sorted(existing.items()))

        with open(self._yaml_path, "w", encoding="utf-8") as f:
            f.write(
                "# Team name mappings — auto-updated by TeamNameResolver\n"
                '# Format: "calendar_name": "training_name"\n\n'
            )
            yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

        print(
            f" 💾 Persisted {len(self._new_mappings)} new mapping(s) → {self._yaml_path}",
            file=sys.stderr,
        )

        # Mirror what was written, with keys stripped as _load_static_map does.
        for key, value in self._new_mappings.items():
            self._static_map[key.strip()] = value
        self._new_mappings = {}

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def resolve(
        self,
        calendar_names: list[str],
        training_names: list[str],
    ) -> dict[str, str]:
        """Build a full mapping from *calendar_names* to *training_names*.

        Each calendar name goes through the resolution priority chain
        (static → exact → fuzzy auto → interactive / error) and the result is
        returned as a dictionary. Any newly confirmed mappings are persisted
        to the YAML before this method returns. A calendar name that appears
        more than once is resolved only once.

        Args:
            calendar_names: Team names as they appear in the fixture/calendar
                CSV.
            training_names: Team names as they appear in the training dataset.

        Returns:
            Dict mapping every calendar name to its resolved training name.
            Skipped names are mapped to themselves (and are not persisted).

        Raises:
            UnresolvedTeamNameError: If ``interactive=False`` and a name
                cannot be auto-resolved above ``auto_threshold``.
        """
        # Lowercased training names -> first training name with that spelling,
        # built once instead of rescanning the list for every calendar name.
        exact_index: dict[str, str] = {}
        for training_name in training_names:
            exact_index.setdefault(training_name.strip().lower(), training_name)

        mapping: dict[str, str] = {}

        for name in calendar_names:
            # Already handled earlier in this call: do not prompt twice.
            if name in mapping:
                continue

            # 1. Static YAML lookup (exact or case-insensitive)
            static = self._check_static(name)
            if static is not None:
                mapping[name] = static
                continue

            # 2. Exact case-insensitive match in training names
            exact = exact_index.get(name.strip().lower())
            if exact is not None:
                mapping[name] = exact
                continue

            # 3. rapidfuzz fuzzy matching
            candidates = self._fuzzy_candidates(name, training_names)
            if not candidates:
                # Empty training set — keep original name
                mapping[name] = name
                continue

            best_match, best_score = candidates[0]

            # 3a. Auto-accept (high confidence)
            if best_score >= self._auto_threshold:
                print(
                    f" 🤖 Auto-matched: '{name}' → '{best_match}' (score {best_score:.0f})",
                    file=sys.stderr,
                )
                mapping[name] = best_match
                self._new_mappings[name] = best_match
                continue

            proposable = best_score >= self._confirm_threshold

            if not self._interactive:
                if proposable:
                    raise UnresolvedTeamNameError(
                        f"Team '{name}' matched '{best_match}' with score {best_score:.0f} "
                        f"(below auto_threshold={self._auto_threshold}). "
                        f"Add the mapping to '{self._yaml_path}' or run interactively.",
                        name,
                        candidates,
                    )
                raise UnresolvedTeamNameError(
                    f"Team '{name}' could not be resolved (best candidate: "
                    f"'{best_match}' at {best_score:.0f}/100, threshold: "
                    f"{self._confirm_threshold}). "
                    f"Add the mapping to '{self._yaml_path}' or run interactively.",
                    name,
                    candidates,
                )

            # 3b. Confirm threshold — propose the best match first
            if proposable and self._confirm_proposal(name, best_match, best_score):
                mapping[name] = best_match
                self._new_mappings[name] = best_match
                continue

            # 3c. Rejected proposal or low score — full manual selection
            resolved = self._interactive_confirm(name, candidates)
            mapping[name] = resolved
            # A skip returns the calendar name unchanged. Persisting that
            # identity mapping would silence the prompt (and the error in
            # non-interactive runs) on every later run.
            if resolved != name:
                self._new_mappings[name] = resolved

        self._persist()
        return mapping