Source code for footix.models.utils

from typing import Tuple, Union

import numpy as np
import pandas as pd
import scipy.stats as stats
import torch
from scipy.optimize import least_squares, root
from scipy.special import iv

import footix.utils.decorators as decorators
from footix.models.score_matrix import GoalMatrix
from footix.utils.typing import ProbaResult



[docs]
@decorators.verify_required_column(column_names=["home_team", "fthg"])
def compute_goals_home_vectors(
    data: pd.DataFrame, /, map_teams: dict, nbr_team: int
) -> tuple[np.ndarray, np.ndarray]:
    """Compute vectors representing home team goals.

    Args:
        data (pd.DataFrame): Input DataFrame with home team goals and HomeTeam column.
        map_teams (dict): Dictionary mapping team names to numerical IDs.
        nbr_team (int): Number of teams in the league.

    Returns:
        tuple[np.ndarray, np.ndarray]: A tuple containing two NumPy arrays:
            goals_vector representing home team goals and tau_home representing binary vectors
            for each home team.

    """
    goals_vector = data["fthg"].values
    tau_home = np.zeros((len(data), nbr_team))
    tau_home[np.arange(len(data)), [map_teams[team] for team in data["home_team"]]] = 1
    return goals_vector, tau_home




[docs]
@decorators.verify_required_column(column_names=["away_team", "ftag"])
def compute_goals_away_vectors(
    data: pd.DataFrame, /, map_teams: dict[str, int], nbr_team: int
) -> tuple[np.ndarray, np.ndarray]:
    """Compute vectors representing away team goals.

    Args:
        data (pd.DataFrame): Input DataFrame with away team goals and AwayTeam column.
        map_teams (dict): Dictionary mapping team names to numerical IDs.
        nbr_team (int): Number of teams in the league.

    Returns:
        tuple[np.ndarray, np.ndarray]: A tuple containing two NumPy arrays:
            goals_vector representing away team goals and tau_away representing binary vectors
            for each away team.

    """
    goals_vector = data["ftag"].values
    tau_away = np.zeros((len(data), nbr_team))
    tau_away[np.arange(len(data)), [map_teams[team] for team in data["away_team"]]] = 1
    return goals_vector, tau_away




[docs]
def to_torch_tensor(
    *arrays: np.ndarray, dtype: torch.dtype = torch.float32
) -> Union[torch.Tensor, Tuple[torch.Tensor, ...]]:
    """Convert numpy arrays to torch tensors.

    Args:
        *arrays: Variable number of numpy arrays to convert
        dtype: Target tensor dtype (default: torch.float32)

    Returns:
        Single tensor if one array is provided, tuple of tensors if multiple arrays
    Examples:
        >>> x = np.array([1, 2, 3])
        >>> tensor_x = to_tensor(x)

        >>> x = np.array([1, 2, 3])
        >>> y = np.array([4, 5, 6])
        >>> tensor_x, tensor_y = to_tensor(x, y)

    """
    tensors = tuple(torch.from_numpy(arr).type(dtype) for arr in arrays)
    return tensors[0] if len(tensors) == 1 else tensors




[docs]
def poisson_proba(lambda_param: float, k: int) -> np.ndarray:
    """Calculate the probability of achieving up to k goals given a lambda parameter.

    Args:
        lambda_param (float): The expected number of goals.
        k (int): The number of goals to achieve.

    Returns:
        np.ndarray:  An array containing the probabilities of achieving each possible
    number of goals from 0 to n_goals, inclusive.

    """
    poisson = stats.poisson(mu=lambda_param)
    k_list = np.arange(k)
    return poisson.pmf(k=k_list)  # type:ignore




[docs]
def implicit_intensities(
    proba_from_odds: np.ndarray, max_iter: int = 200, tol: float = 1e-10
) -> np.ndarray:
    """Calculate implicit scoring intensities from match outcome probabilities.

    This function converts betting odds probabilities into implied goal-scoring
    intensities (lambda parameters) for both teams using numerical optimization.
    It uses the Skellam distribution to model the difference between two Poisson
    processes (goal scoring by each team).

    Args:
        proba_from_odds (np.ndarray): Array of shape (n_matches, 3) containing
            probabilities for [win, draw, loss] derived from betting odds.
        max_iter (int, optional): Maximum number of iterations for the optimization
            algorithm. Defaults to 200.
        tol (float, optional): Tolerance for optimization convergence. Defaults to 1e-10.

    Raises:
        ValueError: If proba_from_odds does not have shape (n_matches, 3).

    Returns:
        np.ndarray: Array of shape (n_matches, 2) containing the implied scoring
            intensities [lambda1, lambda2] for each match, where lambda1 is the
            home team's scoring intensity and lambda2 is the away team's.

    Note:
        If the primary optimization fails, the function falls back to a grid search
        over predefined lambda values to find the best approximation.

    """
    proba_from_odds = np.asarray(proba_from_odds, dtype=float)
    eps = 1e-12
    if proba_from_odds.ndim != 2 or proba_from_odds.shape[1] != 3:
        raise ValueError("`pi` doit avoir la forme (n_matches, 3).")

    p = np.clip(proba_from_odds, eps, 1 - eps)
    row_sums = p.sum(axis=1, keepdims=True)
    p /= row_sums

    results = np.empty((p.shape[0], 2), dtype=float)
    lg = np.logspace(-2, 2, 50)  # maillage pour le fallback

    for i, (p_w, p_d, p_l) in enumerate(p):
        target = np.array([p_w + p_d, p_l])

        mu_diff = p_w - p_l
        lam0 = max(0.2, 1.0 + mu_diff)
        lam1 = max(0.2, 1.0 - mu_diff)
        x0 = np.array([lam0, lam1])

        def residual(t):
            lam1, lam2 = t
            p_wd = 1 - stats.skellam.cdf(-1, lam1, lam2)  # P(Y1 ≥ Y2)
            p_l = stats.skellam.cdf(-1, lam1, lam2)  # P(Y1 <  Y2)
            return (np.array([p_wd, p_l]) - target) / np.sqrt(target * (1 - target))

        sol = least_squares(
            residual,
            x0,
            bounds=(1e-6, np.inf),
            xtol=tol,
            ftol=tol,
            gtol=tol,
            max_nfev=max_iter,
        )

        if sol.success and np.all(sol.x > 0):
            results[i] = sol.x
            continue
        best_err, best_t = np.inf, x0
        for t1 in lg:
            for t2 in lg:
                err = np.sum(residual([t1, t2]) ** 2)
                if err < best_err:
                    best_err, best_t = err, (t1, t2)  # type: ignore
        results[i] = best_t

    return results



def _p0(lamda_1: float, lamda_2: float) -> float:
    return np.exp(-(lamda_1 + lamda_2)) * iv(0, 2 * np.sqrt(lamda_1 * lamda_2))


def _p_pos(lamda_1: float, lamda_2: float, K: int = 40) -> float:
    k = np.arange(1, K + 1)
    return np.sum(
        np.exp(-(lamda_1 + lamda_2))
        * ((lamda_1 / lamda_2) ** (k / 2) * iv(k, 2 * np.sqrt(lamda_1 * lamda_2)))
    )



[docs]
def implied_poisson_goals(
    bookmaker_proba: ProbaResult, *, k_sum: int = 40, nbr_goals: int = 10
) -> GoalMatrix:
    """Calculate implied Poisson goal distributions from bookmaker probabilities.

    This function uses a system of equations to find the Poisson parameters (lambda)
    that best match the observed probabilities from bookmakers. It solves for the
    scoring rates of both teams using modified Bessel functions of the first kind.

    Args:
        bookmaker_proba: Probabilities from bookmaker (draw, home win, away win)
        k_sum: Maximum number of goals to consider in summation (default: 40)
        nbr_goals: Number of goals to generate probabilities for (default: 10)

    Returns:
        GoalMatrix containing probability distributions for home and away goals

    Raises:
        ArithmeticError: If the numerical solver fails to converge

    """
    proba_draw = bookmaker_proba[1]
    proba_home_win = bookmaker_proba[0]

    def system(params: np.ndarray, p_0_obs: float, p_pos_obs: float) -> list[float]:
        """System of equations to solve for Poisson parameters.

        Args:
            params: Log of lambda parameters [log(λ1), log(λ2)]
            p_0_obs: Observed probability of draw
            p_pos_obs: Observed probability of home win

        Returns:
            Differences between model and observed probabilities

        """
        l1, l2 = np.exp(params)
        p_0_model = _p0(l1, l2)  # Probability of draw
        p_pos_model = _p_pos(l1, l2, K=k_sum)  # Probability of home win
        return [p_0_model - p_0_obs, p_pos_model - p_pos_obs]

    # Initial guess for lambda parameters (log scale)
    initial_guess = [np.log(1.2), np.log(0.9)]

    # Solve system of equations
    sol = root(system, x0=initial_guess, args=(proba_draw, proba_home_win))

    if not sol.success:
        raise ArithmeticError("Numerical solver failed to converge")

    # Convert solution back from log scale
    lamda_1, lamda_2 = np.exp(sol.x)

    return GoalMatrix(
        home_goals_probs=poisson_proba(lambda_param=lamda_1, k=nbr_goals),
        away_goals_probs=poisson_proba(lambda_param=lamda_2, k=nbr_goals),
    )