Source code for footix.data_io.understat

import json
import re
from datetime import datetime
from functools import lru_cache
from typing import Any
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from lxml import html

import footix.data_io.utils_scrapper as utils_scrapper
from footix.data_io.base_scrapper import Scraper



[docs]
class ShotDataNotFound(RuntimeError):
    """Signal that a page does not contain the expected ``shotsData`` <script> block."""




[docs]
class FixtureDataNotFound(RuntimeError):
    """Signal that the expected fixture data are missing."""




[docs]
class ScrapUnderstat(Scraper):
    """Scraper for downloading and processing football match data from understat.com.
    This class is heavily inspired by / copied from its counterpart in penaltyblog:
    https://github.com/martineastwood/penaltyblog

    This class retrieves, parses, and processes football match data for a given competition
    and season from Understat. It extracts fixture details, expected goals (xG), forecasts,
    and normalizes team names. The data is returned as a processed pandas DataFrame.

    Args:
        competition (str): The competition code (e.g., 'EPL' for Premier League).
        season (str): The season string (e.g., '2020/2021', '2020-2021', or '2021').
        path (str): Directory path for any required file operations.
        force_reload (bool, optional): If True, forces re-download or reprocessing of data.
        mapping_teams (dict[str, str] | None, optional): Optional mapping for team name
        normalization.

    Attributes:
        base_url (str): Base URL for understat.com.
        scraper_name (str): Name identifier for the scraper.
        season (str): Processed season string.
        force_reload (bool): Whether to force data reload.
        slug (str): Slug for the competition used in URL construction.

    Methods:
        sanitize_columns(df): Converts DataFrame columns to snake_case.
        get_fixtures() -> pd.DataFrame: Downloads, parses, and returns processed match data.
        get_shots(understat_id: str) -> pd.DataFrame: Downloads and returns the shots of a match.
        _process_season(season: str) -> str: Processes the season string for URL usage.

    """

    base_url: str = "https://understat.com/"
    scraper_name = "understat"

    def __init__(
        self,
        competition: str,
        season: str,
        path: str,
        force_reload: bool = False,
        mapping_teams: dict[str, str] | None = None,
    ):
        """Check the competition and store the scraper configuration on the instance.

        Args:
            competition (str): Competition code; passed to ``_check_competitions`` and used
                as key into ``utils_scrapper.MAPPING_COMPETITIONS``.
            season (str): Season string; stored after going through ``_process_season``.
            path (str): Forwarded to the parent initializer.
            force_reload (bool, optional): Stored as ``self.force_reload``.
            mapping_teams (dict[str, str] | None, optional): Forwarded to the parent
                initializer.
        """
        # The competition check runs first, before the parent initializer.
        self._check_competitions(competition_name=competition)
        super().__init__(path=path, mapping_teams=mapping_teams)

        self.season = self._process_season(season)
        self.force_reload = force_reload

        # Understat-specific settings of the competition; only the URL slug is kept.
        understat_settings = utils_scrapper.MAPPING_COMPETITIONS[competition]["understat"]
        self.slug = understat_settings["slug"]
        self.competition = competition


[docs]
    @staticmethod
    def sanitize_columns(df: pd.DataFrame):
        """Rename the columns of ``df`` in place with ``utils_scrapper.to_snake_case``.

        Args:
            df (pd.DataFrame): Frame whose column labels are rewritten. The frame itself is
                modified; nothing is returned.
        """
        df.columns = list(map(utils_scrapper.to_snake_case, df.columns))



[docs]
    @lru_cache(maxsize=256)
    def get_fixtures(self) -> pd.DataFrame:
        """Download and process the played fixtures of the configured league season.

        Queries the ``getLeagueData/<slug>/<season>`` endpoint, which requires the
        ``X-Requested-With`` and ``Referer`` headers, keeps the entries whose ``isResult``
        flag is set and turns them into a DataFrame.

        Returns:
            pd.DataFrame: One row per played match with the columns ``understat_id``,
            ``datetime``, ``home_team``, ``away_team``, ``fthg``, ``ftag``, ``fthxg``,
            ``ftaxg``, ``forecast_w``, ``forecast_d``, ``forecast_l``, ``ftr`` (``"H"``,
            ``"D"`` or ``"A"``) and ``date`` (``dd/mm/YYYY``), after being passed through
            ``utils_scrapper.add_match_id``.

        Raises:
            requests.HTTPError: If the API answers with an HTTP error status.
            FixtureDataNotFound: If the response is not valid JSON, contains no ``dates``
                entries, or none of the entries is a played match.
        """
        # NOTE(review): ``lru_cache`` on a method keys on ``self``: the cache keeps the
        # instance referenced and repeated calls return the very same DataFrame object, so
        # callers should not mutate the result in place.

        # Use the API endpoint which requires specific headers
        url = urljoin(self.base_url, f"getLeagueData/{self.slug}/{self.season}")

        # The API requires X-Requested-With and Referer headers
        headers = {
            "X-Requested-With": "XMLHttpRequest",
            "Referer": urljoin(self.base_url, f"league/{self.slug}/{self.season}"),
        }

        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # Depending on the requests version and on whether simplejson is installed, the
        # decoding error is not always a ``json.JSONDecodeError``; every variant derives
        # from ``ValueError``.
        try:
            data = response.json()
        except ValueError as exc:
            raise FixtureDataNotFound(f"Invalid JSON response: {exc}") from exc

        # The response is expected to be a dict with 'dates', 'teams', 'players' keys
        events = data.get("dates") if isinstance(data, dict) else None
        if not events:
            raise FixtureDataNotFound("No dates data found in API response")

        fixtures: list[dict[str, Any]] = []
        for event in events:
            # Only matches flagged as results carry the values read below.
            if not event["isResult"]:
                continue

            fixtures.append(
                {
                    "understat_id": str(event["id"]),
                    "datetime": event["datetime"],
                    "home_team": event["h"]["title"],
                    "away_team": event["a"]["title"],
                    "fthg": int(event["goals"]["h"]),
                    "ftag": int(event["goals"]["a"]),
                    "fthxg": float(event["xG"]["h"]),
                    "ftaxg": float(event["xG"]["a"]),
                    "forecast_w": float(event["forecast"]["w"]),
                    "forecast_d": float(event["forecast"]["d"]),
                    "forecast_l": float(event["forecast"]["l"]),
                }
            )

        # An empty frame has none of the columns the steps below rely on, so fail with
        # the domain error instead of an unrelated KeyError.
        if not fixtures:
            raise FixtureDataNotFound("No played fixtures found in API response")

        df = (
            pd.DataFrame(fixtures)
            .pipe(self.replace_name_team, columns=["home_team", "away_team"])
            .sort_index()
        )

        def _get_date(date: str) -> str:
            """Convert ``YYYY-mm-dd HH:MM:SS`` into ``dd/mm/YYYY``."""
            dt = datetime.strptime(date, "%Y-%m-%d %H:%M:%S")
            return dt.strftime("%d/%m/%Y")

        def get_ftr(row) -> str:
            """Return the full-time result of a row: home win, draw or away win."""
            fthg = row["fthg"]
            ftag = row["ftag"]
            if fthg > ftag:
                return "H"
            if fthg == ftag:
                return "D"
            return "A"

        self.sanitize_columns(df)
        df["ftr"] = df.apply(get_ftr, axis=1)
        df["date"] = df["datetime"].apply(_get_date)
        df = utils_scrapper.add_match_id(df)
        return df


    def _process_season(self, season: str) -> str:
        clean_season = season.replace(" ", "-").replace("/", "-").split("-")
        return clean_season[0]


[docs]
    @lru_cache(maxsize=256)
    def get_shots(self, understat_id: str) -> pd.DataFrame:
        """Download and process the shot data of a single match.

        Fetches the ``match/<understat_id>`` page, looks for the inline script holding
        ``var shotsData = JSON.parse('...')``, decodes the embedded JSON and builds a
        DataFrame from the ``h`` and ``a`` entries of that payload.

        Args:
            understat_id (str): Match identifier used in the match page URL.

        Returns:
            pd.DataFrame: One row per entry of the ``h`` and ``a`` lists, with the
            ``season`` and ``competition`` of this scraper added, ``date`` formatted as
            ``dd-mm-YYYY`` and ``h_a`` replaced by the name of the corresponding team,
            after being passed through ``utils_scrapper.add_match_id``.

        Raises:
            ShotDataNotFound: If no script of the page contains a parsable ``shotsData``
                assignment.
        """
        # NOTE(review): ``lru_cache`` on a method keys on ``self``: the cache keeps the
        # instance referenced and repeated calls return the very same DataFrame object, so
        # callers should not mutate the result in place.
        url = urljoin(self.base_url, f"match/{understat_id}")
        content = self.get(url)
        tree = html.fromstring(content)
        events = None

        for node in tree.cssselect("script"):
            # ``text`` is None for scripts without inline content (e.g. ``<script src=...>``).
            script = node.text
            if not script or "shotsData" not in script:
                continue

            # Collapse whitespace, then resolve the backslash escapes of the JS string.
            script = " ".join(script.split())
            script = str(script.encode(), "unicode-escape")

            # ``search`` rather than ``match``: the assignment need not be the first
            # statement of the script.
            found = re.search(r"var shotsData = JSON\.parse\('(?P<json>.*?)'\)", script)
            if found is None:
                continue

            events = json.loads(found.group("json"))
            break

        if events is None:
            raise ShotDataNotFound(f"No shotsData script block found at {url}")

        shots = [*events["h"], *events["a"]]

        col_renames = {
            "h_team": "home_team",
            "a_team": "away_team",
            "h_goals": "goals_home",
            "a_goals": "goals_away",
            "match_id": "understat_id",
        }

        # NOTE(review): the date is formatted with dashes here while ``get_fixtures`` uses
        # slashes (``%d/%m/%Y``) - confirm this difference is intended.
        df = (
            pd.DataFrame(shots)
            .rename(columns=col_renames)
            .assign(season=self.season)
            .assign(competition=self.competition)
            .assign(date=lambda x: pd.to_datetime(x["date"]).dt.strftime("%d-%m-%Y"))
            .pipe(self.replace_name_team, columns=["home_team", "away_team"])
            .sort_index()
        )
        # Replace the "h"/"a" marker by the name of the team on that side.
        df["h_a"] = np.where(df["h_a"] == "h", df["home_team"], df["away_team"])
        df = utils_scrapper.add_match_id(df)
        self.sanitize_columns(df)
        return df