Source code for footix.data_io.understat

import json
import re
from datetime import datetime
from functools import lru_cache
from typing import Any
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from lxml import html

import footix.data_io.utils_scrapper as utils_scrapper
from footix.data_io.base_scrapper import Scraper


class ShotDataNotFound(RuntimeError):
    """Signal that a page did not contain the expected ``shotsData`` <script> block."""
class FixtureDataNotFound(RuntimeError):
    """Signal that no fixture data could be obtained."""
class ScrapUnderstat(Scraper):
    """Scraper for downloading and processing football match data from understat.com.

    This class is heavily inspired by (and partly copied from) its counterpart in
    penaltyblog: https://github.com/martineastwood/penaltyblog

    It retrieves, parses, and processes football match data for a given competition
    and season from Understat. It extracts fixture details, expected goals (xG) and
    forecasts, and normalizes team names. The data is returned as a processed pandas
    DataFrame.

    Args:
        competition (str): The competition code (e.g., 'EPL' for Premier League).
        season (str): The season string (e.g., '2020/2021', '2020-2021', or '2021').
            Only the first component (the starting year) is kept.
        path (str): Directory path for any required file operations.
        force_reload (bool, optional): If True, forces re-download or reprocessing of data.
        mapping_teams (dict[str, str] | None, optional): Optional mapping for team name
            normalization.

    Attributes:
        base_url (str): Base URL for understat.com.
        scraper_name (str): Name identifier for the scraper.
        season (str): Processed season string.
        force_reload (bool): Whether to force data reload.
        slug (str): Slug for the competition used in URL construction.
        competition (str): The competition code passed at construction time.

    Methods:
        sanitize_columns(df): Converts DataFrame columns to snake_case (in place).
        get_fixtures() -> pd.DataFrame: Downloads, parses, and returns processed match data.
        get_shots(understat_id) -> pd.DataFrame: Downloads and returns the shots of one match.
        _process_season(season: str) -> str: Processes the season string for URL usage.
    """

    base_url: str = "https://understat.com/"
    scraper_name = "understat"

    # Matches the inline assignment that carries the shots payload on a match page.
    # Compiled once at class level instead of on every ``get_shots`` call.
    _SHOTS_DATA_RE = re.compile(r"var shotsData = JSON\.parse\('(?P<json>.*?)'\)")

    def __init__(
        self,
        competition: str,
        season: str,
        path: str,
        force_reload: bool = False,
        mapping_teams: dict[str, str] | None = None,
    ):
        self._check_competitions(competition_name=competition)
        super().__init__(path=path, mapping_teams=mapping_teams)
        self.season = self._process_season(season)
        self.force_reload = force_reload
        self.slug = utils_scrapper.MAPPING_COMPETITIONS[competition]["understat"]["slug"]
        self.competition = competition

    @staticmethod
    def sanitize_columns(df: pd.DataFrame) -> None:
        """Rename the columns of ``df`` to snake_case, modifying ``df`` in place."""
        df.columns = [utils_scrapper.to_snake_case(x) for x in df.columns]

    @staticmethod
    def _parse_fixture(event: dict[str, Any]) -> dict[str, Any]:
        """Flatten one entry of the API's ``dates`` list into a fixture record."""
        return {
            "understat_id": str(event["id"]),
            "datetime": event["datetime"],
            "home_team": event["h"]["title"],
            "away_team": event["a"]["title"],
            "fthg": int(event["goals"]["h"]),
            "ftag": int(event["goals"]["a"]),
            "fthxg": float(event["xG"]["h"]),
            "ftaxg": float(event["xG"]["a"]),
            "forecast_w": float(event["forecast"]["w"]),
            "forecast_d": float(event["forecast"]["d"]),
            "forecast_l": float(event["forecast"]["l"]),
        }

    # NOTE(review): ``lru_cache`` on an instance method keys on ``self`` and keeps every
    # instance alive for the lifetime of the cache; callers also receive the *same*
    # DataFrame object on repeated calls. Kept as-is so ``cache_clear``/``cache_info``
    # remain available to existing callers.
    @lru_cache(maxsize=256)
    def get_fixtures(self) -> pd.DataFrame:
        """Downloads and processes match fixtures using Understat's API.

        Uses the /getLeagueData/ API endpoint which requires specific headers. Only
        fixtures flagged ``isResult`` (already played) are kept.

        Returns:
            pd.DataFrame: Processed fixtures with match details, xG, and forecasts,
            plus a full-time result column ``ftr`` ('H', 'D' or 'A') and a ``date``
            column formatted as ``dd/mm/YYYY``.

        Raises:
            requests.HTTPError: If the API answers with an error status code.
            FixtureDataNotFound: If the response is not valid JSON, contains no
                ``dates`` data, or contains no played fixture.
        """
        url = urljoin(self.base_url, f"getLeagueData/{self.slug}/{self.season}")
        # The API requires X-Requested-With and Referer headers
        headers = {
            "X-Requested-With": "XMLHttpRequest",
            "Referer": urljoin(self.base_url, f"league/{self.slug}/{self.season}"),
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        try:
            data = response.json()
        except ValueError as exc:
            # ValueError is the common base of json.JSONDecodeError and of the
            # simplejson-based error requests raises when simplejson is installed.
            raise FixtureDataNotFound(f"Invalid JSON response: {exc}") from exc

        # The response is expected to be a dict with 'dates', 'teams', 'players' keys
        events = data.get("dates", []) if isinstance(data, dict) else None
        if not events:
            raise FixtureDataNotFound("No dates data found in API response")

        played = [self._parse_fixture(e) for e in events if e["isResult"]]
        if not played:
            # Without any row the frame would have no columns and the processing
            # below would fail with an unrelated error; report the real cause instead.
            raise FixtureDataNotFound("No played fixtures found in API response")

        df = (
            pd.DataFrame(played)
            .pipe(self.replace_name_team, columns=["home_team", "away_team"])
            .sort_index()
        )
        self.sanitize_columns(df)

        # Full-time result: home win, draw, otherwise away win.
        df["ftr"] = np.select(
            [df["fthg"] > df["ftag"], df["fthg"] == df["ftag"]],
            ["H", "D"],
            default="A",
        )
        df["date"] = pd.to_datetime(df["datetime"], format="%Y-%m-%d %H:%M:%S").dt.strftime(
            "%d/%m/%Y"
        )
        df = utils_scrapper.add_match_id(df)
        return df

    def _process_season(self, season: str) -> str:
        """Return the first component of a season string.

        Spaces, slashes and dashes are all treated as separators, so
        '2020/2021', '2020-2021' and '2020 2021' all give '2020', and '2021'
        gives '2021'. Surrounding whitespace is ignored.
        """
        # Strip first: a leading separator would otherwise yield an empty season.
        return re.split(r"[ /-]", season.strip(), maxsplit=1)[0]

    # NOTE(review): same ``lru_cache``-on-method caveat as ``get_fixtures``.
    @lru_cache(maxsize=256)
    def get_shots(self, understat_id: str) -> pd.DataFrame:
        """Download and process the shots of a single match.

        The match page is fetched with ``self.get`` and the JSON payload embedded in
        the ``shotsData`` <script> block is decoded.

        Args:
            understat_id (str): Understat identifier of the match.

        Returns:
            pd.DataFrame: One row per shot (home shots first, then away shots), with
            ``season`` and ``competition`` columns added, ``date`` formatted as
            ``dd-mm-YYYY`` and ``h_a`` replaced by the name of the shooting team.

        Raises:
            ShotDataNotFound: If no parsable ``shotsData`` block is present on the page.
        """
        url = urljoin(self.base_url, f"match/{understat_id}")
        content = self.get(url)
        tree = html.fromstring(content)

        events = None
        for node in tree.cssselect("script"):
            text = node.text
            # External scripts (<script src=...>) have no inline text: lxml gives None.
            if not text or "shotsData" not in text:
                continue
            collapsed = " ".join(text.split())
            # The payload is written with \xNN escapes; decode them to plain characters.
            decoded = str(collapsed.encode(), "unicode-escape")
            found = self._SHOTS_DATA_RE.match(decoded)
            if found is not None:
                events = json.loads(found.group("json"))
                break
        if events is None:
            raise ShotDataNotFound(f"No shotsData script block found at {url}")

        shots = [*events["h"], *events["a"]]
        col_renames = {
            "h_team": "home_team",
            "a_team": "away_team",
            "h_goals": "goals_home",
            "a_goals": "goals_away",
            "match_id": "understat_id",
        }
        # NOTE(review): the date is formatted with dashes here but with slashes in
        # get_fixtures — confirm this is intended, in particular for add_match_id.
        df = (
            pd.DataFrame(shots)
            .rename(columns=col_renames)
            .assign(season=self.season)
            .assign(competition=self.competition)
            .assign(date=lambda x: pd.to_datetime(x["date"]).dt.strftime("%d-%m-%Y"))
            .pipe(self.replace_name_team, columns=["home_team", "away_team"])
            .sort_index()
        )
        # Replace the 'h'/'a' flag by the name of the team that took the shot.
        df["h_a"] = np.where(df["h_a"] == "h", df["home_team"], df["away_team"])
        df = utils_scrapper.add_match_id(df)
        self.sanitize_columns(df)
        return df