Source code for footix.data_io.footballdata

"""Module for scraping and processing footballdata.co.uk data.

This module contains the `ScrapFootballData` class, which is responsible for
downloading, storing, and preprocessing football match data from football-data.co.uk.
It includes methods for data sanitization, team name mapping, and fixture retrieval.

Classes:
    ScrapFootballData: Handles the scraping and processing of football match data.

Functions:
    _process_season(season: str) -> str: Processes a season string into a standardized format.

"""

import io

import pandas as pd

import footix.data_io.utils_scrapper as utils_scrapper
from footix.data_io.base_scrapper import Scraper


class ScrapFootballData(Scraper):
    """Scraper for downloading and processing football match data from football-data.co.uk.

    This class handles the retrieval, local storage, and preprocessing of football
    match data for a given competition and season. It supports automatic downloading,
    file management, column sanitization, and team name mapping.

    Args:
        competition (str): The competition code (e.g., 'E0' for Premier League).
        season (str): The season string (e.g., '2020/2021', '2020-2021', or '2021').
        path (str): Directory path to store the downloaded CSV files.
        force_reload (bool, optional): If True, forces re-download of data even if
            file exists.
        mapping_teams (dict[str, str] | None, optional): Optional mapping for team
            name normalization.

    Attributes:
        base_url (str): Base URL for football-data.co.uk.
        scraper_name (str): Name identifier for the scraper.
        competition (str): Competition code.
        season (str): Processed season string.
        path (Path): Path object for data storage.
        force_reload (bool): Whether to force data reload.
        infered_url (str): Constructed URL for the CSV file.
        df (pd.DataFrame): Loaded and processed match data.

    Methods:
        download(): Downloads and saves the competition data as a CSV file.
        load() -> pd.DataFrame: Loads the data from file or downloads if not present.
        sanitize_columns(): Converts DataFrame columns to snake_case.
        get_fixtures() -> pd.DataFrame: Returns the processed match data.

    """

    base_url: str = "https://www.football-data.co.uk/mmz4281/"
    scraper_name = "footballdata"

    def __init__(
        self,
        competition: str,
        season: str,
        path: str,
        force_reload: bool = False,
        mapping_teams: dict[str, str] | None = None,
    ) -> None:
        """Initialize the ScrapFootballData instance.

        Construction is eager: the data is loaded (and downloaded when needed),
        its columns are converted to snake_case and a match id is added before
        ``__init__`` returns.

        Args:
            competition (str): The competition code. The mapping of competition names
                to their respective codes is defined in
                `utils_scrapper.MAPPING_COMPETITIONS`.
            season (str): The season string (e.g., '2020/2021', '2020-2021', or '2021').
            path (str): Directory path to store the downloaded CSV files.
            force_reload (bool, optional): If True, forces re-download of data even if
                the file exists. Defaults to False.
            mapping_teams (dict[str, str] | None, optional): Optional mapping for team
                name normalization. Defaults to None.

        Raises:
            ValueError: If the competition is invalid or the season string is not in a
                valid format.

        """
        super().__init__(path=path, mapping_teams=mapping_teams)
        self._check_competitions(competition_name=competition)
        self.competition = competition
        slug = utils_scrapper.MAPPING_COMPETITIONS[self.competition]["footballdata"]["slug"]
        self.season = _process_season(season)
        self.path = self.manage_path(path)
        self.force_reload = force_reload
        self.infered_url = self.base_url + self.season + "/" + slug + ".csv"
        # Order matters: load() needs path/season/competition, and the column
        # clean-up must happen before the match id is added.
        self.df = self.load()
        self.sanitize_columns()
        self.df = utils_scrapper.add_match_id(self.df)

    @property
    def _csv_path(self):
        """Location of the local CSV: ``<path>/<competition>_<season>.csv``.

        Single source of truth for the cache file name, shared by download(),
        load() and _check_if_file_exist().
        """
        return self.path / f"{self.competition}_{self.season}.csv"

    def download(self) -> None:
        """Download the competition data and save it as a CSV file."""
        response = self.get(self.infered_url)
        # NOTE(review): the team-name mapping is applied to "home_team"/"away_team"
        # here, but columns are only converted to snake_case later, in
        # sanitize_columns(). Confirm that replace_name_team copes with the raw
        # column names delivered by the remote CSV.
        df = (
            pd.read_csv(io.StringIO(response), encoding="utf-8")
            .sort_index()
            .pipe(self.replace_name_team, columns=["home_team", "away_team"])
        )
        df.to_csv(self._csv_path, index=False, encoding="utf-8")

    def load(self) -> pd.DataFrame:
        """Load the CSV for the configured competition and season into a DataFrame.

        If a file named "{competition}_{season}.csv" exists under self.path and
        self.force_reload is False, it is read directly. Otherwise self.download()
        is invoked first to (re)create the CSV, which is then read.

        Returns:
            pd.DataFrame: The loaded dataset.

        Raises:
            FileNotFoundError: If the expected CSV is not found after attempting
                download.
            pandas.errors.EmptyDataError, pandas.errors.ParserError, OSError:
                Propagated from pandas.read_csv or filesystem operations.

        Notes:
            Relies on the instance attributes self.path, self.competition,
            self.season and self.force_reload. This method may have the side
            effect of calling self.download().

        """
        if self.force_reload or not self._check_if_file_exist():
            self.download()
        return pd.read_csv(self._csv_path)

    def sanitize_columns(self) -> None:
        """Convert DataFrame columns to snake_case (modifies self.df in place)."""
        self.df.columns = [utils_scrapper.to_snake_case(x) for x in self.df.columns]

    def get_fixtures(self) -> pd.DataFrame:
        """Return the processed match data DataFrame.

        Returns:
            pd.DataFrame: The DataFrame containing match data.

        """
        return self.df

    def _check_if_file_exist(self) -> bool:
        """Return True when the local CSV for this competition/season exists."""
        return self._csv_path.is_file()
def _process_season(season: str) -> str: """Process a season string to extract a standardized format. Args: season (str): A string representing a football season in the format 'YYYY/YYYY' or 'YYYY-YYYY'. For example: '2020/2021' or '2020-2021' Raises: ValueError: if the season string cannot be split into exactly two years. Returns: str: the formatted season string. """ if len(season) == 4 and season.isdigit(): if int(season[-2:]) - int(season[:2]) != 1: raise ValueError("Years must be consecutive") return season # Remove any whitespace and split on common separators clean_season = season.replace(" ", "-").replace("/", "-").split("-") # Extract last 2 digits from each year year1 = clean_season[0][-2:] year2 = clean_season[-1][-2:] # Check if the years are consecutive if int(year2) - int(year1) != 1: raise ValueError("Years must be consecutive") return year1 + year2