Source code for footix.data_io.utils_scrapper

# Mapping of the different competitions to their respective slugs
import re
from typing import Any

import pandas as pd

from footix.utils.decorators import verify_required_column

MAPPING_COMPETITIONS: dict[str, dict[str, Any]] = {
    "FRA Ligue 1": {"footballdata": {"slug": "F1"}, "understat": {"slug": "Ligue_1"}},
    "FRA Ligue 2": {"footballdata": {"slug": "F2"}},
    "ENG Premier League": {"footballdata": {"slug": "E0"}, "understat": {"slug": "EPL"}},
    "ENG Championship": {"footballdata": {"slug": "E1"}},
    "DEU Bundesliga 1": {"footballdata": {"slug": "D1"}, "understat": {"slug": "Bundesliga"}},
    "DEU Bundesliga 2": {"footballdata": {"slug": "D2"}},
    "ITA Serie A": {"footballdata": {"slug": "I1"}, "understat": {"slug": "Serie_A"}},
    "ITA Serie B": {"footballdata": {"slug": "I2"}},
    "SPA La Liga": {"footballdata": {"slug": "SP1"}, "understat": {"slug": "La_Liga"}},
    "SPA La Liga 2": {"footballdata": {"slug": "SP2"}},
}


[docs] def check_competition_exists(competition: str) -> bool: """Check if the competition exists in the MAPPING_COMPETITIONS dictionary. Args: competition (str): The name of the competition to check. Returns: bool: True if the competition exists, False otherwise. """ return competition in MAPPING_COMPETITIONS
[docs] def process_string(input_string): lower_string = input_string.lower() no_space_string = lower_string.replace(" ", "") return no_space_string
[docs] def to_snake_case(name: str) -> str: """Convert the string name into a snake case string. Shamelessly copied from: https://stackoverflow.com/questions/1175208/ elegant-python-function-to-convert-camelcase-to-snake-case Args: name (str): the name to convert Returns: str: the name in snake case """ name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name) name = re.sub("__([A-Z])", r"_\1", name) name = re.sub("([a-z0-9])([A-Z])", r"\1_\2", name) return name.lower()
[docs] @verify_required_column(["home_team", "away_team", "date"]) def add_match_id(df: pd.DataFrame) -> pd.DataFrame: """Add a stable `match_id` column in the form "Home - Away - YYYY-MM-DD". This normalizes the date formatting so match ids are consistent across scrapers that use different date string formats. """ tmp_df = df.copy() # Ensure date is datetime-like for a stable formatting if not pd.api.types.is_datetime64_any_dtype(tmp_df["date"]): tmp_df["date"] = pd.to_datetime(tmp_df["date"], dayfirst=True) tmp_df["match_id"] = ( tmp_df["home_team"] + " - " + tmp_df["away_team"] + " - " + tmp_df["date"].dt.strftime("%Y-%m-%d") ) return tmp_df
[docs] def canonicalize_matches_df( df: pd.DataFrame, *, require_columns: list[str] | None = None ) -> pd.DataFrame: """Canonicalize a match dataframe. Ensures date parsing, required columns present, sorts by date and adds a stable `match_id`. Args: df: Input dataframe with match rows. require_columns: List of columns that must be present (defaults to minimal match columns). Returns: The canonicalized dataframe. """ cols_required = require_columns or ["date", "home_team", "away_team", "fthg", "ftag"] missing = [c for c in cols_required if c not in df.columns] if missing: raise ValueError(f"Missing required columns for canonicalization: {missing}") tmp = df.copy() # Parse dates with dayfirst=True to be consistent with existing readers tmp["date"] = pd.to_datetime(tmp["date"], dayfirst=True) # Ensure minimal dtypes tmp["home_team"] = tmp["home_team"].astype(str) tmp["away_team"] = tmp["away_team"].astype(str) # Add stable match_id and sort tmp = add_match_id(tmp) tmp = tmp.sort_values(by="date", ascending=True).reset_index(drop=True) return tmp