# Source code for better_dict.core

"""
Module defines the :class:`.BetterDict` class.

The :class:`.BetterDict` class is a subclass of the :class:`dict` class that
provides additional functionality to the dictionary. The additional
functionality includes:

- Accessing the dictionary keys by value.
- Manipulating the dictionary keys and values using index notation.
- Accessing and manipulating the dictionary values using dot notation.
- Accessing the dictionary values by their data types.
- Saving and loading the dictionary to and from a file.
- Creating the dictionary from a :class:`pandas.DataFrame` object.
- Creating the dictionary from a :class:`numpy.ndarray` object.
- Creating the dictionary from a :class:`numpy.matrix` object.
- Creating the dictionary from a :class:`pandas.Series` object.
- Applying a function to the dictionary values and keys.
- Finding keys using fuzzy matching.
- Renaming the dictionary keys.

"""
from __future__ import annotations

from abc import abstractmethod
from difflib import get_close_matches
from typing import Any, Hashable, Iterable, List, Tuple

import numpy as np
import pandas as pd

from better_dict.accessors import ILoc, VLoc
from better_dict.io import JoblibIOMixin, PicklerMixin
from better_dict.utils import (flatten, iterable_not_string, make_list,
                               same_length)


def translate_dtype(dtype: str) -> type | Tuple[type, ...]:
    """Translate the ``dtype`` string to the corresponding Python type.

    Recognized aliases are ``"number"``, ``"string"``, ``"datetime"``,
    ``"pandas"`` and ``"numpy"``.

    Parameters
    ----------
    dtype : str
        Data type represented as string to translate.

    Returns
    -------
    type | Tuple[type, ...]
        Type or tuple of types corresponding to the text data types. The
        result is suitable as the second argument of :func:`isinstance`.

    Raises
    ------
    ValueError
        If the data type string is not recognized.
    """
    aliases = {
        "number": (float, int),
        "string": str,
        "datetime": pd.Timestamp,
        "pandas": (pd.DataFrame, pd.Series),
        "numpy": (np.ndarray, np.matrix),
    }
    try:
        return aliases[dtype]
    except (KeyError, TypeError):
        # TypeError covers unhashable input, which is just as invalid as an
        # unknown alias; report both the same way.
        raise ValueError(f"Invalid dtype: {dtype!r}") from None


class ApplyMixin:
    """
    Mixin class that adds the apply method to the ``BetterDict`` class.

    The ``apply`` method enables operations on the values or keys of the
    dictionary. The host class must provide ``rename``, ``keys``, ``values``
    and ``__setitem__``.

    """

    @abstractmethod
    def rename(self, keys: dict) -> None:
        """Rename the keys of the dictionary."""

    @abstractmethod
    def keys(self):
        """Get the keys of the dictionary."""

    @abstractmethod
    def values(self):
        """Get the values of the dictionary."""

    @abstractmethod
    def __setitem__(self, key, value):
        """Set the value of the dictionary at the given key."""

    def apply(self, func, *args, axis=1, **kwargs):
        """Apply a function to the ``BetterDict`` keys or values.

        Parameters
        ----------
        func : callable
            Function to apply.
        *args
            Positional arguments to pass to the function.
        axis : int, default 1
            Axis along which to apply the function. If ``axis=1``, the
            function is applied to the values of the dictionary. Any other
            value (conventionally ``axis=0``) applies the function to the
            keys of the dictionary.
        **kwargs
            Keyword arguments to pass to the function.

        Returns
        -------
        BetterDict
            The same object, modified in place.

        Notes
        -----
        The method mimics the ``pandas.DataFrame.apply`` method.
        """
        if axis == 1:
            return self.value_apply(func, *args, **kwargs)
        return self.keys_apply(func, *args, **kwargs)

    def keys_apply(self, func, *args, **kwargs):
        """Apply the function to the keys of the dictionary.

        Every key is passed through ``func`` and the resulting old-to-new
        mapping is handed to :meth:`rename`.

        Parameters
        ----------
        func : callable
            Function to apply.
        *args
            Positional arguments to pass to the function.
        **kwargs
            Keyword arguments to pass to the function.

        Returns
        -------
        BetterDict
            The same object, with its keys renamed in place.
        """
        renames = {key: func(key, *args, **kwargs) for key in self.keys()}
        self.rename(renames)
        return self

    def value_apply(self, func, *args, **kwargs):
        """Apply the function to each value of a dictionary.

        Parameters
        ----------
        func : callable
            Function to apply.
        *args
            Positional arguments to pass to the function.
        **kwargs
            Keyword arguments to pass to the function.

        Returns
        -------
        BetterDict
            The same object, with its values replaced in place.
        """
        current_keys = self.keys()
        # Compute every new value before storing any of them, so that an
        # exception raised by ``func`` leaves the mapping untouched instead
        # of half-updated.
        new_values = [func(value, *args, **kwargs) for value in self.values()]
        for key, value in zip(current_keys, new_values):
            self[key] = value
        return self


class BetterDict(dict, PicklerMixin, JoblibIOMixin, ApplyMixin):
    """
    Custom dictionary class that allows multiple get/set operations at once.

    Class also supports the following operations:

    - Access and set values using index notation.
    - Access keys referencing the dictionary values.
    - Get the keys and values as lists.
    - Get the closest match to a given key.
    - Select a subset of the dictionary based on the values data types.
    - Apply functions row- and column-wise.
    - Perform I/O operations using ``pickle`` and ``joblib``.

    Examples
    --------
    Create a :class:`.BetterDict` instance from a normal dictionary:
    >>> d = BetterDict({"a": 1, "b": 2})
    >>> d["a"]
    1

    Get multiple values at once
    >>> d["a", "b"]
    {'a': 1, 'b': 2}

    Set multiple values at once
    >>> d["a", "b"] = 3, 4
    >>> d["a", "b"]
    {'a': 3, 'b': 4}

    Access values using index notation
    >>> d.iloc[0]
    3

    Set values using index notation
    >>> d.iloc[0, 1] = [5, 6]
    >>> d.iloc[:]
    [5, 6]

    Get the keys and values as lists
    >>> d.keys()    # Dictionary keys
    ['a', 'b']
    >>> d.values()  # Dictionary values
    [5, 6]

    Get the closest match to a given key
    >>> d.close_match("A")
    'a'

    """

    @property
    def iloc(self) -> ILoc:
        """Access the dictionary values by index.

        Returns
        -------
        ILoc
            Dictionary values accessed by index.
        """
        return ILoc(self)

    @property
    def vloc(self) -> VLoc:
        """Access the dictionary values by value.

        Returns
        -------
        VLoc
            Dictionary values accessed by value.
        """
        return VLoc(self)

    @property
    def index(self) -> List[int]:
        """Get the indexes of the dictionary.

        Returns
        -------
        List[int]
            Indexes of the dictionary, as reported by :attr:`iloc`.
        """
        return self.iloc.index

    def __getitem__(
        self, key: Hashable | Iterable[Hashable],
    ) -> object | BetterDict:
        """Get the value(s) associated with the key(s).

        Parameters
        ----------
        key : Hashable | Iterable[Hashable]
            Key(s) to get the value(s) for.

        Returns
        -------
        object | BetterDict
            For a single key, the stored value, or ``None`` when the key is
            absent (the lookup uses :meth:`dict.get`, so no ``KeyError`` is
            raised). For an iterable of keys, a new ``BetterDict`` mapping
            each requested key to its value.
        """
        if iterable_not_string(key):
            return BetterDict({_key: self[_key] for _key in key})
        return self.get(key)

    def __setitem__(
        self, key: Hashable | Iterable[Hashable], value: Any | Iterable[Any],
    ):
        """Set the value(s) associated with the key(s).

        When ``same_length(key, value)`` holds, keys and values are paired
        positionally and each pair is stored separately; otherwise ``value``
        is stored under ``key`` as a single entry.

        Parameters
        ----------
        key : Hashable | Iterable[Hashable]
            Key(s) to set the value(s) for.
        value : Any | Iterable[Any]
            Value(s) to set.
        """
        if same_length(key, value):
            for _key, _value in zip(key, value):
                super().__setitem__(_key, _value)
        else:
            super().__setitem__(key, value)

    def __getattr__(self, name: str) -> object:
        """Get the value associated with the key.

        Only invoked when normal attribute lookup fails, so methods and
        properties of the class take precedence over keys of the same name.

        Parameters
        ----------
        name : str
            Key to get the value for.

        Returns
        -------
        object
            Value associated with the key, or ``None`` if the key is absent.

        Raises
        ------
        AttributeError
            If ``name`` is a double-underscore name that is not a key.
        """
        # Libraries probe instances for optional dunder hooks via
        # getattr/hasattr. Answering ``None`` would make a missing hook look
        # present, so report those as genuinely missing.
        is_dunder = name.startswith("__") and name.endswith("__")
        if is_dunder and name not in self:
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            )
        return self.__getitem__(name)

    def __setattr__(self, name: str, value: Any):
        """Set the value associated with the key.

        Attribute assignment is routed to item assignment, so
        ``d.name = value`` is equivalent to ``d["name"] = value``.

        Parameters
        ----------
        name : str
            Key to set the value for.
        value : Any
            Value to set.
        """
        self.__setitem__(name, value)

    def rename(self, keys):
        """Rename the keys of the dictionary.

        Renamed entries are removed and re-inserted, so they end up after
        the entries that were not renamed.

        Parameters
        ----------
        keys : dict
            Mapping of existing keys to their new names.

        Raises
        ------
        KeyError
            If any key to rename is not in the dictionary. The check runs
            before anything is modified.
        """
        missing = [old_key for old_key in keys if old_key not in self]
        if missing:
            raise KeyError(missing[0])
        # Detach every value before inserting any new key. Renaming pair by
        # pair would let a new key overwrite an entry that is itself still
        # waiting to be renamed (e.g. swapping two keys).
        moved = [
            (new_key, self.pop(old_key)) for old_key, new_key in keys.items()
        ]
        for new_key, value in moved:
            # Store one value under one key; the multi-key path of
            # ``__setitem__`` must not unpack a key/value pair here.
            super().__setitem__(new_key, value)

    def keys(self) -> List[Hashable]:
        """Return a list of the dictionary keys.

        Returns
        -------
        List[Hashable]
            List of the dictionary keys.
        """
        return list(super().keys())

    def values(self) -> List[Any]:
        """Return a list of the dictionary values.

        Returns
        -------
        List[Any]
            List of the dictionary values.
        """
        return list(super().values())

    def close_match(self, key: Hashable, cutoff: float = 0.6) -> Hashable:
        """
        Return the key that is the closest match to the name from ``key``.

        Before applying the fuzzy match, the ``key`` and the dictionary
        keys are converted to lower case strings. This maximizes the chances
        of finding a match.

        Parameters
        ----------
        key : Hashable
            Key to find the closest match for.
        cutoff : float, default 0.6
            Minimum similarity ratio for a match to be returned. The parameter
            must be between 0 and 1. A value of 1 means that the strings must
            be identical. A value close to 0 means that the strings don't have
            to be identical, for a match to be returned.

        Returns
        -------
        Hashable
            Original (non-lowered) dictionary key closest to the given key.

        Raises
        ------
        KeyError
            If no match is found.
        """
        # Map each lowered string form back to the original key so the caller
        # receives a key that actually exists in the dictionary.
        originals = {str(_key).lower(): _key for _key in self.keys()}
        matches = get_close_matches(
            str(key).lower(), originals, cutoff=cutoff
        )
        if matches:
            return originals[matches[0]]
        raise KeyError(f"No close match found for {key}.")

    def dtypes(self) -> BetterDict:
        """Get the data types of the dictionary values.

        Returns
        -------
        BetterDict
            Mapping of each key to ``type(value)`` of its value.
        """
        return BetterDict({key: type(value) for key, value in self.items()})

    def select_dtypes(self, include=None, exclude=None):
        """
        Return a subset of the dictionary based on the data types.

        At least one of ``include`` or ``exclude`` must be specified and if
        both are specified, ``include`` cannot contain the same data types as
        ``exclude``.

        This function accepts some special strings to specify the data types:

        - ``'number'`` - Numeric data types.
        - ``'string'`` - String data types.
        - ``'pandas'`` - Pandas DataFrame or Series.
        - ``'numpy'`` - Numpy array.
        - ``'datetime'`` - Datetime data types.

        Parameters
        ----------
        include : list, default None
            Data types to include in the subset. If ``exclude`` is not
            specified, then this parameter is obligatory. When omitted, every
            value not matched by ``exclude`` is kept.
        exclude : list, default None
            Data types to exclude from the subset. If ``include`` is not
            specified, then this parameter is obligatory.

        Returns
        -------
        BetterDict
            Subset of the dictionary based on the data types.

        Raises
        ------
        ValueError
            - If both ``include`` and ``exclude`` are not specified.
            - If ``include`` contains the same data types as ``exclude``.

        Examples
        --------
        >>> from better_dict import BetterDict
        >>> d = BetterDict({'a': 1, 'b': '2', 'c': [3], 'd': {'e': 4}})
        >>> d.select_dtypes(include=['number', 'string'])
        {'a': 1, 'b': '2'}

        Notes
        -----
        Method checks whether at least one of ``include`` or ``exclude`` is
        specified and if both are specified, it checks whether ``include``
        contains the same data types as ``exclude``. If any of these checks
        fail, a ``ValueError`` is raised.

        This method works similarly to the ``pandas.DataFrame.select_dtypes``
        method.

        See Also
        --------
        - :meth:`pandas.DataFrame.select_dtypes`
        - :func:`.translate_dtype`
        - :func:`better_dict.utils.make_list`
        - :func:`better_dict.utils.flatten`
        """
        if include is None and exclude is None:
            raise ValueError(
                "At least one of ``include`` or ``exclude`` must be specified."
            )

        def resolve(dtypes):
            """Expand a dtype specification into a flat tuple of types."""
            if dtypes is None:
                # An omitted side contributes no types at all.
                return ()
            resolved = [
                translate_dtype(dtype) if isinstance(dtype, str) else dtype
                for dtype in make_list(dtypes)
            ]
            return tuple(flatten(resolved))

        include_dtypes = resolve(include)
        exclude_dtypes = resolve(exclude)

        overlap = set(include_dtypes).intersection(exclude_dtypes)
        if overlap:
            # Sort for a stable message; set iteration order is arbitrary.
            listing = "\n".join(
                f"- {dtype}" for dtype in sorted(overlap, key=repr)
            )
            raise ValueError(
                "Cannot set the same data types to `include` and `exclude`:\n"
                + listing
            )

        # Only filter on ``include`` when it was actually supplied; an empty
        # tuple would otherwise reject every value. ``isinstance`` against an
        # empty ``exclude_dtypes`` is always False, so it needs no guard.
        restrict = include is not None
        return BetterDict(
            {
                key: value
                for key, value in self.items()
                if (not restrict or isinstance(value, include_dtypes))
                and not isinstance(value, exclude_dtypes)
            }
        )

    @classmethod
    def from_frame(cls, pandas_df: pd.DataFrame) -> BetterDict:
        """Create a ``BetterDict`` from a pandas DataFrame.

        Parameters
        ----------
        pandas_df : pd.DataFrame
            Pandas DataFrame to create the ``BetterDict`` from.

        Returns
        -------
        BetterDict
            ``BetterDict`` mapping each column label to the list of that
            column's values.
        """
        return cls(pandas_df.to_dict(orient="list"))

    @classmethod
    def from_series(cls, pandas_series: pd.Series) -> BetterDict:
        """Create a ``BetterDict`` from a pandas Series.

        Parameters
        ----------
        pandas_series : pd.Series
            Pandas Series to create the ``BetterDict`` from.

        Returns
        -------
        BetterDict
            ``BetterDict`` mapping each index label to its value.
        """
        return cls(pandas_series.to_dict())

    @classmethod
    def from_list(cls, list_obj: list) -> BetterDict:
        """Create a ``BetterDict`` from a list.

        Parameters
        ----------
        list_obj : list
            List to create the ``BetterDict`` from.

        Returns
        -------
        BetterDict
            ``BetterDict`` mapping each position (starting at 0) to the
            element at that position.
        """
        return cls(enumerate(list_obj))