"""A class of regret matching player for N-player normal form games.
See the paper "a simple adaptive procedure leading to correlated equilibrium" (2000)
by Sergiu Hart and Andreu Mas-Colell.
The algorithm works by maintaining a cumulative regret vector for each action. The current regret is the difference between the counterfactual reward and the reward of the actual action taken.
The strategy is action distribution proportional to the positive entries of the cumulative regret vector.
"""
from copy import copy
from enum import IntEnum
from typing import List
import numpy as np
from imperfecto.algos.player import NormalFormPlayer
class RegretMatchingPlayer(NormalFormPlayer):
"""
Regret Matching (Hart and Mas-Colell 2000) Player works by maintaining a cumulative
regret vector for each action.
The current regret is the difference between the counterfactual reward and the reward of
the actual action taken.
The policy is action distribution proportional to the positive entries of the cumulative regret vector.
Args:
name(str): name of the player.
n_actions(int): number of actions.
Attributes:
name(str): name of the player.
n_actions(int): number of actions.
cum_regrets(np.array): cumulative regret vector of shape (n_actions,).
"""
def __init__(self, name: str, n_actions: int):
super().__init__(name=name)
self.n_actions = n_actions
self.cum_regrets = np.zeros(self.n_actions)
    def __str__(self):
        return f"Regret Matching Agent({self.name})"

    def __repr__(self):
        return f"Regret Matching Agent({self.name})"
    @staticmethod
def regret_matching_strategy(regrets: np.ndarray) -> np.ndarray:
"""Return the regret matching policy.
Args:
regrets: cumulative regrets vector.
Returns:
action distribution according to regret-matching policy.
"""
        # keep only the positive part of the cumulative regrets
        action_probs = np.where(regrets > 0, regrets, 0).astype(float)
        if np.sum(action_probs) > 0:
            # normalize the positive regrets into a probability distribution
            action_probs /= np.sum(action_probs)
        else:
            # no positive regret: fall back to the uniform distribution
            action_probs = np.ones(len(action_probs)) / len(action_probs)
        return action_probs
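
    # A worked example of the rule above (my own numbers, not from the source): for
    # cumulative regrets [2, -1, 1] only the positive entries get mass, so the
    # strategy is [2/3, 0, 1/3]; for [-2, -1, 0] no entry is positive and the
    # strategy falls back to the uniform [1/3, 1/3, 1/3].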
    def update_strategy(self, history: List[IntEnum], player_id: int) -> None:
"""Update the cumulative regret vector. This will update the strategy of the player as
the strategy is computed from the cumulative regret vector.
Args:
history: history of the game.
player_id: player id.
"""
        my_action = history[player_id]
        # compute the counterfactual reward of each action, i.e. the reward this
        # player would have received had it played that action while the other
        # players played as in `history`
        counterfactual_rewards = np.zeros(self.n_actions)
        for action_idx in range(self.n_actions):
            action = self.game.actions(action_idx)  # int to action enum
            counterfactual_history = copy(history)
            # suppose player `player_id` had played `action` instead
            counterfactual_history[player_id] = action
            counterfactual_rewards[action_idx] = self.game.get_payoffs(
                counterfactual_history)[player_id]
        # regret of each action = its counterfactual reward minus the reward actually
        # received (the counterfactual reward of the action actually taken)
        self.cum_regrets += (counterfactual_rewards
                             - counterfactual_rewards[int(my_action)])
        self.strategy = self.regret_matching_strategy(self.cum_regrets)
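

# A minimal usage sketch (not part of the original module): the matching pennies stub
# below is hypothetical and only implements the two calls this file relies on,
# `game.actions(i)` and `game.get_payoffs(history)`; in the real library the game
# object and the initial strategy are presumably wired up by a game/trainer class.
if __name__ == "__main__":

    class CoinAction(IntEnum):
        HEADS = 0
        TAILS = 1

    class MatchingPenniesStub:
        """Hypothetical 2-player matching pennies game: player 0 wins on a match."""

        @staticmethod
        def actions(index: int) -> CoinAction:
            return CoinAction(index)

        @staticmethod
        def get_payoffs(history: List[IntEnum]) -> np.ndarray:
            match = history[0] == history[1]
            return np.array([1.0, -1.0]) if match else np.array([-1.0, 1.0])

    players = [RegretMatchingPlayer("p0", 2), RegretMatchingPlayer("p1", 2)]
    for player in players:
        player.game = MatchingPenniesStub()   # assumed to be attached by a trainer
        player.strategy = np.ones(2) / 2      # assumed uniform initial strategy

    rng = np.random.default_rng(0)
    for _ in range(1000):
        # each player samples an action from its current strategy ...
        joint_action = [CoinAction(int(rng.choice(2, p=player.strategy)))
                        for player in players]
        # ... and updates its cumulative regrets given the joint action profile
        for player_id, player in enumerate(players):
            player.update_strategy(joint_action, player_id)

    # matching pennies' unique equilibrium is uniform play for both players
    print([player.strategy for player in players])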