Bingj rebase #1448

Open · wants to merge 14 commits into dev
4,044 changes: 2,010 additions & 2,034 deletions axelrod/data/all_classifiers.yml

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions axelrod/strategies/_strategies.py
@@ -90,6 +90,7 @@
from .dbs import DBS
from .defector import Defector, TrickyDefector
from .doubler import Doubler
from .bandits import Greedy, EpsilonGreedy
from .finite_state_machines import (
    TF1,
    TF2,
@@ -334,6 +335,7 @@
    Doubler,
    DynamicTwoTitsForTat,
    EasyGo,
    EpsilonGreedy,
    EugineNier,
    EventualCycleHunter,
    EvolvedANN,
@@ -376,6 +378,7 @@
    Golden,
    Gradual,
    GradualKiller,
    Greedy,
    Grudger,
    GrudgerAlternator,
    Grumpy,
154 changes: 154 additions & 0 deletions axelrod/strategies/bandits.py
@@ -0,0 +1,154 @@
import numpy as np

from axelrod.action import Action
from axelrod.player import Player

C, D = Action.C, Action.D


class Greedy(Player):
"""
A player that always chooses the optimal action based on the average reward of each action from previous turns.

If initial rewards for each action are equivalent (true by default),
then the optimal action for the first turn is cooperate.

Names:

- Greedy: [Sutton2018]_
"""

name = "greedy"
classifier = {
"memory_depth": float("inf"),
"stochastic": False,
"long_run_time": False,
"inspects_source": False,
"manipulates_source": False,
"manipulates_state": False,
}

UNIFORM = -1.0 # constant that replaces weight when rewards aren't weighted

    def __init__(
        self,
        init_c_reward: float = 0.0,
        init_d_reward: float = 0.0,
        recency_weight: float = UNIFORM,
    ) -> None:
        """
        Parameters
        ----------
        init_c_reward
            Initial expected utility from action C; defaults to 0.0.
        init_d_reward
            Initial expected utility from action D; defaults to 0.0.
        recency_weight
            0.0 <= recency_weight <= 1.0
            The exponential recency weight used in calculating the average reward.
            If this argument is equal to -1 or is not provided, the player will not weigh rewards based on recency.
        """
        super().__init__()
        self._rewards = {C: init_c_reward, D: init_d_reward}
        self.weight = recency_weight

        # limit parameter value range
        if (self.weight != self.UNIFORM) and self.weight <= 0:
            self.weight = 0.0
        if self.weight >= 1:
            self.weight = 1.0

    def update_rewards(self, opponent: Player):
        """Updates the expected reward associated with the last action."""
        game = self.match_attributes["game"]
        last_round = (self.history[-1], opponent.history[-1])
        last_play = self.history[-1]
        last_score = game.score(last_round)[0]

        # if UNIFORM, use 1 / total number of times the updated action was taken previously
        if self.weight == self.UNIFORM:
            weight = 1 / (
                self.history.cooperations if last_play == C else self.history.defections
            )
        else:
            weight = self.weight

        self._rewards[last_play] = self._rewards[last_play] + weight * (
            last_score - self._rewards[last_play]
        )
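        # Worked example of the update above: with uniform weighting, the stored
        # value is the sample mean of the rewards seen for that action. If C
        # scores 3 on its first play and 0 on its second, the C reward moves
        # from 0 -> 3 (weight 1/1), then 3 -> 3 + (1/2) * (0 - 3) = 1.5.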

    def strategy(self, opponent: Player) -> Action:
        # if not the first turn
        if len(self.history) != 0:
            self.update_rewards(opponent)

        # select the optimal play
        return max(self._rewards, key=self._rewards.get)


class EpsilonGreedy(Greedy):
"""
Has a 1 - epsilon probability of behaving like Greedy; otherwise, randomly choose to cooperate or defect.

Names:

- Epsilon-greedy: [Sutton2018]_
"""

name = "$\varepsilon$-greedy"
classifier = {
"memory_depth": float("inf"),
"stochastic": True,
"long_run_time": False,
"inspects_source": False,
"manipulates_source": False,
"manipulates_state": False,
}

    def __init__(
        self,
        epsilon: float = 0.1,
        init_c_reward: float = 0.0,
        init_d_reward: float = 0.0,
        recency_weight: float = Greedy.UNIFORM,
    ) -> None:
        """
        Parameters
        ----------
        epsilon
            0.0 <= epsilon <= 1.0
            The probability that the player will "explore" (act uniformly at random); defaults to 0.1.
        init_c_reward
            Initial expected utility from action C; defaults to 0.0.
        init_d_reward
            Initial expected utility from action D; defaults to 0.0.

        Special cases
        -------------
        When epsilon <= 0, this player behaves like Greedy().
        When epsilon >= 1, this player behaves like Random(0.5).
        """
        super().__init__(init_c_reward, init_d_reward, recency_weight)
        self.epsilon = epsilon

        # treat out of range values as extremes
        if epsilon <= 0:
            self.epsilon = 0.0
        if epsilon >= 1:
            self.epsilon = 1.0

    def _post_init(self):
        super()._post_init()
        if self.epsilon == 0:
            self.classifier["stochastic"] = False

    def strategy(self, opponent: Player) -> Action:
        # this will also update the reward appropriately
        greedy_action = super().strategy(opponent)

        # explore
        if self.epsilon > 0 and self._random.uniform() <= self.epsilon:
            return self._random.random_choice()
        # exploit
        else:
            return greedy_action
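
A minimal usage sketch of the two new players (not part of the diff; the tuple-of-players form and the seed argument are assumptions based on the current axelrod Match API):

>>> import axelrod as axl
>>> players = (axl.EpsilonGreedy(epsilon=0.1), axl.TitForTat())
>>> match = axl.Match(players, turns=20, seed=1)
>>> results = match.play()
>>> len(results)
20
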
91 changes: 91 additions & 0 deletions axelrod/tests/strategies/test_armed_bandits.py
@@ -0,0 +1,91 @@
"""Tests for the armed bandits strategies."""

import axelrod as axl

from .test_player import TestPlayer, TestMatch

C, D = axl.Action.C, axl.Action.D


class TestEpsilonGreedy(TestPlayer):

name = "$\varepsilon$-greedy: 0.1, 0.0, 0.0, -1.0"
player = axl.EpsilonGreedy
expected_classifier = {
"memory_depth": float("inf"),
"stochastic": True,
"makes_use_of": {"game"},
"long_run_time": False,
"inspects_source": False,
"manipulates_source": False,
"manipulates_state": False,
}

    def test_deterministic(self):
        # cases where epsilon = 0
        actions = [(C, C), (C, C), (C, C)]
        self.versus_test(
            axl.Cooperator(),
            expected_actions=actions,
            init_kwargs={"epsilon": 0, "init_c_reward": 0, "init_d_reward": -1},
            attrs={"_rewards": {C: 3, D: -1}},
        )

        actions = [(D, D), (D, D), (D, D)]
        self.versus_test(
            axl.Defector(),
            expected_actions=actions,
            init_kwargs={"epsilon": 0, "init_c_reward": -1, "init_d_reward": 0},
            attrs={"_rewards": {C: -1, D: 1}},
        )

        actions = [(D, C), (D, D), (C, D)]
        self.versus_test(
            axl.TitForTat(),
            expected_actions=actions,
            init_kwargs={"epsilon": 0, "init_c_reward": 3.2, "init_d_reward": 4.0},
            attrs={"_rewards": {C: 3.2, D: 3.0}},
        )

    def test_random(self):
        # cases where epsilon = 1
        opponent = axl.MockPlayer()
        actions = [(C, C), (D, C), (D, C), (C, C)]
        self.versus_test(
            opponent, expected_actions=actions, init_kwargs={"epsilon": 1}, seed=5
        )

        opponent = axl.MockPlayer(actions=[C, D, C])
        actions = [(D, C), (C, D), (C, C)]
        self.versus_test(
            opponent, expected_actions=actions, init_kwargs={"epsilon": 1.0}, seed=1
        )

    def test_strategy(self):
        # sometimes explores
        actions = [(C, C), (D, C), (D, C)]
        self.versus_test(
            axl.Cooperator(),
            expected_actions=actions,
            init_kwargs={"epsilon": 0.5},
            attrs={"_rewards": {C: 3, D: 5}},
            seed=2,
        )

        # always explores
        actions = [(D, D), (C, D), (C, D)]
        self.versus_test(
            axl.Defector(),
            expected_actions=actions,
            attrs={"_rewards": {C: 0, D: 1}},
            seed=13741,
        )

        # never explores/always exploits
        actions = [(C, C), (C, C), (C, C)]
        self.versus_test(
            axl.TitForTat(),
            expected_actions=actions,
            attrs={"_rewards": {C: 3, D: 0}},
            seed=1,
        )
2 changes: 1 addition & 1 deletion docs/how-to/classify_strategies.rst
@@ -57,7 +57,7 @@ strategies::
... }
>>> strategies = axl.filtered_strategies(filterset)
>>> len(strategies)
88
89

Or, to find out how many strategies only use 1 turn worth of memory to
make a decision::
2 changes: 1 addition & 1 deletion docs/index.rst
@@ -53,7 +53,7 @@ Count the number of available players::

>>> import axelrod as axl
>>> len(axl.strategies)
240
242

Create matches between two players::

1 change: 1 addition & 0 deletions docs/reference/bibliography.rst
@@ -63,6 +63,7 @@ documentation.
.. [Shakarian2013] Shakarian, P., Roos, P. & Moores, G. A Novel Analytical Method for Evolutionary Graph Theory Problems.
.. [Slany2007] Slany W. and Kienreich W., On some winning strategies for the iterated prisoner’s dilemma, in Kendall G., Yao X. and Chong S. (eds.) The iterated prisoner’s dilemma: 20 years on. World Scientific, chapter 8, pp. 171-204, 2007.
.. [Stewart2012] Stewart, a. J., & Plotkin, J. B. (2012). Extortion and cooperation in the Prisoner’s Dilemma. Proceedings of the National Academy of Sciences, 109(26), 10134–10135. http://doi.org/10.1073/pnas.1208087109
.. [Sutton2018] Sutton, R. S., & Barto, A. G. (2018). Multi-armed Bandits. In Reinforcement Learning: An Introduction (2nd ed., pp. 25–45). MIT Press. http://incompleteideas.net/book/the-book-2nd.html
.. [Szabo2007] Szabó, G., & Fáth, G. (2007). Evolutionary games on graphs. Physics Reports, 446(4-6), 97–216. http://doi.org/10.1016/j.physrep.2007.04.004
.. [Gaudesi2016] Gaudesi, Marco, et al. "Exploiting evolutionary modeling to prevail in iterated prisoner’s dilemma tournaments." IEEE Transactions on Computational Intelligence and AI in Games 8.3 (2016): 288-300.
.. [Tzafestas2000] Tzafestas, E. (2000). Toward adaptive cooperative behavior. From Animals to Animals: Proceedings of the 6th International Conference on the Simulation of Adaptive Behavior {(SAB-2000)}, 2, 334–340.
2 changes: 2 additions & 0 deletions docs/reference/strategy_index.rst
@@ -18,6 +18,8 @@ Here are the docstrings of all the strategies in the library.
:members:
.. automodule:: axelrod.strategies.appeaser
:members:
.. automodule:: axelrod.strategies.bandits
:members:
.. automodule:: axelrod.strategies.averagecopier
:members:
.. automodule:: axelrod.strategies.axelrod_first