Bingj rebase #1448

Open · wants to merge 14 commits into dev
4,044 changes: 2,010 additions & 2,034 deletions axelrod/data/all_classifiers.yml

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions axelrod/strategies/_strategies.py
@@ -90,6 +90,7 @@
from .dbs import DBS
from .defector import Defector, TrickyDefector
from .doubler import Doubler
from .bandits import Greedy, EpsilonGreedy
from .finite_state_machines import (
    TF1,
    TF2,
@@ -334,6 +335,7 @@
    Doubler,
    DynamicTwoTitsForTat,
    EasyGo,
    EpsilonGreedy,
    EugineNier,
    EventualCycleHunter,
    EvolvedANN,
@@ -376,6 +378,7 @@
    Golden,
    Gradual,
    GradualKiller,
    Greedy,
    Grudger,
    GrudgerAlternator,
    Grumpy,
154 changes: 154 additions & 0 deletions axelrod/strategies/bandits.py
@@ -0,0 +1,154 @@
import numpy as np

from axelrod.action import Action
from axelrod.player import Player

C, D = Action.C, Action.D


class Greedy(Player):
"""
A player that always chooses the optimal action based on the average reward of each action from previous turns.

If initial rewards for each action are equivalent (true by default),
then the optimal action for the first turn is cooperate.

Names:

- Greedy: [Sutton2018]_
"""

name = "greedy"
classifier = {
"memory_depth": float("inf"),
"stochastic": False,
"long_run_time": False,
"inspects_source": False,
"manipulates_source": False,
"manipulates_state": False,
}

UNIFORM = -1.0 # constant that replaces weight when rewards aren't weighted

    def __init__(
        self,
        init_c_reward: float = 0.0,
        init_d_reward: float = 0.0,
        recency_weight: float = UNIFORM,
    ) -> None:
        """
        Parameters
        ----------
        init_c_reward
            Initial expected utility from action C; defaults to 0.0.
        init_d_reward
            Initial expected utility from action D; defaults to 0.0.
        recency_weight
            0.0 <= recency_weight <= 1.0
            The exponential recency weight used in calculating the average reward.
            If this argument is equal to -1 or is not provided, the player will not weigh rewards based on recency.
        """
        super().__init__()
        self._rewards = {C: init_c_reward, D: init_d_reward}
        self.weight = recency_weight

        # limit parameter value range
        if (self.weight != self.UNIFORM) and self.weight <= 0:
            self.weight = 0.0
        if self.weight >= 1:
            self.weight = 1.0

    def update_rewards(self, opponent: Player):
        """Updates the expected reward associated with the last action."""
        game = self.match_attributes["game"]
        last_round = (self.history[-1], opponent.history[-1])
        last_play = self.history[-1]
        last_score = game.score(last_round)[0]

        # if UNIFORM, use 1 / total number of times the updated action was taken previously
        if self.weight == self.UNIFORM:
            weight = 1 / (
                self.history.cooperations if last_play == C else self.history.defections
            )
        else:
            weight = self.weight

        self._rewards[last_play] = self._rewards[last_play] + weight * (
            last_score - self._rewards[last_play]
        )
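        # Worked example of the update above: with uniform weighting, the stored
        # value is the sample mean of the rewards seen for that action. If C
        # scores 3 on its first play and 0 on its second, the C reward moves
        # from 0 -> 3 (weight 1/1), then 3 -> 3 + (1/2) * (0 - 3) = 1.5.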

    def strategy(self, opponent: Player) -> Action:
        # if not the first turn
        if len(self.history) != 0:
            self.update_rewards(opponent)

        # select the optimal play
        return max(self._rewards, key=self._rewards.get)


class EpsilonGreedy(Greedy):
"""
Has a 1 - epsilon probability of behaving like Greedy; otherwise, randomly choose to cooperate or defect.

Names:

- Epsilon-greedy: [Sutton2018]_
"""

name = "$\varepsilon$-greedy"
classifier = {
"memory_depth": float("inf"),
"stochastic": True,
"long_run_time": False,
"inspects_source": False,
"manipulates_source": False,
"manipulates_state": False,
}

    def __init__(
        self,
        epsilon: float = 0.1,
        init_c_reward: float = 0.0,
        init_d_reward: float = 0.0,
        recency_weight: float = Greedy.UNIFORM,
    ) -> None:
        """
        Parameters
        ----------
        epsilon
            0.0 <= epsilon <= 1.0
            The probability that the player will "explore" (act uniformly at random); defaults to 0.1.
        init_c_reward
            Initial expected utility from action C; defaults to 0.0.
        init_d_reward
            Initial expected utility from action D; defaults to 0.0.

        Special cases
        -------------
        When epsilon <= 0, this player behaves like Greedy().
        When epsilon >= 1, this player behaves like Random(0.5).
        """
        super().__init__(init_c_reward, init_d_reward, recency_weight)
        self.epsilon = epsilon

        # treat out of range values as extremes
        if epsilon <= 0:
            self.epsilon = 0.0
        if epsilon >= 1:
            self.epsilon = 1.0

    def _post_init(self):
        super()._post_init()
        if self.epsilon == 0:
            self.classifier["stochastic"] = False

    def strategy(self, opponent: Player) -> Action:
        # this will also update the reward appropriately
        greedy_action = super().strategy(opponent)

        # explore
        if self.epsilon > 0 and self._random.uniform() <= self.epsilon:
            return self._random.random_choice()
        # exploit
        else:
            return greedy_action
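
A minimal usage sketch of the two new players (not part of the diff; the tuple-of-players form and the seed argument are assumptions based on the current axelrod Match API):

>>> import axelrod as axl
>>> players = (axl.EpsilonGreedy(epsilon=0.1), axl.TitForTat())
>>> match = axl.Match(players, turns=20, seed=1)
>>> results = match.play()
>>> len(results)
20
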
91 changes: 91 additions & 0 deletions axelrod/tests/strategies/test_armed_bandits.py
@@ -0,0 +1,91 @@
"""Tests for the armed bandits strategies."""

import axelrod as axl

from .test_player import TestPlayer, TestMatch

C, D = axl.Action.C, axl.Action.D


class TestEpsilonGreedy(TestPlayer):

name = "$\varepsilon$-greedy: 0.1, 0.0, 0.0, -1.0"
player = axl.EpsilonGreedy
expected_classifier = {
"memory_depth": float("inf"),
"stochastic": True,
"makes_use_of": {"game"},
"long_run_time": False,
"inspects_source": False,
"manipulates_source": False,
"manipulates_state": False,
}

    def test_deterministic(self):
        # cases where epsilon = 0
        actions = [(C, C), (C, C), (C, C)]
        self.versus_test(
            axl.Cooperator(),
            expected_actions=actions,
            init_kwargs={"epsilon": 0, "init_c_reward": 0, "init_d_reward": -1},
            attrs={"_rewards": {C: 3, D: -1}},
        )

        actions = [(D, D), (D, D), (D, D)]
        self.versus_test(
            axl.Defector(),
            expected_actions=actions,
            init_kwargs={"epsilon": 0, "init_c_reward": -1, "init_d_reward": 0},
            attrs={"_rewards": {C: -1, D: 1}},
        )

        actions = [(D, C), (D, D), (C, D)]
        self.versus_test(
            axl.TitForTat(),
            expected_actions=actions,
            init_kwargs={"epsilon": 0, "init_c_reward": 3.2, "init_d_reward": 4.0},
            attrs={"_rewards": {C: 3.2, D: 3.0}},
        )

    def test_random(self):
        # cases where epsilon = 1
        opponent = axl.MockPlayer()
        actions = [(C, C), (D, C), (D, C), (C, C)]
        self.versus_test(
            opponent, expected_actions=actions, init_kwargs={"epsilon": 1}, seed=5
        )

        opponent = axl.MockPlayer(actions=[C, D, C])
        actions = [(D, C), (C, D), (C, C)]
        self.versus_test(
            opponent, expected_actions=actions, init_kwargs={"epsilon": 1.0}, seed=1
        )

    def test_strategy(self):
        # sometimes explores
        actions = [(C, C), (D, C), (D, C)]
        self.versus_test(
            axl.Cooperator(),
            expected_actions=actions,
            init_kwargs={"epsilon": 0.5},
            attrs={"_rewards": {C: 3, D: 5}},
            seed=2,
        )

        # always explores
        actions = [(D, D), (C, D), (C, D)]
        self.versus_test(
            axl.Defector(),
            expected_actions=actions,
            attrs={"_rewards": {C: 0, D: 1}},
            seed=13741,
        )

        # never explores/always exploits
        actions = [(C, C), (C, C), (C, C)]
        self.versus_test(
            axl.TitForTat(),
            expected_actions=actions,
            attrs={"_rewards": {C: 3, D: 0}},
            seed=1,
        )
2 changes: 1 addition & 1 deletion docs/how-to/classify_strategies.rst
@@ -57,7 +57,7 @@ strategies::
... }
>>> strategies = axl.filtered_strategies(filterset)
>>> len(strategies)
88
89

Or, to find out how many strategies only use 1 turn worth of memory to
make a decision::
2 changes: 1 addition & 1 deletion docs/index.rst
@@ -53,7 +53,7 @@ Count the number of available players::

>>> import axelrod as axl
>>> len(axl.strategies)
240
242

Create matches between two players::

1 change: 1 addition & 0 deletions docs/reference/bibliography.rst
@@ -63,6 +63,7 @@ documentation.
.. [Shakarian2013] Shakarian, P., Roos, P. & Moores, G. A Novel Analytical Method for Evolutionary Graph Theory Problems.
.. [Slany2007] Slany W. and Kienreich W., On some winning strategies for the iterated prisoner’s dilemma, in Kendall G., Yao X. and Chong S. (eds.) The iterated prisoner’s dilemma: 20 years on. World Scientific, chapter 8, pp. 171-204, 2007.
.. [Stewart2012] Stewart, a. J., & Plotkin, J. B. (2012). Extortion and cooperation in the Prisoner’s Dilemma. Proceedings of the National Academy of Sciences, 109(26), 10134–10135. http://doi.org/10.1073/pnas.1208087109
.. [Sutton2018] Sutton, R. S., & Barto, A. G. (2018). Multi-armed Bandits. In Reinforcement Learning: An Introduction (2nd ed., pp. 25–45). MIT Press. http://incompleteideas.net/book/the-book-2nd.html
.. [Szabo2007] Szabó, G., & Fáth, G. (2007). Evolutionary games on graphs. Physics Reports, 446(4-6), 97–216. http://doi.org/10.1016/j.physrep.2007.04.004
.. [Gaudesi2016] Gaudesi, Marco, et al. "Exploiting evolutionary modeling to prevail in iterated prisoner’s dilemma tournaments." IEEE Transactions on Computational Intelligence and AI in Games 8.3 (2016): 288-300.
.. [Tzafestas2000] Tzafestas, E. (2000). Toward adaptive cooperative behavior. From Animals to Animals: Proceedings of the 6th International Conference on the Simulation of Adaptive Behavior {(SAB-2000)}, 2, 334–340.
2 changes: 2 additions & 0 deletions docs/reference/strategy_index.rst
@@ -18,6 +18,8 @@ Here are the docstrings of all the strategies in the library.
:members:
.. automodule:: axelrod.strategies.appeaser
:members:
.. automodule:: axelrod.strategies.bandits
:members:
.. automodule:: axelrod.strategies.averagecopier
:members:
.. automodule:: axelrod.strategies.axelrod_first