Implemented Monte Carlo Tree Search

shashankms-dev · shashankms-dev · commit 6c3b5ef42480 · 2023-05-25T23:43:01.000+05:30
diff --git a/alpha-zero.ipynb b/alpha-zero.ipynb
@@ -2,29 +2,18 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 8,
    "id": "b86297a7",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'1.24.2'"
-      ]
-     },
-     "execution_count": 1,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "import numpy as np\n",
-    "np.__version__"
+    "import math"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 9,
    "id": "e9f409ff",
    "metadata": {},
    "outputs": [],
@@ -48,6 +37,9 @@
     "        return (state.reshape(-1) == 0).astype(np.uint8)\n",
     "    \n",
     "    def check_win(self, state, action):\n",
+    "        if action == None:\n",
+    "            return False\n",
+    "        \n",
     "        row = action // self.column_count\n",
     "        column = action % self.column_count\n",
     "        player = state[row, column]\n",
@@ -75,12 +67,132 @@
     "            return 0, True\n",
     "        \n",
     "        return 0, False\n",
+    "    \n",
+    "    def get_opponent_value(self, value):\n",
+    "        return -value\n",
+    "    \n",
+    "    def change_perspective(self, state, player):\n",
+    "        return (state * player)\n",
     "    "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 10,
+   "id": "c09a4301",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Node:\n",
+    "    def __init__(self, game, args, state, parent=None, action_taken=None):\n",
+    "        self.game = game\n",
+    "        self.args = args\n",
+    "        self.state = state\n",
+    "        self.parent = parent\n",
+    "        self.action_taken = action_taken\n",
+    "        \n",
+    "        self.children = []\n",
+    "        self.expandable_moves = game.get_valid_moves(state)\n",
+    "        \n",
+    "        self.visit_count = 0\n",
+    "        self.value_sum = 0\n",
+    "        \n",
+    "    def is_fully_expanded(self):\n",
+    "        return np.sum(self.expandable_moves) == 0 and len(self.children) > 0\n",
+    "    \n",
+    "    def select(self):\n",
+    "        best_child = None\n",
+    "        best_ucb = -np.inf\n",
+    "        \n",
+    "        for child in self.children:\n",
+    "            ucb = self.get_ucb(child)\n",
+    "            if ucb > best_ucb:\n",
+    "                best_child = child\n",
+    "                best_ucb = ucb\n",
+    "            \n",
+    "        return best_child\n",
+    "    \n",
+    "    def get_ucb(self, child):\n",
+    "        q_value = 1 - ((child.value_sum / child.visit_count) + 1) / 2\n",
+    "        return q_value + self.args['C'] * math.sqrt(math.log(self.visit_count) / child.visit_count)\n",
+    "\n",
+    "    def expand(self):\n",
+    "        action = np.random.choice(np.where(self.expandable_moves == 1)[0])\n",
+    "        self.expandable_moves[action] = 0\n",
+    "        \n",
+    "        child_state = self.state.copy()\n",
+    "        child_state = self.game.get_next_state(child_state, action, 1)\n",
+    "        child_state = self.game.change_perspective(child_state, player = -1)\n",
+    "        \n",
+    "        child = Node(self.game, self.args, child_state, self, action)\n",
+    "        self.children.append(child)\n",
+    "        \n",
+    "        return child\n",
+    "    \n",
+    "    def simulate(self):\n",
+    "        value, terminated = self.game.get_value_and_terminated(self.state, self.action_taken)\n",
+    "        value = self.game.get_opponent_value(value)\n",
+    "        \n",
+    "        if terminated:\n",
+    "            return value\n",
+    "        \n",
+    "        rollout_state = self.state.copy()\n",
+    "        rollout_player = 1\n",
+    "        while True:\n",
+    "            valid_moves = self.game.get_valid_moves(rollout_state)\n",
+    "            action = np.random.choice(np.where(valid_moves == 1)[0])\n",
+    "            rollout_state = self.game.get_next_state(rollout_state, action, rollout_player)\n",
+    "            \n",
+    "            value, terminated = self.game.get_value_and_terminated(rollout_state, action)\n",
+    "            if terminated:\n",
+    "                if rollout_player == -1:\n",
+    "                    value = self.game.get_opponent_value(value)\n",
+    "                return value\n",
+    "            \n",
+    "            rollout_player = self.game.get_opponent(rollout_player)\n",
+    "            \n",
+    "    def backpropagate(self, value):\n",
+    "        self.value_sum += value\n",
+    "        self.visit_count += 1\n",
+    "        \n",
+    "        value = self.game.get_opponent_value(value)\n",
+    "        if self.parent is not None:\n",
+    "            self.parent.backpropagate(value)\n",
+    "            \n",
+    "        \n",
+    "class MCTS:\n",
+    "    def __init__(self, game, args):\n",
+    "        self.game = game\n",
+    "        self.args = args\n",
+    "        \n",
+    "    def search(self, state):\n",
+    "        root = Node(self.game, self.args, state)\n",
+    "        \n",
+    "        for search in range(self.args['num_searches']):\n",
+    "            node = root\n",
+    "            \n",
+    "            while node.is_fully_expanded():\n",
+    "                node = node.select()\n",
+    "                \n",
+    "            value, terminated = self.game.get_value_and_terminated(node.state, node.action_taken)\n",
+    "            value = self.game.get_opponent_value(value)\n",
+    "            \n",
+    "            if not terminated:\n",
+    "                node = node.expand()\n",
+    "                value = node.simulate()\n",
+    "                \n",
+    "            node.backpropagate(value)\n",
+    "\n",
+    "        action_probs = np.zeros(self.game.action_size)\n",
+    "        for child in root.children:\n",
+    "            action_probs[child.action_taken] = child.visit_count\n",
+    "        action_probs /= np.sum(action_probs)\n",
+    "        return action_probs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
    "id": "e60e21f1",
    "metadata": {},
    "outputs": [
@@ -92,68 +204,58 @@
       " [0 0 0]\n",
       " [0 0 0]]\n",
       "valid_moves: [0, 1, 2, 3, 4, 5, 6, 7, 8]\n",
-      "1: 1\n",
-      "[[0 1 0]\n",
+      "1: 0\n",
+      "[[1 0 0]\n",
       " [0 0 0]\n",
       " [0 0 0]]\n",
-      "valid_moves: [0, 2, 3, 4, 5, 6, 7, 8]\n",
-      "-1: 3\n",
-      "[[ 0  1  0]\n",
-      " [-1  0  0]\n",
+      "[[ 1  0  0]\n",
+      " [ 0 -1  0]\n",
       " [ 0  0  0]]\n",
-      "valid_moves: [0, 2, 4, 5, 6, 7, 8]\n",
-      "1: 0\n",
+      "valid_moves: [1, 2, 3, 5, 6, 7, 8]\n",
+      "1: 1\n",
       "[[ 1  1  0]\n",
-      " [-1  0  0]\n",
-      " [ 0  0  0]]\n",
-      "valid_moves: [2, 4, 5, 6, 7, 8]\n",
-      "-1: 2\n",
-      "[[ 1  1 -1]\n",
-      " [-1  0  0]\n",
+      " [ 0 -1  0]\n",
       " [ 0  0  0]]\n",
-      "valid_moves: [4, 5, 6, 7, 8]\n",
-      "1: 4\n",
       "[[ 1  1 -1]\n",
-      " [-1  1  0]\n",
+      " [ 0 -1  0]\n",
       " [ 0  0  0]]\n",
-      "valid_moves: [5, 6, 7, 8]\n",
-      "-1: 7\n",
-      "[[ 1  1 -1]\n",
-      " [-1  1  0]\n",
-      " [ 0 -1  0]]\n",
-      "valid_moves: [5, 6, 8]\n",
+      "valid_moves: [3, 5, 6, 7, 8]\n",
       "1: 6\n",
       "[[ 1  1 -1]\n",
-      " [-1  1  0]\n",
-      " [ 1 -1  0]]\n",
-      "valid_moves: [5, 8]\n",
-      "-1: 8\n",
+      " [ 0 -1  0]\n",
+      " [ 1  0  0]]\n",
       "[[ 1  1 -1]\n",
-      " [-1  1  0]\n",
-      " [ 1 -1 -1]]\n",
-      "valid_moves: [5]\n",
-      "1: 5\n",
-      "[[ 1  1 -1]\n",
-      " [-1  1  1]\n",
-      " [ 1 -1 -1]]\n",
-      "Game drawn\n"
+      " [-1 -1  0]\n",
+      " [ 1  0  0]]\n",
+      "valid_moves: [5, 7, 8]\n"
      ]
     }
    ],
    "source": [
     "tictactoe = TicTacToe()\n",
     "player = 1\n",
+    "args = {\n",
+    "    'C': 1.4142,\n",
+    "    'num_searches': 1000\n",
+    "}\n",
+    "mcts = MCTS(tictactoe, args)\n",
     "state = tictactoe.get_initial_state()\n",
     "\n",
     "while True:\n",
     "    print(state)\n",
-    "    valid_moves = tictactoe.get_valid_moves(state)\n",
-    "    print(\"valid_moves:\", [i for i in range(tictactoe.action_size) if valid_moves[i] == 1])\n",
-    "    action = int(input(f\"{player}: \"))\n",
     "    \n",
-    "    if valid_moves[action] == 0:\n",
-    "        print(\"invalid action\")\n",
-    "        continue\n",
+    "    if player == 1:\n",
+    "        valid_moves = tictactoe.get_valid_moves(state)\n",
+    "        print(\"valid_moves:\", [i for i in range(tictactoe.action_size) if valid_moves[i] == 1])\n",
+    "        action = int(input(f\"{player}: \"))\n",
+    "\n",
+    "        if valid_moves[action] == 0:\n",
+    "            print(\"invalid action\")\n",
+    "            continue\n",
+    "    else:\n",
+    "        neutral_state = tictactoe.change_perspective(state, player)\n",
+    "        mcts_probs = mcts.search(neutral_state)\n",
+    "        action = np.argmax(mcts_probs)\n",
     "        \n",
     "    state = tictactoe.get_next_state(state, action, player)\n",
     "    \n",
@@ -173,17 +275,10 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "c09a4301",
+   "id": "20fe08e6",
    "metadata": {},
    "outputs": [],
-   "source": [
-    "class MCTS:\n",
-    "    def __init__(self, game, args):\n",
-    "        self.game = game\n",
-    "        self.args = args\n",
-    "        \n",
-    "    "
-   ]
+   "source": []
   }
  ],
  "metadata": {