{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "initial_id",
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "end_time": "2024-06-05T05:07:22.736605400Z",
     "start_time": "2024-06-05T05:06:39.100164300Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "initializing\n",
      "initializing finished\n",
      "start compile\n",
      "compile finished, cost time: 18.307454s\n",
      "Generation: 1.0, Cost time: 4551.03ms\n",
      " \tnode counts: max: 21, min: 21, mean: 21.00\n",
      " \tconn counts: max: 20, min: 20, mean: 20.00\n",
      " \tspecies: 1, [10000]\n",
      " \tfitness: valid cnt: 10000, max: 10124.0000, min: 44.0000, mean: 1758.1263, std: 1212.6823\n",
      "Generation: 2.0, Cost time: 4636.33ms\n",
      " \tnode counts: max: 22, min: 21, mean: 21.03\n",
      " \tconn counts: max: 22, min: 20, mean: 20.05\n",
      " \tspecies: 1, [10000]\n",
      " \tfitness: valid cnt: 10000, max: 11000.0000, min: 48.0000, mean: 1870.1300, std: 1263.3086\n",
      "Generation: 3.0, Cost time: 6271.12ms\n",
      " \tnode counts: max: 23, min: 21, mean: 21.03\n",
      " \tconn counts: max: 22, min: 20, mean: 20.05\n",
      " \tspecies: 1, [10000]\n",
      " \tfitness: valid cnt: 10000, max: 14624.0000, min: 28.0000, mean: 1943.9924, std: 1293.7146\n",
      "\n",
      "Fitness limit reached!\n"
     ]
    }
   ],
   "source": [
    "import jax.numpy as jnp\n",
    "\n",
    "from pipeline import Pipeline\n",
    "from algorithm.neat import *\n",
    "\n",
    "from problem.rl_env.jumanji.jumanji_2048 import Jumanji_2048\n",
    "from utils import Act, Agg\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    # Node gene settings: default activation/aggregation and the allowed option sets.\n",
    "    node_gene_cfg = DefaultNodeGene(\n",
    "        activation_default=Act.sigmoid,\n",
    "        activation_options=(Act.sigmoid, Act.relu, Act.tanh, Act.identity, Act.inv),\n",
    "        aggregation_default=Agg.sum,\n",
    "        aggregation_options=(Agg.sum, Agg.mean, Agg.max, Agg.product),\n",
    "    )\n",
    "    mutation_cfg = DefaultMutation(node_add=0.03, conn_add=0.03)\n",
    "\n",
    "    # Genome with 16 inputs and 4 outputs, capped at 100 nodes / 1000 connections.\n",
    "    genome_cfg = DefaultGenome(\n",
    "        num_inputs=16,\n",
    "        num_outputs=4,\n",
    "        max_nodes=100,\n",
    "        max_conns=1000,\n",
    "        node_gene=node_gene_cfg,\n",
    "        mutation=mutation_cfg,\n",
    "    )\n",
    "\n",
    "    # Population / speciation settings wrapped around the genome definition.\n",
    "    species_cfg = DefaultSpecies(\n",
    "        genome=genome_cfg,\n",
    "        pop_size=10000,\n",
    "        species_size=100,\n",
    "        survival_threshold=0.01,\n",
    "    )\n",
    "\n",
    "    pipeline = Pipeline(\n",
    "        algorithm=NEAT(species=species_cfg),\n",
    "        problem=Jumanji_2048(max_step=1000),\n",
    "        generation_limit=10000,\n",
    "        fitness_target=13000,\n",
    "    )\n",
    "\n",
    "    # build the initial pipeline state\n",
    "    state = pipeline.setup()\n",
    "    # run until auto_run terminates; it hands back the final state and `best`\n",
    "    state, best = pipeline.auto_run(state)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "outputs": [],
   "source": [
    "# Take the genome object exposed by the pipeline's algorithm; the cells below call its transform() and forward().\n",
    "genome = pipeline.algorithm.genome"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-05T05:08:14.332101Z",
     "start_time": "2024-06-05T05:08:14.324101300Z"
    }
   },
   "id": "a0915ecf8179f347"
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "outputs": [],
   "source": [
    "# `best` is the second value returned by pipeline.auto_run; its items are unpacked as positional arguments here.\n",
    "# NOTE(review): presumably these are the best individual's node/connection arrays -- TODO confirm against auto_run.\n",
    "transformed = genome.transform(state, *best)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-05T05:08:49.132030500Z",
     "start_time": "2024-06-05T05:08:48.495809200Z"
    }
   },
   "id": "cd1fa65e8a9d6e13"
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "outputs": [],
   "source": [
    "def policy(board):\n",
    "    \"\"\"Evaluate the transformed network on `board` and return the output of `genome.forward`.\n",
    "\n",
    "    Relies on the notebook globals `genome`, `state` and `transformed` defined by earlier cells,\n",
    "    so those cells must have been run first.\n",
    "\n",
    "    NOTE(review): presumably the result holds one score per action -- TODO confirm.\n",
    "    NOTE(review): the genome was built with num_inputs=16; confirm callers pass `board` in the\n",
    "    shape that `genome.forward` expects.\n",
    "    \"\"\"\n",
    "    return genome.forward(state, transformed, board)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-05T05:09:32.355055100Z",
     "start_time": "2024-06-05T05:09:32.350057Z"
    }
   },
   "id": "61bc1895af304651"
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 0],\n",
      "       [0, 0, 0, 0],\n",
      "       [0, 0, 0, 0],\n",
      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(2, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 2],\n",
      "       [0, 1, 0, 0],\n",
      "       [0, 0, 0, 0],\n",
      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(4, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 2],\n",
      "       [0, 0, 0, 0],\n",
      "       [0, 0, 0, 0],\n",
      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(4, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 2, 2],\n",
      "       [0, 0, 0, 0],\n",
      "       [0, 0, 0, 0],\n",
      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(4, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 3],\n",
      "       [0, 0, 0, 0],\n",
      "       [2, 0, 0, 0],\n",
      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 1, 3],\n",
      "       [0, 0, 0, 0],\n",
      "       [0, 0, 0, 1],\n",
      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 1, 3],\n",
      "       [0, 0, 0, 1],\n",
      "       [0, 0, 0, 0],\n",
      "       [0, 1, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 1, 3],\n",
      "       [0, 0, 0, 1],\n",
      "       [0, 0, 0, 0],\n",
      "       [0, 0, 2, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 1, 3],\n",
      "       [0, 0, 2, 1],\n",
      "       [0, 1, 0, 0],\n",
      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 1, 3],\n",
      "       [1, 0, 2, 1],\n",
      "       [0, 0, 0, 0],\n",
      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 3, 1, 3],\n",
      "       [0, 1, 2, 1],\n",
      "       [0, 1, 0, 0],\n",
      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 3, 1, 3],\n",
      "       [0, 2, 2, 1],\n",
      "       [1, 0, 0, 0],\n",
      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 3],\n",
      "       [0, 2, 2, 1],\n",
      "       [0, 0, 0, 0],\n",
      "       [0, 0, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 3],\n",
      "       [0, 2, 2, 1],\n",
      "       [0, 0, 1, 0],\n",
      "       [0, 0, 2, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 3],\n",
      "       [0, 0, 3, 1],\n",
      "       [0, 0, 1, 1],\n",
      "       [0, 0, 0, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 3],\n",
      "       [0, 0, 3, 2],\n",
      "       [0, 0, 1, 2],\n",
      "       [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 3],\n",
      "       [0, 0, 3, 3],\n",
      "       [0, 1, 1, 0],\n",
      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 4],\n",
      "       [0, 1, 3, 1],\n",
      "       [0, 0, 1, 0],\n",
      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 4],\n",
      "       [0, 1, 3, 1],\n",
      "       [0, 0, 0, 1],\n",
      "       [0, 1, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 4],\n",
      "       [0, 2, 3, 2],\n",
      "       [0, 0, 0, 0],\n",
      "       [0, 0, 0, 1]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 4],\n",
      "       [0, 2, 3, 2],\n",
      "       [0, 0, 0, 1],\n",
      "       [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 4],\n",
      "       [1, 2, 3, 2],\n",
      "       [0, 0, 0, 1],\n",
      "       [0, 0, 0, 1]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 4],\n",
      "       [1, 2, 3, 2],\n",
      "       [0, 0, 0, 2],\n",
      "       [0, 0, 0, 1]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 4],\n",
      "       [1, 2, 3, 3],\n",
      "       [0, 0, 0, 1],\n",
      "       [0, 1, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 4],\n",
      "       [1, 2, 3, 3],\n",
      "       [0, 1, 0, 1],\n",
      "       [0, 1, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 4],\n",
      "       [1, 2, 3, 3],\n",
      "       [0, 2, 1, 1],\n",
      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 4],\n",
      "       [1, 3, 3, 3],\n",
      "       [0, 0, 1, 1],\n",
      "       [0, 0, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 4],\n",
      "       [1, 0, 3, 3],\n",
      "       [0, 1, 1, 2],\n",
      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 4],\n",
      "       [1, 1, 3, 3],\n",
      "       [0, 0, 1, 2],\n",
      "       [0, 0, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 4],\n",
      "       [1, 1, 3, 3],\n",
      "       [0, 1, 2, 2],\n",
      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 4],\n",
      "       [1, 2, 3, 3],\n",
      "       [0, 1, 2, 2],\n",
      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(24., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 4],\n",
      "       [1, 1, 2, 4],\n",
      "       [0, 0, 1, 3],\n",
      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 5],\n",
      "       [1, 1, 2, 3],\n",
      "       [0, 1, 1, 0],\n",
      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 5],\n",
      "       [1, 2, 2, 3],\n",
      "       [0, 0, 1, 0],\n",
      "       [0, 1, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 5],\n",
      "       [1, 2, 2, 3],\n",
      "       [0, 1, 1, 0],\n",
      "       [0, 0, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 5],\n",
      "       [1, 2, 2, 3],\n",
      "       [0, 1, 2, 1],\n",
      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 5],\n",
      "       [1, 2, 3, 3],\n",
      "       [0, 1, 1, 1],\n",
      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 5],\n",
      "       [1, 1, 2, 4],\n",
      "       [0, 0, 1, 2],\n",
      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 5],\n",
      "       [0, 2, 2, 4],\n",
      "       [0, 0, 1, 2],\n",
      "       [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 5],\n",
      "       [1, 2, 2, 4],\n",
      "       [0, 0, 1, 2],\n",
      "       [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 5],\n",
      "       [2, 2, 2, 4],\n",
      "       [1, 0, 1, 2],\n",
      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 5],\n",
      "       [1, 2, 2, 4],\n",
      "       [0, 0, 1, 2],\n",
      "       [0, 1, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 5],\n",
      "       [1, 2, 2, 4],\n",
      "       [0, 1, 1, 2],\n",
      "       [0, 0, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 5],\n",
      "       [1, 2, 2, 4],\n",
      "       [0, 1, 2, 2],\n",
      "       [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 5],\n",
      "       [2, 2, 3, 4],\n",
      "       [1, 1, 0, 2],\n",
      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 5],\n",
      "       [0, 3, 3, 4],\n",
      "       [0, 0, 2, 2],\n",
      "       [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 5],\n",
      "       [1, 3, 3, 4],\n",
      "       [0, 0, 2, 2],\n",
      "       [0, 0, 0, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(24., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 5],\n",
      "       [0, 1, 4, 4],\n",
      "       [0, 1, 0, 3],\n",
      "       [0, 0, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 5],\n",
      "       [0, 2, 4, 4],\n",
      "       [0, 1, 0, 3],\n",
      "       [0, 0, 0, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 5],\n",
      "       [0, 0, 2, 5],\n",
      "       [1, 0, 1, 3],\n",
      "       [0, 0, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(64., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
      "       [1, 0, 2, 3],\n",
      "       [0, 1, 1, 1],\n",
      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
      "       [1, 1, 2, 3],\n",
      "       [0, 0, 1, 1],\n",
      "       [2, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
      "       [1, 1, 2, 3],\n",
      "       [2, 0, 1, 1],\n",
      "       [0, 2, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
      "       [1, 1, 2, 3],\n",
      "       [2, 2, 1, 1],\n",
      "       [0, 0, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
      "       [1, 1, 2, 3],\n",
      "       [2, 2, 2, 1],\n",
      "       [0, 0, 2, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
      "       [1, 1, 3, 3],\n",
      "       [2, 2, 2, 1],\n",
      "       [0, 0, 0, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(28., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
      "       [0, 1, 2, 4],\n",
      "       [0, 2, 3, 1],\n",
      "       [0, 0, 0, 2]], dtype=int32), action_mask=Array([False, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
      "       [1, 2, 4, 0],\n",
      "       [2, 3, 1, 1],\n",
      "       [2, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
      "       [1, 2, 4, 1],\n",
      "       [3, 3, 1, 0],\n",
      "       [0, 0, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
      "       [1, 2, 4, 2],\n",
      "       [3, 3, 1, 0],\n",
      "       [0, 0, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
      "       [1, 2, 4, 2],\n",
      "       [3, 3, 2, 0],\n",
      "       [0, 0, 1, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
      "       [1, 2, 4, 2],\n",
      "       [0, 0, 4, 2],\n",
      "       [0, 1, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(40., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
      "       [1, 2, 5, 3],\n",
      "       [0, 1, 1, 1],\n",
      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
      "       [1, 2, 5, 3],\n",
      "       [0, 0, 1, 2],\n",
      "       [0, 0, 0, 1]], dtype=int32), action_mask=Array([False, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
      "       [1, 2, 5, 3],\n",
      "       [1, 2, 1, 0],\n",
      "       [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
      "       [2, 3, 5, 3],\n",
      "       [1, 1, 1, 0],\n",
      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
      "       [2, 3, 5, 3],\n",
      "       [0, 0, 1, 2],\n",
      "       [0, 1, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
      "       [2, 3, 5, 3],\n",
      "       [0, 1, 1, 2],\n",
      "       [0, 1, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
      "       [2, 3, 5, 3],\n",
      "       [1, 2, 1, 2],\n",
      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([False, False,  True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 0, 0],\n",
      "       [3, 4, 1, 6],\n",
      "       [2, 3, 5, 3],\n",
      "       [1, 2, 1, 2]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 1, 6],\n",
      "       [2, 4, 5, 3],\n",
      "       [1, 3, 1, 2],\n",
      "       [0, 2, 0, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 1, 6],\n",
      "       [2, 4, 5, 3],\n",
      "       [1, 3, 1, 2],\n",
      "       [1, 0, 2, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 1, 6],\n",
      "       [2, 4, 5, 3],\n",
      "       [2, 3, 1, 2],\n",
      "       [0, 1, 2, 1]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 1, 6],\n",
      "       [3, 4, 5, 3],\n",
      "       [1, 3, 1, 2],\n",
      "       [0, 1, 2, 1]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
      "       [1, 4, 5, 3],\n",
      "       [0, 3, 1, 2],\n",
      "       [2, 1, 2, 1]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
      "       [1, 4, 5, 3],\n",
      "       [2, 3, 1, 2],\n",
      "       [1, 1, 2, 1]], dtype=int32), action_mask=Array([False,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
      "       [1, 4, 5, 3],\n",
      "       [2, 3, 1, 2],\n",
      "       [2, 2, 1, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
      "       [1, 4, 5, 3],\n",
      "       [3, 3, 2, 2],\n",
      "       [1, 2, 0, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(24., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
      "       [1, 4, 5, 3],\n",
      "       [0, 1, 4, 3],\n",
      "       [0, 1, 2, 1]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
      "       [1, 4, 5, 4],\n",
      "       [1, 2, 4, 1],\n",
      "       [0, 0, 2, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
      "       [2, 4, 5, 4],\n",
      "       [0, 2, 4, 1],\n",
      "       [0, 0, 2, 1]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
      "       [2, 4, 5, 4],\n",
      "       [0, 2, 4, 2],\n",
      "       [0, 0, 2, 1]], dtype=int32), action_mask=Array([False, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
      "       [2, 4, 5, 4],\n",
      "       [2, 4, 2, 0],\n",
      "       [2, 1, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(40., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
      "       [3, 5, 5, 4],\n",
      "       [2, 1, 2, 0],\n",
      "       [0, 1, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
      "       [3, 5, 5, 4],\n",
      "       [2, 2, 2, 0],\n",
      "       [0, 0, 1, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
      "       [3, 5, 5, 4],\n",
      "       [2, 2, 2, 1],\n",
      "       [0, 2, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
      "       [3, 5, 5, 4],\n",
      "       [2, 3, 2, 1],\n",
      "       [0, 1, 1, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(68., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
      "       [0, 3, 6, 4],\n",
      "       [2, 3, 2, 1],\n",
      "       [0, 1, 0, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
      "       [2, 4, 6, 4],\n",
      "       [1, 1, 2, 1],\n",
      "       [0, 0, 0, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
      "       [2, 4, 6, 4],\n",
      "       [0, 2, 2, 1],\n",
      "       [0, 0, 1, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
      "       [2, 4, 6, 4],\n",
      "       [1, 0, 3, 1],\n",
      "       [0, 0, 1, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
      "       [2, 4, 6, 4],\n",
      "       [0, 1, 3, 1],\n",
      "       [0, 1, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
      "       [2, 4, 6, 4],\n",
      "       [0, 2, 3, 1],\n",
      "       [0, 2, 1, 2]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
      "       [2, 4, 6, 4],\n",
      "       [0, 3, 3, 1],\n",
      "       [1, 0, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
      "       [2, 4, 6, 4],\n",
      "       [1, 3, 3, 1],\n",
      "       [0, 1, 1, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
      "       [2, 4, 6, 4],\n",
      "       [2, 1, 4, 1],\n",
      "       [0, 0, 2, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
      "       [3, 4, 6, 4],\n",
      "       [1, 1, 4, 1],\n",
      "       [0, 0, 2, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
      "       [3, 4, 6, 4],\n",
      "       [0, 2, 4, 1],\n",
      "       [0, 0, 1, 3]], dtype=int32), action_mask=Array([False, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
      "       [3, 4, 6, 4],\n",
      "       [2, 4, 1, 0],\n",
      "       [1, 3, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
      "       [3, 5, 6, 4],\n",
      "       [2, 3, 1, 1],\n",
      "       [1, 0, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
      "       [3, 5, 6, 4],\n",
      "       [2, 3, 1, 2],\n",
      "       [1, 0, 0, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
      "       [3, 5, 6, 4],\n",
      "       [2, 3, 1, 2],\n",
      "       [0, 0, 2, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
      "       [3, 5, 6, 4],\n",
      "       [2, 3, 1, 3],\n",
      "       [0, 0, 2, 1]], dtype=int32), action_mask=Array([False, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
      "       [3, 5, 6, 4],\n",
      "       [2, 3, 1, 3],\n",
      "       [2, 1, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
      "       [3, 5, 6, 4],\n",
      "       [3, 3, 2, 3],\n",
      "       [0, 1, 2, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(24., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
      "       [4, 5, 6, 4],\n",
      "       [1, 3, 3, 3],\n",
      "       [0, 1, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 2, 1, 6],\n",
      "       [1, 5, 6, 4],\n",
      "       [0, 3, 3, 3],\n",
      "       [2, 1, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 2, 1, 6],\n",
      "       [1, 5, 6, 4],\n",
      "       [2, 3, 3, 3],\n",
      "       [0, 1, 0, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 2, 1, 6],\n",
      "       [1, 5, 6, 4],\n",
      "       [0, 2, 3, 4],\n",
      "       [1, 0, 0, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 2, 1, 6],\n",
      "       [2, 5, 6, 5],\n",
      "       [1, 2, 3, 2],\n",
      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([False, False,  True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
      "       [5, 2, 1, 6],\n",
      "       [2, 5, 6, 5],\n",
      "       [1, 2, 3, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 2, 2, 6],\n",
      "       [2, 5, 6, 5],\n",
      "       [1, 2, 3, 2],\n",
      "       [0, 1, 0, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 5, 3, 6],\n",
      "       [2, 5, 6, 5],\n",
      "       [1, 2, 3, 2],\n",
      "       [0, 1, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(64., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 6, 3, 6],\n",
      "       [1, 2, 6, 5],\n",
      "       [0, 1, 3, 2],\n",
      "       [0, 1, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 6, 3, 6],\n",
      "       [1, 2, 6, 5],\n",
      "       [0, 2, 3, 2],\n",
      "       [0, 0, 1, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 6, 3, 6],\n",
      "       [1, 3, 6, 5],\n",
      "       [0, 0, 3, 2],\n",
      "       [1, 0, 1, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 6, 3, 6],\n",
      "       [2, 3, 6, 5],\n",
      "       [1, 0, 3, 2],\n",
      "       [0, 0, 1, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 6, 3, 6],\n",
      "       [1, 3, 6, 5],\n",
      "       [0, 0, 3, 2],\n",
      "       [1, 0, 1, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 6, 3, 6],\n",
      "       [2, 3, 6, 5],\n",
      "       [0, 1, 3, 2],\n",
      "       [0, 0, 1, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 6, 3, 6],\n",
      "       [2, 3, 6, 5],\n",
      "       [0, 1, 3, 2],\n",
      "       [0, 0, 1, 2]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 6, 3, 6],\n",
      "       [2, 3, 6, 5],\n",
      "       [1, 1, 3, 3],\n",
      "       [0, 0, 1, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 6, 3, 6],\n",
      "       [2, 3, 6, 5],\n",
      "       [0, 0, 2, 4],\n",
      "       [0, 1, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 6, 3, 6],\n",
      "       [2, 3, 6, 5],\n",
      "       [1, 1, 2, 4],\n",
      "       [0, 0, 0, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 6, 3, 6],\n",
      "       [2, 3, 6, 5],\n",
      "       [0, 2, 2, 4],\n",
      "       [1, 0, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 6, 3, 6],\n",
      "       [2, 3, 6, 5],\n",
      "       [1, 2, 2, 4],\n",
      "       [0, 1, 0, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 6, 3, 6],\n",
      "       [2, 3, 6, 5],\n",
      "       [2, 1, 3, 4],\n",
      "       [0, 0, 0, 2]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 6, 3, 6],\n",
      "       [3, 3, 6, 5],\n",
      "       [0, 1, 3, 4],\n",
      "       [0, 0, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
      "       [0, 3, 6, 5],\n",
      "       [0, 1, 3, 4],\n",
      "       [0, 1, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
      "       [0, 3, 6, 5],\n",
      "       [0, 2, 3, 4],\n",
      "       [0, 2, 1, 2]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
      "       [0, 3, 6, 5],\n",
      "       [0, 3, 3, 4],\n",
      "       [1, 0, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
      "       [1, 4, 6, 5],\n",
      "       [0, 2, 3, 4],\n",
      "       [0, 0, 1, 2]], dtype=int32), action_mask=Array([False, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
      "       [1, 4, 6, 5],\n",
      "       [2, 3, 4, 0],\n",
      "       [1, 2, 2, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
      "       [1, 4, 6, 5],\n",
      "       [0, 2, 3, 4],\n",
      "       [1, 0, 1, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
      "       [2, 4, 6, 5],\n",
      "       [1, 2, 3, 4],\n",
      "       [0, 0, 1, 3]], dtype=int32), action_mask=Array([False, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
      "       [2, 4, 6, 5],\n",
      "       [1, 2, 3, 4],\n",
      "       [1, 3, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
      "       [2, 4, 6, 5],\n",
      "       [2, 2, 3, 4],\n",
      "       [1, 3, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
      "       [3, 4, 6, 5],\n",
      "       [1, 2, 3, 4],\n",
      "       [1, 3, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
      "       [3, 4, 6, 5],\n",
      "       [2, 2, 3, 4],\n",
      "       [0, 3, 1, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
      "       [3, 4, 6, 5],\n",
      "       [0, 3, 3, 4],\n",
      "       [1, 0, 3, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
      "       [3, 4, 6, 5],\n",
      "       [1, 3, 4, 4],\n",
      "       [0, 0, 1, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
      "       [3, 4, 6, 5],\n",
      "       [0, 1, 3, 5],\n",
      "       [0, 1, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(68., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
      "       [3, 4, 6, 6],\n",
      "       [0, 2, 3, 2],\n",
      "       [2, 0, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(128., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 6, 2],\n",
      "       [2, 2, 3, 0],\n",
      "       [0, 1, 1, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 6, 2],\n",
      "       [0, 2, 3, 3],\n",
      "       [0, 0, 0, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 6, 2],\n",
      "       [0, 0, 2, 4],\n",
      "       [0, 1, 0, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 6, 2],\n",
      "       [0, 1, 2, 4],\n",
      "       [0, 1, 0, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 6, 2],\n",
      "       [0, 2, 2, 4],\n",
      "       [0, 0, 1, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 6, 2],\n",
      "       [0, 0, 3, 4],\n",
      "       [0, 1, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 6, 2],\n",
      "       [0, 1, 3, 4],\n",
      "       [1, 0, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 6, 2],\n",
      "       [1, 1, 3, 4],\n",
      "       [1, 0, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 6, 2],\n",
      "       [2, 1, 3, 4],\n",
      "       [1, 0, 1, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 6, 2],\n",
      "       [2, 1, 3, 4],\n",
      "       [0, 1, 2, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 6, 2],\n",
      "       [2, 2, 3, 4],\n",
      "       [0, 1, 2, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 6, 2],\n",
      "       [1, 3, 3, 4],\n",
      "       [0, 0, 1, 3]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 6, 2],\n",
      "       [0, 1, 4, 4],\n",
      "       [0, 1, 1, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 6, 2],\n",
      "       [0, 2, 4, 4],\n",
      "       [1, 0, 1, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 6, 2],\n",
      "       [1, 2, 4, 4],\n",
      "       [0, 1, 1, 3]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 6, 2],\n",
      "       [2, 1, 2, 5],\n",
      "       [0, 0, 2, 3]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 6, 2],\n",
      "       [2, 1, 3, 5],\n",
      "       [1, 0, 0, 3]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 6, 2],\n",
      "       [2, 1, 3, 5],\n",
      "       [0, 1, 1, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 6, 2],\n",
      "       [2, 2, 3, 5],\n",
      "       [0, 1, 1, 3]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 6, 2],\n",
      "       [1, 3, 3, 5],\n",
      "       [0, 0, 2, 3]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 6, 2],\n",
      "       [0, 1, 4, 5],\n",
      "       [0, 2, 2, 3]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 6, 2],\n",
      "       [0, 1, 4, 5],\n",
      "       [1, 0, 3, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 6, 2],\n",
      "       [1, 1, 4, 5],\n",
      "       [0, 1, 3, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 6, 2],\n",
      "       [1, 2, 4, 5],\n",
      "       [0, 1, 3, 3]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 6, 2],\n",
      "       [1, 2, 4, 5],\n",
      "       [0, 1, 1, 4]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 6, 2],\n",
      "       [1, 2, 4, 5],\n",
      "       [0, 2, 2, 4]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 6, 2],\n",
      "       [1, 3, 4, 5],\n",
      "       [1, 0, 2, 4]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 6, 2],\n",
      "       [2, 3, 4, 5],\n",
      "       [0, 1, 2, 4]], dtype=int32), action_mask=Array([False, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 6, 2],\n",
      "       [2, 3, 4, 5],\n",
      "       [1, 2, 4, 1]], dtype=int32), action_mask=Array([ True, False,  True, False], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 6, 2],\n",
      "       [2, 3, 5, 5],\n",
      "       [1, 2, 1, 1]], dtype=int32), action_mask=Array([False,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(68., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 6, 2],\n",
      "       [2, 3, 6, 1],\n",
      "       [1, 2, 2, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(128., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 7, 2],\n",
      "       [2, 3, 2, 1],\n",
      "       [1, 2, 2, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 7, 2],\n",
      "       [2, 3, 3, 1],\n",
      "       [1, 2, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 7, 2],\n",
      "       [2, 3, 3, 2],\n",
      "       [1, 2, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 7, 3],\n",
      "       [2, 3, 3, 1],\n",
      "       [1, 2, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 7, 3],\n",
      "       [2, 3, 3, 2],\n",
      "       [1, 2, 0, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 7, 3],\n",
      "       [0, 2, 4, 2],\n",
      "       [1, 1, 2, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 7, 3],\n",
      "       [1, 2, 4, 2],\n",
      "       [1, 1, 2, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 7, 3],\n",
      "       [2, 2, 4, 2],\n",
      "       [2, 1, 2, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [3, 4, 7, 3],\n",
      "       [3, 2, 4, 2],\n",
      "       [1, 1, 2, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
      "       [4, 4, 7, 3],\n",
      "       [1, 2, 4, 2],\n",
      "       [1, 1, 2, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [2, 4, 7, 3],\n",
      "       [0, 2, 4, 2],\n",
      "       [1, 1, 2, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [2, 4, 7, 3],\n",
      "       [1, 2, 4, 2],\n",
      "       [1, 1, 2, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [2, 4, 7, 3],\n",
      "       [2, 2, 4, 2],\n",
      "       [1, 1, 2, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [3, 4, 7, 3],\n",
      "       [1, 2, 4, 2],\n",
      "       [1, 1, 2, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [3, 4, 7, 3],\n",
      "       [2, 2, 4, 2],\n",
      "       [1, 1, 2, 1]], dtype=int32), action_mask=Array([False,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [3, 4, 7, 3],\n",
      "       [3, 4, 2, 0],\n",
      "       [2, 2, 1, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(48., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 3],\n",
      "       [2, 2, 2, 1],\n",
      "       [1, 0, 1, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 3],\n",
      "       [1, 2, 3, 1],\n",
      "       [0, 0, 0, 2]], dtype=int32), action_mask=Array([False, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 3],\n",
      "       [1, 2, 3, 1],\n",
      "       [2, 0, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 3],\n",
      "       [1, 2, 3, 2],\n",
      "       [2, 0, 0, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 3],\n",
      "       [1, 2, 3, 2],\n",
      "       [1, 0, 2, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 3],\n",
      "       [2, 2, 3, 2],\n",
      "       [1, 0, 2, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 3],\n",
      "       [0, 3, 3, 2],\n",
      "       [1, 1, 2, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 3],\n",
      "       [1, 3, 3, 2],\n",
      "       [1, 1, 2, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 3],\n",
      "       [2, 3, 3, 2],\n",
      "       [1, 1, 2, 1]], dtype=int32), action_mask=Array([False,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 3],\n",
      "       [2, 4, 2, 1],\n",
      "       [2, 2, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 3],\n",
      "       [3, 4, 2, 1],\n",
      "       [0, 2, 1, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 3],\n",
      "       [3, 4, 2, 2],\n",
      "       [0, 2, 1, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 3],\n",
      "       [1, 3, 4, 3],\n",
      "       [0, 0, 2, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 4],\n",
      "       [1, 3, 4, 2],\n",
      "       [0, 0, 2, 1]], dtype=int32), action_mask=Array([False, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 4],\n",
      "       [1, 3, 4, 2],\n",
      "       [2, 1, 0, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 4],\n",
      "       [1, 3, 4, 2],\n",
      "       [1, 0, 2, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 4],\n",
      "       [2, 3, 4, 3],\n",
      "       [1, 0, 2, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 4],\n",
      "       [2, 3, 4, 3],\n",
      "       [1, 0, 1, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 4],\n",
      "       [2, 3, 4, 3],\n",
      "       [1, 0, 2, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 4],\n",
      "       [2, 3, 4, 3],\n",
      "       [0, 1, 1, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 4],\n",
      "       [2, 3, 4, 4],\n",
      "       [0, 1, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 5],\n",
      "       [2, 3, 4, 2],\n",
      "       [0, 1, 1, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 5],\n",
      "       [2, 3, 4, 2],\n",
      "       [1, 0, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 5],\n",
      "       [2, 3, 4, 3],\n",
      "       [1, 0, 1, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 5],\n",
      "       [2, 3, 4, 3],\n",
      "       [1, 0, 1, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 5],\n",
      "       [2, 3, 4, 3],\n",
      "       [0, 1, 2, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 5],\n",
      "       [2, 3, 4, 3],\n",
      "       [1, 0, 1, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 5],\n",
      "       [2, 3, 4, 4],\n",
      "       [1, 0, 1, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 5],\n",
      "       [1, 2, 3, 5],\n",
      "       [0, 0, 1, 2]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(64., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 6],\n",
      "       [1, 2, 3, 2],\n",
      "       [0, 1, 1, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 6],\n",
      "       [1, 2, 3, 2],\n",
      "       [1, 0, 0, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 6],\n",
      "       [2, 2, 3, 3],\n",
      "       [0, 0, 0, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(24., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 6],\n",
      "       [0, 0, 3, 4],\n",
      "       [0, 1, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 6],\n",
      "       [0, 1, 3, 4],\n",
      "       [0, 1, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 6],\n",
      "       [0, 2, 3, 4],\n",
      "       [0, 1, 0, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 6],\n",
      "       [0, 2, 3, 4],\n",
      "       [0, 0, 1, 2]], dtype=int32), action_mask=Array([False, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 6],\n",
      "       [2, 3, 4, 1],\n",
      "       [1, 2, 0, 0]], dtype=int32), action_mask=Array([False,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 6],\n",
      "       [2, 3, 4, 1],\n",
      "       [0, 1, 1, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 6],\n",
      "       [2, 3, 4, 1],\n",
      "       [0, 1, 2, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 6],\n",
      "       [2, 3, 4, 1],\n",
      "       [0, 1, 1, 3]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 6],\n",
      "       [2, 3, 4, 1],\n",
      "       [1, 0, 2, 3]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 6],\n",
      "       [2, 3, 4, 1],\n",
      "       [1, 1, 2, 3]], dtype=int32), action_mask=Array([False,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 6],\n",
      "       [2, 3, 4, 1],\n",
      "       [2, 2, 3, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 6],\n",
      "       [3, 3, 4, 2],\n",
      "       [1, 2, 3, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 6],\n",
      "       [0, 4, 4, 2],\n",
      "       [1, 1, 2, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 6],\n",
      "       [1, 4, 4, 2],\n",
      "       [1, 1, 2, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 6],\n",
      "       [2, 4, 4, 2],\n",
      "       [1, 1, 2, 3]], dtype=int32), action_mask=Array([False,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 5, 7, 6],\n",
      "       [2, 5, 2, 0],\n",
      "       [2, 2, 3, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(72., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
      "       [4, 6, 7, 6],\n",
      "       [3, 2, 2, 1],\n",
      "       [1, 0, 3, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(128., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
      "       [4, 2, 7, 6],\n",
      "       [3, 0, 2, 1],\n",
      "       [1, 0, 3, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
      "       [4, 2, 7, 6],\n",
      "       [2, 3, 2, 1],\n",
      "       [0, 1, 3, 2]], dtype=int32), action_mask=Array([False, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
      "       [4, 2, 7, 6],\n",
      "       [2, 3, 2, 1],\n",
      "       [1, 3, 2, 1]], dtype=int32), action_mask=Array([ True, False,  True, False], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(28., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
      "       [4, 2, 7, 6],\n",
      "       [2, 4, 3, 2],\n",
      "       [1, 1, 0, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
      "       [4, 2, 7, 6],\n",
      "       [2, 4, 3, 2],\n",
      "       [0, 0, 2, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
      "       [4, 2, 7, 6],\n",
      "       [2, 4, 3, 3],\n",
      "       [1, 0, 2, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
      "       [4, 2, 7, 6],\n",
      "       [2, 2, 4, 4],\n",
      "       [0, 0, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
      "       [4, 3, 7, 6],\n",
      "       [2, 0, 4, 4],\n",
      "       [1, 0, 1, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
      "       [4, 3, 7, 6],\n",
      "       [0, 0, 2, 5],\n",
      "       [0, 2, 2, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
      "       [4, 3, 7, 6],\n",
      "       [0, 2, 3, 5],\n",
      "       [0, 2, 0, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
      "       [4, 3, 7, 6],\n",
      "       [0, 3, 3, 5],\n",
      "       [0, 1, 0, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
      "       [4, 4, 7, 6],\n",
      "       [1, 1, 3, 5],\n",
      "       [0, 0, 0, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
      "       [0, 5, 7, 6],\n",
      "       [0, 2, 3, 5],\n",
      "       [0, 1, 0, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
      "       [0, 5, 7, 6],\n",
      "       [1, 2, 3, 5],\n",
      "       [0, 0, 1, 2]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
      "       [1, 5, 7, 6],\n",
      "       [0, 2, 3, 5],\n",
      "       [1, 0, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
      "       [2, 5, 7, 6],\n",
      "       [0, 2, 3, 5],\n",
      "       [1, 0, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
      "       [2, 5, 7, 6],\n",
      "       [1, 2, 3, 5],\n",
      "       [1, 0, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
      "       [2, 5, 7, 6],\n",
      "       [2, 2, 3, 5],\n",
      "       [0, 1, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
      "       [3, 5, 7, 6],\n",
      "       [0, 2, 3, 5],\n",
      "       [1, 1, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
      "       [3, 5, 7, 6],\n",
      "       [1, 2, 3, 5],\n",
      "       [1, 1, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
      "       [3, 5, 7, 6],\n",
      "       [2, 2, 3, 5],\n",
      "       [1, 1, 1, 2]], dtype=int32), action_mask=Array([False,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
      "       [3, 5, 7, 6],\n",
      "       [3, 3, 5, 1],\n",
      "       [2, 1, 2, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
      "       [4, 5, 7, 6],\n",
      "       [2, 3, 5, 1],\n",
      "       [1, 1, 2, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
      "       [4, 5, 7, 6],\n",
      "       [2, 3, 5, 1],\n",
      "       [1, 0, 2, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
      "       [4, 5, 7, 6],\n",
      "       [2, 3, 5, 1],\n",
      "       [0, 1, 1, 3]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
      "       [4, 5, 7, 6],\n",
      "       [2, 3, 5, 1],\n",
      "       [2, 0, 2, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
      "       [4, 5, 7, 6],\n",
      "       [3, 3, 5, 1],\n",
      "       [1, 0, 2, 3]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
      "       [4, 5, 7, 6],\n",
      "       [0, 4, 5, 1],\n",
      "       [1, 1, 2, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(2, dtype=int8), reward=Array(0., dtype=float32), discount=Array(0., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
      "       [4, 5, 7, 6],\n",
      "       [1, 4, 5, 1],\n",
      "       [2, 1, 2, 3]], dtype=int32), action_mask=Array([False, False, False, False], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "3004.0\n"
     ]
    }
   ],
   "source": [
    "import jax, jumanji\n",
    "\n",
    "env = jumanji.make(\"Game2048-v1\")\n",
    "key = jax.random.PRNGKey(48)\n",
    "jit_reset = jax.jit(env.reset)\n",
    "jit_step = jax.jit(env.step)\n",
    "state, timestep = jax.jit(env.reset)(key)\n",
    "jit_policy = jax.jit(policy)\n",
    "total_reward = 0\n",
    "while True:\n",
    "    board, action_mask = timestep[\"observation\"]\n",
    "    action = jit_policy(timestep[\"observation\"][0].reshape(-1))\n",
    "    score_with_mask = jnp.where(action_mask, action, -jnp.inf)\n",
    "    action = jnp.argmax(score_with_mask)\n",
    "    state, timestep = jit_step(state, action)\n",
    "    done = jnp.all(~timestep[\"observation\"][1])\n",
    "    print(timestep)\n",
    "    total_reward += timestep[\"reward\"]\n",
    "    if done:\n",
    "        break\n",
    "print(total_reward)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-05T05:15:43.041491500Z",
     "start_time": "2024-06-05T05:15:37.325953600Z"
    }
   },
   "id": "f166e09c5be1a8fb"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false
   },
   "id": "187326d08ac1eeb4"
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}