tensorneat-mend/examples/jumanji/2048_test.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "initial_id",
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "end_time": "2024-06-05T07:40:13.841629100Z",
     "start_time": "2024-06-05T07:40:13.076164500Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "initializing\n",
      "initializing finished\n"
     ]
    }
   ],
   "source": [
    "import jax.numpy as jnp\n",
    "\n",
    "from pipeline import Pipeline\n",
    "from algorithm.neat import *\n",
    "from algorithm.neat.gene.node.default_without_response import NodeGeneWithoutResponse\n",
    "\n",
    "from problem.rl_env.jumanji.jumanji_2048 import Jumanji_2048\n",
    "from tensorneat.utils import Act, Agg\n",
    "\n",
    "pipeline = Pipeline(\n",
    "    algorithm=NEAT(\n",
    "        species=DefaultSpecies(\n",
    "            genome=DefaultGenome(\n",
    "                num_inputs=16,\n",
    "                num_outputs=4,\n",
    "                max_nodes=100,\n",
    "                max_conns=1000,\n",
    "                node_gene=NodeGeneWithoutResponse(\n",
    "                    activation_default=Act.sigmoid,\n",
    "                    activation_options=(\n",
    "                        Act.sigmoid,\n",
    "                        Act.relu,\n",
    "                        Act.tanh,\n",
    "                        Act.identity,\n",
    "                    ),\n",
    "                    aggregation_default=Agg.sum,\n",
    "                    aggregation_options=(Agg.sum,),\n",
    "                    activation_replace_rate=0.02,\n",
    "                    aggregation_replace_rate=0.02,\n",
    "                    bias_mutate_rate=0.03,\n",
    "                    bias_init_std=0.5,\n",
    "                    bias_mutate_power=0.2,\n",
    "                    bias_replace_rate=0.01,\n",
    "                ),\n",
    "                conn_gene=DefaultConnGene(\n",
    "                    weight_mutate_rate=0.015,\n",
    "                    weight_replace_rate=0.003,\n",
    "                    weight_mutate_power=0.5,\n",
    "                ),\n",
    "                mutation=DefaultMutation(\n",
    "                    node_add=0.1, conn_add=0.2, conn_delete=0.2\n",
    "                ),\n",
    "            ),\n",
    "            pop_size=1000,\n",
    "            species_size=5,\n",
    "            survival_threshold=0.1,\n",
    "            max_stagnation=7,\n",
    "            genome_elitism=3,\n",
    "            compatibility_threshold=1.2,\n",
    "        ),\n",
    "    ),\n",
    "    problem=Jumanji_2048(max_step=10000, repeat_times=5),\n",
    "    generation_limit=100,\n",
    "    fitness_target=13000,\n",
    "    save_path=\"2048.pkl\",\n",
    ")\n",
    "state = pipeline.setup()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "data = np.load('2048.npz')\n",
    "nodes, conns = data['nodes'], data['conns']"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-05T07:40:13.932015100Z",
     "start_time": "2024-06-05T07:40:13.876631500Z"
    }
   },
   "id": "a0915ecf8179f347"
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "outputs": [],
   "source": [
    "genome = pipeline.algorithm.species.genome\n",
    "transformed = genome.transform(state, nodes, conns)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-05T07:40:14.585804800Z",
     "start_time": "2024-06-05T07:40:14.568805Z"
    }
   },
   "id": "cd1fa65e8a9d6e13"
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "outputs": [],
   "source": [
    "def policy(board):\n",
    "    action_scores = genome.forward(state, transformed, board)\n",
    "    return action_scores"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-05T07:40:15.124383600Z",
     "start_time": "2024-06-05T07:40:15.118384200Z"
    }
   },
   "id": "61bc1895af304651"
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [0, 0, 0, 0],\n",
      "       [0, 0, 1, 0],\n",
      "       [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(2, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
      "       [0, 0, 0, 0],\n",
      "       [0, 0, 0, 0],\n",
      "       [1, 0, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(2, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [0, 0, 0, 0],\n",
      "       [0, 0, 0, 1],\n",
      "       [1, 1, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(2, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [1, 0, 0, 0],\n",
      "       [0, 0, 0, 0],\n",
      "       [1, 1, 1, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(2, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [0, 1, 0, 0],\n",
      "       [0, 0, 0, 0],\n",
      "       [2, 1, 1, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(4, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
      "       [0, 0, 0, 0],\n",
      "       [0, 0, 0, 0],\n",
      "       [2, 2, 1, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(4, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [0, 2, 0, 0],\n",
      "       [0, 1, 0, 0],\n",
      "       [2, 2, 1, 1]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(4, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [0, 0, 1, 2],\n",
      "       [0, 0, 0, 1],\n",
      "       [0, 0, 3, 2]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
      "       [0, 0, 0, 2],\n",
      "       [0, 0, 1, 1],\n",
      "       [0, 0, 3, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [0, 0, 0, 2],\n",
      "       [0, 1, 2, 1],\n",
      "       [0, 0, 3, 2]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [0, 0, 0, 2],\n",
      "       [0, 0, 2, 1],\n",
      "       [0, 1, 3, 2]], dtype=int32), action_mask=Array([ True, False, False,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 0],\n",
      "       [2, 0, 0, 0],\n",
      "       [2, 1, 0, 0],\n",
      "       [1, 3, 2, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [1, 0, 0, 1],\n",
      "       [3, 2, 0, 0],\n",
      "       [1, 3, 2, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
      "       [1, 0, 0, 0],\n",
      "       [3, 2, 0, 0],\n",
      "       [1, 3, 2, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [1, 1, 0, 1],\n",
      "       [3, 2, 0, 0],\n",
      "       [1, 3, 2, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
      "       [1, 1, 0, 0],\n",
      "       [3, 2, 0, 0],\n",
      "       [1, 3, 2, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [1, 1, 0, 0],\n",
      "       [3, 2, 1, 0],\n",
      "       [1, 3, 2, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
      "       [1, 1, 0, 0],\n",
      "       [3, 2, 1, 1],\n",
      "       [1, 3, 2, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [1, 2, 0, 0],\n",
      "       [3, 2, 1, 1],\n",
      "       [1, 3, 2, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [1, 0, 0, 1],\n",
      "       [3, 3, 1, 2],\n",
      "       [1, 3, 2, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(24., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [1, 0, 1, 0],\n",
      "       [3, 0, 1, 1],\n",
      "       [1, 4, 2, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [1, 0, 1, 0],\n",
      "       [3, 0, 2, 1],\n",
      "       [1, 4, 2, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [1, 0, 0, 1],\n",
      "       [3, 0, 1, 1],\n",
      "       [1, 4, 3, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 2],\n",
      "       [1, 0, 0, 0],\n",
      "       [3, 0, 1, 2],\n",
      "       [1, 4, 3, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [1, 0, 1, 0],\n",
      "       [3, 0, 1, 3],\n",
      "       [1, 4, 3, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [1, 0, 0, 1],\n",
      "       [3, 0, 2, 0],\n",
      "       [1, 4, 3, 4]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [1, 0, 0, 1],\n",
      "       [3, 0, 2, 1],\n",
      "       [1, 4, 3, 4]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
      "       [1, 0, 0, 0],\n",
      "       [3, 0, 2, 2],\n",
      "       [1, 4, 3, 4]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [1, 0, 1, 2],\n",
      "       [3, 0, 2, 2],\n",
      "       [1, 4, 3, 4]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [1, 1, 1, 0],\n",
      "       [3, 0, 2, 3],\n",
      "       [1, 4, 3, 4]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
      "       [1, 0, 1, 0],\n",
      "       [3, 1, 2, 3],\n",
      "       [1, 4, 3, 4]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [1, 0, 2, 0],\n",
      "       [3, 1, 2, 3],\n",
      "       [1, 4, 3, 4]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [1, 1, 0, 1],\n",
      "       [3, 1, 3, 3],\n",
      "       [1, 4, 3, 4]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [1, 0, 1, 1],\n",
      "       [3, 2, 0, 3],\n",
      "       [1, 4, 4, 4]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
      "       [1, 0, 0, 1],\n",
      "       [3, 2, 1, 3],\n",
      "       [1, 4, 4, 4]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [2, 0, 0, 1],\n",
      "       [3, 2, 1, 3],\n",
      "       [1, 4, 4, 4]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [2, 1, 0, 2],\n",
      "       [3, 2, 1, 3],\n",
      "       [1, 4, 4, 4]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
      "       [0, 2, 1, 2],\n",
      "       [3, 2, 1, 3],\n",
      "       [0, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [0, 1, 0, 2],\n",
      "       [1, 3, 2, 3],\n",
      "       [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [1, 0, 1, 2],\n",
      "       [1, 3, 2, 3],\n",
      "       [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
      "       [0, 0, 1, 2],\n",
      "       [2, 3, 2, 3],\n",
      "       [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [1, 1, 1, 2],\n",
      "       [2, 3, 2, 3],\n",
      "       [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [1, 1, 2, 2],\n",
      "       [2, 3, 2, 3],\n",
      "       [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [1, 1, 0, 2],\n",
      "       [2, 3, 3, 3],\n",
      "       [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 0, 1],\n",
      "       [0, 0, 2, 2],\n",
      "       [0, 2, 3, 4],\n",
      "       [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n",
      "       [0, 0, 2, 2],\n",
      "       [0, 3, 3, 4],\n",
      "       [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [1, 0, 2, 2],\n",
      "       [1, 3, 3, 4],\n",
      "       [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
      "       [0, 0, 2, 2],\n",
      "       [2, 3, 3, 4],\n",
      "       [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n",
      "       [0, 1, 2, 2],\n",
      "       [2, 3, 3, 4],\n",
      "       [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
      "       [1, 1, 2, 2],\n",
      "       [2, 3, 3, 4],\n",
      "       [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n",
      "       [1, 2, 2, 2],\n",
      "       [2, 3, 3, 4],\n",
      "       [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n",
      "       [2, 2, 2, 2],\n",
      "       [2, 3, 3, 4],\n",
      "       [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 1],\n",
      "       [1, 2, 2, 2],\n",
      "       [3, 3, 3, 4],\n",
      "       [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 1],\n",
      "       [0, 2, 2, 2],\n",
      "       [1, 3, 3, 4],\n",
      "       [4, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(28., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 0],\n",
      "       [3, 2, 0, 2],\n",
      "       [1, 4, 4, 0],\n",
      "       [4, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 0],\n",
      "       [3, 2, 1, 0],\n",
      "       [1, 4, 0, 2],\n",
      "       [4, 1, 5, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 1],\n",
      "       [3, 2, 0, 0],\n",
      "       [1, 4, 1, 2],\n",
      "       [4, 1, 5, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 0],\n",
      "       [3, 2, 1, 1],\n",
      "       [1, 4, 1, 2],\n",
      "       [4, 1, 5, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 1, 0],\n",
      "       [3, 2, 0, 1],\n",
      "       [1, 4, 2, 2],\n",
      "       [4, 1, 5, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 2, 0],\n",
      "       [3, 2, 1, 1],\n",
      "       [1, 4, 2, 2],\n",
      "       [4, 1, 5, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(76., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 2],\n",
      "       [1, 3, 2, 2],\n",
      "       [0, 1, 4, 3],\n",
      "       [0, 4, 1, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 1, 0],\n",
      "       [0, 3, 2, 3],\n",
      "       [0, 1, 4, 3],\n",
      "       [1, 4, 1, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 1, 0],\n",
      "       [0, 3, 2, 0],\n",
      "       [0, 1, 4, 4],\n",
      "       [2, 4, 1, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 0],\n",
      "       [0, 3, 2, 2],\n",
      "       [0, 1, 4, 4],\n",
      "       [3, 4, 1, 6]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(40., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 2, 1],\n",
      "       [1, 0, 3, 3],\n",
      "       [0, 0, 1, 5],\n",
      "       [3, 4, 1, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [0, 1, 2, 3],\n",
      "       [1, 0, 3, 5],\n",
      "       [3, 4, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [0, 2, 2, 3],\n",
      "       [1, 1, 3, 5],\n",
      "       [3, 4, 2, 6]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [0, 0, 3, 3],\n",
      "       [1, 2, 3, 5],\n",
      "       [3, 4, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 1],\n",
      "       [0, 0, 0, 3],\n",
      "       [1, 2, 4, 5],\n",
      "       [3, 4, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 2, 1],\n",
      "       [0, 0, 1, 3],\n",
      "       [1, 2, 4, 5],\n",
      "       [3, 4, 2, 6]], dtype=int32), action_mask=Array([ True, False, False,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 0],\n",
      "       [1, 3, 0, 1],\n",
      "       [1, 2, 4, 5],\n",
      "       [3, 4, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 0],\n",
      "       [2, 3, 0, 1],\n",
      "       [2, 2, 4, 5],\n",
      "       [3, 4, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 0],\n",
      "       [1, 3, 0, 1],\n",
      "       [3, 2, 4, 5],\n",
      "       [3, 4, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
      "       [1, 3, 1, 1],\n",
      "       [1, 2, 4, 5],\n",
      "       [4, 4, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
      "       [1, 3, 1, 1],\n",
      "       [2, 2, 4, 5],\n",
      "       [4, 4, 2, 6]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(44., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [1, 1, 3, 2],\n",
      "       [0, 3, 4, 5],\n",
      "       [0, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
      "       [0, 1, 3, 2],\n",
      "       [0, 3, 4, 5],\n",
      "       [1, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [1, 2, 3, 2],\n",
      "       [0, 3, 4, 5],\n",
      "       [1, 5, 2, 6]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 1],\n",
      "       [0, 2, 3, 2],\n",
      "       [0, 3, 4, 5],\n",
      "       [2, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 2],\n",
      "       [2, 2, 3, 2],\n",
      "       [0, 3, 4, 5],\n",
      "       [2, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [0, 2, 3, 3],\n",
      "       [1, 3, 4, 5],\n",
      "       [3, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [1, 0, 2, 4],\n",
      "       [1, 3, 4, 5],\n",
      "       [3, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [0, 0, 2, 4],\n",
      "       [2, 3, 4, 5],\n",
      "       [3, 5, 2, 6]], dtype=int32), action_mask=Array([ True, False, False,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 0],\n",
      "       [2, 4, 0, 0],\n",
      "       [2, 3, 4, 5],\n",
      "       [3, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
      "       [1, 4, 1, 0],\n",
      "       [3, 3, 4, 5],\n",
      "       [3, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
      "       [1, 4, 1, 0],\n",
      "       [1, 3, 4, 5],\n",
      "       [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
      "       [0, 4, 1, 0],\n",
      "       [2, 3, 4, 5],\n",
      "       [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 0],\n",
      "       [0, 4, 1, 1],\n",
      "       [2, 3, 4, 5],\n",
      "       [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 2, 0],\n",
      "       [1, 4, 1, 1],\n",
      "       [2, 3, 4, 5],\n",
      "       [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 0, 0],\n",
      "       [1, 4, 2, 1],\n",
      "       [2, 3, 4, 5],\n",
      "       [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 0, 1],\n",
      "       [2, 4, 2, 1],\n",
      "       [2, 3, 4, 5],\n",
      "       [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 0, 0],\n",
      "       [0, 4, 2, 2],\n",
      "       [3, 3, 4, 5],\n",
      "       [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 2, 0],\n",
      "       [1, 4, 2, 2],\n",
      "       [3, 3, 4, 5],\n",
      "       [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 0],\n",
      "       [1, 4, 3, 2],\n",
      "       [3, 3, 4, 5],\n",
      "       [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 0],\n",
      "       [1, 4, 3, 2],\n",
      "       [4, 4, 5, 2],\n",
      "       [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(72., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
      "       [2, 1, 3, 0],\n",
      "       [1, 5, 5, 3],\n",
      "       [5, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(64., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 1],\n",
      "       [2, 0, 3, 0],\n",
      "       [1, 1, 5, 3],\n",
      "       [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 0],\n",
      "       [2, 0, 3, 1],\n",
      "       [1, 1, 5, 3],\n",
      "       [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 1],\n",
      "       [2, 2, 3, 1],\n",
      "       [1, 1, 5, 3],\n",
      "       [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 0],\n",
      "       [2, 2, 3, 2],\n",
      "       [1, 1, 5, 3],\n",
      "       [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 2],\n",
      "       [1, 3, 3, 2],\n",
      "       [0, 2, 5, 3],\n",
      "       [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
      "       [0, 3, 3, 3],\n",
      "       [1, 2, 5, 3],\n",
      "       [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 0],\n",
      "       [0, 3, 3, 0],\n",
      "       [1, 2, 5, 4],\n",
      "       [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 2],\n",
      "       [0, 0, 1, 4],\n",
      "       [1, 2, 5, 4],\n",
      "       [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
      "       [0, 0, 1, 2],\n",
      "       [1, 2, 5, 5],\n",
      "       [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
      "       [0, 1, 1, 2],\n",
      "       [1, 2, 5, 5],\n",
      "       [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [2, 1, 1, 2],\n",
      "       [2, 2, 5, 5],\n",
      "       [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [1, 1, 1, 2],\n",
      "       [3, 2, 5, 5],\n",
      "       [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(68., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [0, 1, 2, 2],\n",
      "       [1, 3, 2, 6],\n",
      "       [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(136., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [0, 1, 0, 0],\n",
      "       [1, 3, 2, 2],\n",
      "       [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [0, 1, 0, 1],\n",
      "       [1, 3, 2, 2],\n",
      "       [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [1, 1, 0, 2],\n",
      "       [1, 3, 2, 2],\n",
      "       [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [0, 1, 0, 1],\n",
      "       [2, 3, 2, 3],\n",
      "       [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [1, 0, 0, 2],\n",
      "       [2, 3, 2, 3],\n",
      "       [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
      "       [0, 0, 1, 2],\n",
      "       [2, 3, 2, 3],\n",
      "       [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
      "       [1, 0, 1, 2],\n",
      "       [2, 3, 2, 3],\n",
      "       [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
      "       [1, 1, 1, 2],\n",
      "       [2, 3, 2, 3],\n",
      "       [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [1, 2, 1, 2],\n",
      "       [2, 3, 2, 3],\n",
      "       [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True, False, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 0],\n",
      "       [1, 2, 1, 2],\n",
      "       [2, 3, 2, 3],\n",
      "       [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 0],\n",
      "       [2, 2, 1, 2],\n",
      "       [2, 3, 2, 3],\n",
      "       [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
      "       [0, 2, 2, 2],\n",
      "       [3, 3, 2, 3],\n",
      "       [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 1],\n",
      "       [0, 2, 0, 2],\n",
      "       [3, 3, 3, 3],\n",
      "       [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 1],\n",
      "       [1, 2, 0, 2],\n",
      "       [3, 3, 0, 3],\n",
      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
      "       [1, 2, 2, 2],\n",
      "       [3, 3, 1, 3],\n",
      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(28., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 2],\n",
      "       [0, 1, 2, 3],\n",
      "       [0, 4, 1, 3],\n",
      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 2, 0],\n",
      "       [0, 2, 2, 2],\n",
      "       [0, 4, 1, 4],\n",
      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [0, 2, 3, 2],\n",
      "       [1, 4, 1, 4],\n",
      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, False, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
      "       [2, 3, 2, 0],\n",
      "       [1, 4, 1, 4],\n",
      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
      "       [0, 2, 3, 2],\n",
      "       [1, 4, 1, 4],\n",
      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 2],\n",
      "       [1, 2, 3, 2],\n",
      "       [1, 4, 1, 4],\n",
      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [1, 2, 3, 3],\n",
      "       [2, 4, 1, 4],\n",
      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
      "       [0, 1, 2, 4],\n",
      "       [2, 4, 1, 4],\n",
      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [1, 1, 2, 1],\n",
      "       [2, 4, 1, 5],\n",
      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
      "       [0, 2, 2, 1],\n",
      "       [2, 4, 1, 5],\n",
      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [0, 2, 3, 1],\n",
      "       [2, 4, 1, 5],\n",
      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [0, 2, 3, 2],\n",
      "       [2, 4, 1, 5],\n",
      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, False, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 1, 0],\n",
      "       [2, 3, 2, 0],\n",
      "       [2, 4, 1, 5],\n",
      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 1, 0],\n",
      "       [1, 3, 2, 0],\n",
      "       [3, 4, 1, 5],\n",
      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 1, 0],\n",
      "       [2, 3, 2, 0],\n",
      "       [3, 4, 1, 5],\n",
      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 2],\n",
      "       [0, 2, 3, 2],\n",
      "       [3, 4, 1, 5],\n",
      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [1, 2, 3, 3],\n",
      "       [3, 4, 1, 5],\n",
      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 0],\n",
      "       [1, 2, 4, 0],\n",
      "       [3, 4, 1, 5],\n",
      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 0],\n",
      "       [2, 2, 4, 0],\n",
      "       [3, 4, 1, 5],\n",
      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 0, 0],\n",
      "       [3, 4, 0, 1],\n",
      "       [3, 4, 1, 5],\n",
      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(48., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [2, 0, 1, 1],\n",
      "       [4, 5, 1, 5],\n",
      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
      "       [2, 0, 0, 1],\n",
      "       [4, 5, 2, 5],\n",
      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
      "       [2, 0, 1, 1],\n",
      "       [4, 5, 2, 5],\n",
      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [1, 0, 2, 2],\n",
      "       [4, 5, 2, 5],\n",
      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 0, 1],\n",
      "       [1, 0, 0, 2],\n",
      "       [4, 5, 3, 5],\n",
      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n",
      "       [1, 2, 0, 2],\n",
      "       [4, 5, 3, 5],\n",
      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 0, 1],\n",
      "       [2, 2, 0, 2],\n",
      "       [4, 5, 3, 5],\n",
      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 1],\n",
      "       [3, 2, 0, 2],\n",
      "       [4, 5, 3, 5],\n",
      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
      "       [3, 2, 1, 2],\n",
      "       [4, 5, 3, 5],\n",
      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 2],\n",
      "       [3, 2, 1, 2],\n",
      "       [4, 5, 3, 5],\n",
      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [3, 2, 2, 3],\n",
      "       [4, 5, 3, 5],\n",
      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 0],\n",
      "       [3, 3, 3, 0],\n",
      "       [4, 5, 3, 5],\n",
      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 0],\n",
      "       [3, 3, 1, 0],\n",
      "       [4, 5, 4, 5],\n",
      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 1, 0],\n",
      "       [3, 3, 0, 0],\n",
      "       [4, 5, 1, 5],\n",
      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 1, 0],\n",
      "       [3, 3, 0, 0],\n",
      "       [4, 5, 2, 5],\n",
      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 1],\n",
      "       [3, 3, 1, 0],\n",
      "       [4, 5, 2, 5],\n",
      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 1, 0],\n",
      "       [3, 3, 1, 1],\n",
      "       [4, 5, 2, 5],\n",
      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 1],\n",
      "       [3, 3, 2, 1],\n",
      "       [4, 5, 2, 5],\n",
      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 1, 0],\n",
      "       [3, 3, 0, 2],\n",
      "       [4, 5, 3, 5],\n",
      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 1],\n",
      "       [3, 3, 1, 2],\n",
      "       [4, 5, 3, 5],\n",
      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 2],\n",
      "       [1, 4, 1, 2],\n",
      "       [4, 5, 3, 5],\n",
      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
      "       [1, 4, 2, 3],\n",
      "       [4, 5, 3, 5],\n",
      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 0, 1],\n",
      "       [1, 4, 2, 3],\n",
      "       [4, 5, 3, 5],\n",
      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 2, 0],\n",
      "       [1, 4, 2, 3],\n",
      "       [4, 5, 3, 5],\n",
      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 1, 0],\n",
      "       [1, 4, 3, 3],\n",
      "       [4, 5, 3, 5],\n",
      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 1],\n",
      "       [1, 4, 1, 3],\n",
      "       [4, 5, 4, 5],\n",
      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 2, 2],\n",
      "       [1, 4, 1, 3],\n",
      "       [4, 5, 4, 5],\n",
      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 0],\n",
      "       [1, 4, 1, 3],\n",
      "       [4, 5, 4, 5],\n",
      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 3, 1, 0],\n",
      "       [2, 4, 2, 3],\n",
      "       [4, 5, 4, 5],\n",
      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 3, 1],\n",
      "       [2, 4, 2, 3],\n",
      "       [4, 5, 4, 5],\n",
      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 1],\n",
      "       [2, 4, 2, 3],\n",
      "       [4, 5, 4, 5],\n",
      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([False,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 3, 2],\n",
      "       [2, 4, 2, 3],\n",
      "       [4, 5, 4, 5],\n",
      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([False,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 2, 1],\n",
      "       [2, 4, 2, 3],\n",
      "       [4, 5, 4, 5],\n",
      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, False,  True, False], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 0, 1],\n",
      "       [3, 4, 3, 3],\n",
      "       [4, 5, 4, 5],\n",
      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 3, 1],\n",
      "       [0, 3, 4, 4],\n",
      "       [4, 5, 4, 5],\n",
      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 1],\n",
      "       [1, 3, 3, 4],\n",
      "       [4, 5, 5, 5],\n",
      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(64., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 2, 1],\n",
      "       [1, 3, 1, 4],\n",
      "       [4, 5, 3, 5],\n",
      "       [5, 6, 6, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(128., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 1, 0],\n",
      "       [1, 3, 1, 4],\n",
      "       [4, 5, 3, 5],\n",
      "       [5, 7, 7, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 0, 0],\n",
      "       [2, 3, 2, 4],\n",
      "       [4, 5, 3, 5],\n",
      "       [5, 7, 7, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 0],\n",
      "       [3, 3, 2, 4],\n",
      "       [4, 5, 3, 5],\n",
      "       [5, 7, 7, 1]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(272., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 2, 0],\n",
      "       [4, 2, 4, 0],\n",
      "       [4, 5, 3, 5],\n",
      "       [5, 8, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 2, 0],\n",
      "       [2, 2, 4, 0],\n",
      "       [5, 5, 3, 1],\n",
      "       [5, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(64., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 2, 1],\n",
      "       [0, 2, 4, 0],\n",
      "       [2, 5, 3, 1],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 2, 1],\n",
      "       [0, 2, 4, 0],\n",
      "       [2, 5, 3, 2],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 2, 0],\n",
      "       [1, 2, 4, 1],\n",
      "       [2, 5, 3, 2],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 0, 1],\n",
      "       [1, 2, 4, 1],\n",
      "       [2, 5, 3, 2],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
      "       [2, 3, 4, 2],\n",
      "       [2, 5, 3, 2],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 1],\n",
      "       [0, 3, 4, 0],\n",
      "       [3, 5, 3, 3],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 0],\n",
      "       [0, 3, 4, 1],\n",
      "       [3, 5, 3, 3],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 0, 0],\n",
      "       [3, 4, 1, 0],\n",
      "       [3, 5, 4, 1],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [2, 4, 1, 0],\n",
      "       [4, 5, 4, 1],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [2, 4, 1, 0],\n",
      "       [4, 5, 4, 2],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
      "       [2, 4, 1, 1],\n",
      "       [4, 5, 4, 2],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
      "       [0, 2, 4, 2],\n",
      "       [4, 5, 4, 2],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(40., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
      "       [1, 2, 0, 1],\n",
      "       [4, 5, 5, 3],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(64., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [0, 1, 2, 1],\n",
      "       [1, 4, 6, 3],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 2, 0],\n",
      "       [0, 1, 2, 2],\n",
      "       [1, 4, 6, 3],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [1, 1, 3, 2],\n",
      "       [1, 4, 6, 3],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
      "       [0, 1, 3, 2],\n",
      "       [2, 4, 6, 3],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [1, 1, 3, 2],\n",
      "       [2, 4, 6, 3],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
      "       [2, 3, 2, 1],\n",
      "       [2, 4, 6, 3],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 2, 0],\n",
      "       [1, 3, 2, 1],\n",
      "       [3, 4, 6, 3],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
      "       [1, 3, 3, 1],\n",
      "       [3, 4, 6, 3],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n",
      "       [1, 4, 1, 0],\n",
      "       [3, 4, 6, 3],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
      "       [2, 0, 1, 1],\n",
      "       [3, 5, 6, 3],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [2, 0, 2, 1],\n",
      "       [3, 5, 6, 3],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [2, 0, 2, 2],\n",
      "       [3, 5, 6, 3],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 1, 0],\n",
      "       [3, 2, 0, 0],\n",
      "       [3, 5, 6, 3],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 2],\n",
      "       [1, 2, 1, 0],\n",
      "       [4, 5, 6, 3],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [1, 2, 1, 2],\n",
      "       [4, 5, 6, 3],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n",
      "       [1, 2, 1, 2],\n",
      "       [4, 5, 6, 3],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
      "       [2, 2, 1, 2],\n",
      "       [4, 5, 6, 3],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 1, 0],\n",
      "       [3, 1, 2, 0],\n",
      "       [4, 5, 6, 3],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 1],\n",
      "       [3, 1, 2, 0],\n",
      "       [4, 5, 6, 3],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 0],\n",
      "       [3, 2, 2, 1],\n",
      "       [4, 5, 6, 3],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 1],\n",
      "       [3, 3, 1, 0],\n",
      "       [4, 5, 6, 3],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 1, 0],\n",
      "       [3, 3, 1, 1],\n",
      "       [4, 5, 6, 3],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 1],\n",
      "       [3, 3, 2, 1],\n",
      "       [4, 5, 6, 3],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 1],\n",
      "       [3, 3, 2, 2],\n",
      "       [4, 5, 6, 3],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(28., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 1, 0],\n",
      "       [4, 3, 0, 0],\n",
      "       [4, 5, 6, 3],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 0],\n",
      "       [2, 3, 1, 0],\n",
      "       [5, 5, 6, 3],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 0],\n",
      "       [2, 3, 2, 0],\n",
      "       [5, 5, 6, 3],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(64., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 0],\n",
      "       [2, 3, 2, 0],\n",
      "       [6, 6, 3, 1],\n",
      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(136., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
      "       [0, 3, 2, 1],\n",
      "       [3, 6, 3, 1],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 0],\n",
      "       [0, 3, 2, 0],\n",
      "       [3, 6, 3, 2],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 0, 1],\n",
      "       [3, 2, 0, 0],\n",
      "       [3, 6, 3, 2],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [2, 2, 0, 1],\n",
      "       [4, 6, 3, 2],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
      "       [2, 2, 0, 2],\n",
      "       [4, 6, 3, 2],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
      "       [2, 2, 1, 0],\n",
      "       [4, 6, 3, 3],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(24., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [1, 0, 3, 1],\n",
      "       [0, 4, 6, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [0, 1, 3, 2],\n",
      "       [1, 4, 6, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
      "       [1, 3, 2, 0],\n",
      "       [1, 4, 6, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
      "       [1, 3, 2, 0],\n",
      "       [2, 4, 6, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [2, 3, 2, 1],\n",
      "       [2, 4, 6, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False,  True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [0, 3, 2, 1],\n",
      "       [3, 4, 6, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
      "       [0, 3, 2, 2],\n",
      "       [3, 4, 6, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 0, 0],\n",
      "       [1, 3, 2, 2],\n",
      "       [3, 4, 6, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 2],\n",
      "       [0, 1, 3, 3],\n",
      "       [3, 4, 6, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 0, 0],\n",
      "       [1, 4, 1, 0],\n",
      "       [3, 4, 6, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
      "       [2, 2, 1, 0],\n",
      "       [3, 5, 6, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n",
      "       [0, 0, 3, 1],\n",
      "       [3, 5, 6, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
      "       [1, 0, 3, 2],\n",
      "       [3, 5, 6, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 0, 0],\n",
      "       [1, 3, 2, 0],\n",
      "       [3, 5, 6, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 0, 1],\n",
      "       [2, 3, 2, 0],\n",
      "       [3, 5, 6, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 0, 0],\n",
      "       [2, 3, 2, 1],\n",
      "       [3, 5, 6, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 1, 2],\n",
      "       [2, 3, 2, 1],\n",
      "       [3, 5, 6, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 0, 1],\n",
      "       [2, 3, 2, 1],\n",
      "       [3, 5, 6, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 0, 1],\n",
      "       [3, 3, 2, 2],\n",
      "       [3, 5, 6, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 1],\n",
      "       [0, 3, 2, 2],\n",
      "       [4, 5, 6, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 0, 0],\n",
      "       [3, 3, 1, 0],\n",
      "       [4, 5, 6, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(24., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 3],\n",
      "       [0, 0, 4, 1],\n",
      "       [4, 5, 6, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 0, 0],\n",
      "       [4, 1, 1, 0],\n",
      "       [4, 5, 6, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 3, 1, 0],\n",
      "       [1, 1, 1, 0],\n",
      "       [5, 5, 6, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 3, 0, 1],\n",
      "       [1, 1, 2, 0],\n",
      "       [5, 5, 6, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 3, 0, 1],\n",
      "       [1, 1, 2, 1],\n",
      "       [5, 5, 6, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 0, 0],\n",
      "       [1, 1, 2, 2],\n",
      "       [5, 5, 6, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 3, 0, 1],\n",
      "       [2, 1, 2, 2],\n",
      "       [5, 5, 6, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(72., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 3, 1],\n",
      "       [0, 2, 1, 3],\n",
      "       [0, 6, 6, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 3, 1],\n",
      "       [2, 2, 1, 3],\n",
      "       [1, 6, 6, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(136., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 1, 0, 1],\n",
      "       [3, 1, 3, 0],\n",
      "       [1, 7, 4, 0],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
      "       [4, 2, 3, 0],\n",
      "       [1, 7, 4, 1],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n",
      "       [0, 4, 2, 3],\n",
      "       [1, 7, 4, 1],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n",
      "       [0, 4, 2, 3],\n",
      "       [2, 7, 4, 1],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
      "       [1, 4, 2, 3],\n",
      "       [2, 7, 4, 1],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 2],\n",
      "       [1, 4, 2, 3],\n",
      "       [2, 7, 4, 1],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 0, 2],\n",
      "       [1, 4, 2, 3],\n",
      "       [2, 7, 4, 1],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 0, 2],\n",
      "       [2, 4, 2, 3],\n",
      "       [2, 7, 4, 1],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 2],\n",
      "       [1, 4, 2, 3],\n",
      "       [3, 7, 4, 1],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 2, 1],\n",
      "       [1, 4, 2, 3],\n",
      "       [3, 7, 4, 1],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False,  True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 1, 1],\n",
      "       [1, 4, 3, 3],\n",
      "       [3, 7, 4, 1],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([False,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 1, 1],\n",
      "       [1, 4, 4, 0],\n",
      "       [3, 7, 4, 1],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 2, 0],\n",
      "       [1, 4, 1, 0],\n",
      "       [3, 7, 5, 2],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 1, 0],\n",
      "       [1, 4, 1, 0],\n",
      "       [3, 7, 5, 2],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 0, 0],\n",
      "       [1, 4, 2, 1],\n",
      "       [3, 7, 5, 2],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 3, 2],\n",
      "       [1, 4, 2, 1],\n",
      "       [3, 7, 5, 2],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 3, 2],\n",
      "       [2, 4, 2, 1],\n",
      "       [3, 7, 5, 2],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 2, 1],\n",
      "       [2, 4, 2, 1],\n",
      "       [3, 7, 5, 2],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False,  True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 0],\n",
      "       [2, 4, 3, 2],\n",
      "       [3, 7, 5, 2],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 1],\n",
      "       [2, 4, 3, 0],\n",
      "       [3, 7, 5, 3],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 1],\n",
      "       [2, 4, 3, 1],\n",
      "       [3, 7, 5, 3],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 1],\n",
      "       [2, 4, 3, 2],\n",
      "       [3, 7, 5, 3],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([False,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 3, 2],\n",
      "       [2, 4, 3, 2],\n",
      "       [3, 7, 5, 3],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(24., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 1],\n",
      "       [2, 4, 4, 3],\n",
      "       [3, 7, 5, 3],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 1],\n",
      "       [2, 4, 4, 1],\n",
      "       [3, 7, 5, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 1],\n",
      "       [2, 4, 4, 2],\n",
      "       [3, 7, 5, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 1],\n",
      "       [2, 5, 2, 0],\n",
      "       [3, 7, 5, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
      "       [3, 5, 2, 1],\n",
      "       [3, 7, 5, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 0],\n",
      "       [0, 5, 2, 2],\n",
      "       [4, 7, 5, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 0, 0],\n",
      "       [5, 3, 0, 1],\n",
      "       [4, 7, 5, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 2],\n",
      "       [1, 5, 3, 1],\n",
      "       [4, 7, 5, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 1, 0],\n",
      "       [1, 5, 3, 1],\n",
      "       [4, 7, 5, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 1, 0],\n",
      "       [1, 5, 3, 1],\n",
      "       [4, 7, 5, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 1, 0],\n",
      "       [1, 5, 3, 1],\n",
      "       [4, 7, 5, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 1, 0, 1],\n",
      "       [1, 5, 3, 1],\n",
      "       [4, 7, 5, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 1, 1, 0],\n",
      "       [1, 5, 3, 2],\n",
      "       [4, 7, 5, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 1, 0],\n",
      "       [1, 5, 3, 2],\n",
      "       [4, 7, 5, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 2, 1],\n",
      "       [1, 5, 3, 2],\n",
      "       [4, 7, 5, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False,  True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "TimeStep(step_type=Array(2, dtype=int8), reward=Array(4., dtype=float32), discount=Array(0., dtype=float32), observation=Observation(board=Array([[1, 3, 2, 1],\n",
      "       [2, 5, 3, 2],\n",
      "       [4, 7, 5, 4],\n",
      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([False, False, False, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
      "3716.0\n"
     ]
    }
   ],
   "source": [
    "import jax, jumanji\n",
    "\n",
    "env = jumanji.make(\"Game2048-v1\")\n",
    "key = jax.random.PRNGKey(0)\n",
    "jit_reset = jax.jit(env.reset)\n",
    "jit_step = jax.jit(env.step)\n",
    "state, timestep = jax.jit(env.reset)(key)\n",
    "jit_policy = jax.jit(policy)\n",
    "total_reward = 0\n",
    "while True:\n",
    "    board, action_mask = timestep[\"observation\"]\n",
    "    action = jit_policy(timestep[\"observation\"][0].reshape(-1))\n",
    "    score_with_mask = jnp.where(action_mask, action, -jnp.inf)\n",
    "    action = jnp.argmax(score_with_mask)\n",
    "    state, timestep = jit_step(state, action)\n",
    "    done = jnp.all(~timestep[\"observation\"][1])\n",
    "    print(timestep)\n",
    "    total_reward += timestep[\"reward\"]\n",
    "    if done:\n",
    "        break\n",
    "print(total_reward)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-05T07:41:33.703431900Z",
     "start_time": "2024-06-05T07:41:26.102578200Z"
    }
   },
   "id": "f166e09c5be1a8fb"
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "outputs": [],
   "source": [
    "import jax.random\n",
    "from problem.rl_env.jumanji.jumanji_2048 import Jumanji_2048\n",
    "\n",
    "\n",
    "def random_policy(state, params, obs):\n",
    "    key = jax.random.key(obs.sum())\n",
    "    actions = jax.random.normal(key, (4,))\n",
    "    return actions\n",
    "\n",
    "problem = Jumanji_2048(max_step=10000, repeat_times=10, guarantee_invalid_action=True)\n",
    "state = problem.setup()\n",
    "jit_evaluate = jax.jit(lambda state, randkey: problem.evaluate(state, randkey, random_policy, None))"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-05T08:06:59.491563700Z",
     "start_time": "2024-06-05T08:06:59.465404900Z"
    }
   },
   "id": "187326d08ac1eeb4"
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1193.2001\n"
     ]
    }
   ],
   "source": [
    "\n",
    "reward = jit_evaluate(state, randkey)\n",
    "print(reward)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-05T08:07:21.630420300Z",
     "start_time": "2024-06-05T08:07:21.107419400Z"
    }
   },
   "id": "4b3506db87568d81"
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [0, 0, 0, 0],\n",
      "       [0, 0, 0, 0],\n",
      "       [0, 1, 0, 1]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(2, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
      "       [1, 0, 0, 0],\n",
      "       [0, 0, 0, 0],\n",
      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(2, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [0, 0, 0, 0],\n",
      "       [0, 0, 0, 0],\n",
      "       [1, 1, 1, 1]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(2, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [0, 1, 0, 0],\n",
      "       [0, 0, 0, 0],\n",
      "       [2, 2, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(4, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [1, 0, 0, 0],\n",
      "       [0, 0, 0, 0],\n",
      "       [3, 0, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [1, 0, 0, 0],\n",
      "       [0, 0, 0, 0],\n",
      "       [3, 1, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
      "       [1, 0, 0, 0],\n",
      "       [0, 0, 0, 0],\n",
      "       [3, 2, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [0, 0, 0, 0],\n",
      "       [2, 0, 1, 0],\n",
      "       [3, 2, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [0, 0, 0, 0],\n",
      "       [2, 1, 0, 0],\n",
      "       [3, 2, 0, 1]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
      "       [0, 0, 0, 0],\n",
      "       [2, 1, 0, 0],\n",
      "       [3, 2, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 1, 0],\n",
      "       [2, 2, 0, 0],\n",
      "       [3, 0, 0, 1],\n",
      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 1, 1],\n",
      "       [2, 2, 0, 0],\n",
      "       [3, 0, 0, 0],\n",
      "       [0, 1, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
      "       [1, 1, 0, 0],\n",
      "       [2, 2, 0, 0],\n",
      "       [3, 1, 1, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 1, 1],\n",
      "       [2, 2, 0, 0],\n",
      "       [3, 1, 0, 2],\n",
      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [1, 0, 0, 0],\n",
      "       [2, 3, 1, 1],\n",
      "       [3, 1, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [1, 0, 0, 0],\n",
      "       [2, 3, 0, 1],\n",
      "       [3, 1, 2, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [0, 0, 2, 1],\n",
      "       [0, 2, 3, 1],\n",
      "       [0, 3, 1, 3]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [0, 0, 2, 1],\n",
      "       [0, 2, 3, 2],\n",
      "       [1, 3, 1, 3]], dtype=int32), action_mask=Array([ True, False, False,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 2, 1],\n",
      "       [0, 3, 3, 2],\n",
      "       [1, 0, 1, 3],\n",
      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [0, 0, 2, 1],\n",
      "       [1, 2, 3, 2],\n",
      "       [2, 3, 1, 3]], dtype=int32), action_mask=Array([ True, False, False,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [2, 1, 1, 0],\n",
      "       [1, 2, 3, 2],\n",
      "       [2, 3, 1, 3]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [0, 0, 2, 2],\n",
      "       [1, 2, 3, 2],\n",
      "       [2, 3, 1, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 2, 1],\n",
      "       [2, 3, 3, 3],\n",
      "       [1, 0, 1, 3],\n",
      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 2, 1],\n",
      "       [2, 3, 3, 4],\n",
      "       [1, 0, 1, 0],\n",
      "       [0, 0, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(28., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 3, 1],\n",
      "       [0, 2, 4, 4],\n",
      "       [0, 0, 0, 2],\n",
      "       [1, 0, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [1, 0, 0, 4],\n",
      "       [0, 1, 3, 2],\n",
      "       [1, 2, 4, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 1, 0],\n",
      "       [1, 4, 0, 0],\n",
      "       [1, 3, 2, 0],\n",
      "       [1, 2, 4, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 2],\n",
      "       [0, 0, 1, 4],\n",
      "       [0, 1, 3, 2],\n",
      "       [1, 2, 4, 1]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 2, 2],\n",
      "       [0, 2, 3, 4],\n",
      "       [0, 0, 4, 2],\n",
      "       [1, 0, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 2],\n",
      "       [1, 0, 2, 4],\n",
      "       [0, 1, 3, 2],\n",
      "       [2, 2, 4, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 2, 2],\n",
      "       [2, 2, 3, 4],\n",
      "       [0, 1, 4, 2],\n",
      "       [0, 0, 0, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 2, 3],\n",
      "       [0, 3, 3, 4],\n",
      "       [1, 1, 4, 2],\n",
      "       [0, 0, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 2, 3],\n",
      "       [0, 1, 3, 4],\n",
      "       [0, 0, 4, 2],\n",
      "       [1, 0, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 2, 3],\n",
      "       [1, 1, 3, 4],\n",
      "       [0, 0, 4, 2],\n",
      "       [0, 0, 0, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 2, 3],\n",
      "       [0, 1, 3, 4],\n",
      "       [0, 1, 4, 3],\n",
      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 2, 3],\n",
      "       [0, 2, 3, 4],\n",
      "       [0, 0, 4, 3],\n",
      "       [0, 0, 1, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 2, 3],\n",
      "       [0, 2, 3, 4],\n",
      "       [2, 0, 4, 3],\n",
      "       [0, 0, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 3, 2, 3],\n",
      "       [0, 2, 3, 4],\n",
      "       [0, 0, 4, 3],\n",
      "       [0, 1, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 3, 2, 3],\n",
      "       [0, 2, 3, 4],\n",
      "       [0, 1, 4, 3],\n",
      "       [1, 0, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n",
      "       [2, 3, 4, 0],\n",
      "       [1, 4, 3, 0],\n",
      "       [2, 0, 0, 0]], dtype=int32), action_mask=Array([False,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n",
      "       [0, 2, 3, 4],\n",
      "       [0, 1, 4, 3],\n",
      "       [0, 0, 1, 2]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n",
      "       [2, 3, 4, 0],\n",
      "       [1, 4, 3, 0],\n",
      "       [1, 2, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n",
      "       [2, 3, 4, 0],\n",
      "       [2, 4, 3, 1],\n",
      "       [0, 2, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n",
      "       [2, 3, 4, 0],\n",
      "       [2, 4, 3, 1],\n",
      "       [2, 1, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n",
      "       [2, 3, 4, 0],\n",
      "       [2, 4, 3, 1],\n",
      "       [2, 2, 0, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n",
      "       [0, 2, 3, 4],\n",
      "       [2, 4, 3, 1],\n",
      "       [0, 1, 2, 3]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n",
      "       [2, 3, 4, 0],\n",
      "       [2, 4, 3, 1],\n",
      "       [1, 2, 3, 1]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n",
      "       [1, 2, 3, 4],\n",
      "       [2, 4, 3, 1],\n",
      "       [1, 2, 3, 1]], dtype=int32), action_mask=Array([ True, False,  True, False], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(44., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 0, 0, 1],\n",
      "       [1, 3, 0, 1],\n",
      "       [2, 4, 4, 4],\n",
      "       [1, 2, 4, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 1, 0, 0],\n",
      "       [1, 3, 1, 0],\n",
      "       [2, 5, 4, 1],\n",
      "       [1, 2, 4, 2]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 1, 1, 1],\n",
      "       [1, 3, 5, 2],\n",
      "       [2, 5, 0, 1],\n",
      "       [1, 2, 0, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 4, 1, 2],\n",
      "       [1, 3, 5, 2],\n",
      "       [1, 2, 5, 1],\n",
      "       [0, 0, 1, 2]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 1, 2, 0],\n",
      "       [1, 3, 5, 2],\n",
      "       [1, 2, 5, 1],\n",
      "       [1, 2, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(80., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 1, 2, 2],\n",
      "       [2, 3, 6, 2],\n",
      "       [1, 3, 0, 0],\n",
      "       [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 4, 1, 3],\n",
      "       [2, 3, 6, 2],\n",
      "       [0, 0, 1, 3],\n",
      "       [0, 0, 1, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 3],\n",
      "       [2, 3, 6, 2],\n",
      "       [0, 0, 1, 3],\n",
      "       [0, 0, 0, 2]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 3],\n",
      "       [0, 3, 6, 2],\n",
      "       [0, 0, 1, 3],\n",
      "       [1, 0, 0, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 3],\n",
      "       [1, 3, 6, 2],\n",
      "       [0, 0, 1, 3],\n",
      "       [0, 0, 1, 2]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 3],\n",
      "       [1, 3, 6, 2],\n",
      "       [1, 3, 1, 0],\n",
      "       [1, 2, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 3],\n",
      "       [2, 4, 6, 2],\n",
      "       [1, 2, 1, 0],\n",
      "       [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 5, 1, 3],\n",
      "       [2, 2, 6, 2],\n",
      "       [2, 0, 1, 0],\n",
      "       [0, 0, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [0, 1, 1, 0],\n",
      "       [3, 5, 6, 3],\n",
      "       [3, 2, 2, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [1, 1, 1, 0],\n",
      "       [0, 5, 6, 3],\n",
      "       [4, 2, 2, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
      "       [0, 0, 1, 2],\n",
      "       [1, 5, 6, 3],\n",
      "       [0, 4, 2, 3]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
      "       [1, 2, 0, 0],\n",
      "       [1, 5, 6, 3],\n",
      "       [4, 2, 3, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
      "       [0, 2, 1, 0],\n",
      "       [2, 5, 6, 0],\n",
      "       [4, 2, 3, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 1, 1],\n",
      "       [4, 5, 6, 3],\n",
      "       [0, 2, 3, 0],\n",
      "       [0, 0, 1, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 0, 0],\n",
      "       [4, 5, 6, 3],\n",
      "       [2, 3, 1, 0],\n",
      "       [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 6, 3],\n",
      "       [4, 5, 1, 0],\n",
      "       [2, 3, 1, 0],\n",
      "       [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 6, 3],\n",
      "       [4, 5, 2, 0],\n",
      "       [2, 3, 0, 0],\n",
      "       [1, 0, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 6, 3],\n",
      "       [4, 5, 2, 0],\n",
      "       [2, 3, 1, 0],\n",
      "       [2, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 6, 3],\n",
      "       [1, 4, 5, 2],\n",
      "       [0, 2, 3, 1],\n",
      "       [0, 0, 0, 2]], dtype=int32), action_mask=Array([False, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 6, 3],\n",
      "       [1, 4, 5, 2],\n",
      "       [2, 3, 1, 1],\n",
      "       [2, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 6, 3],\n",
      "       [1, 4, 5, 2],\n",
      "       [1, 2, 3, 2],\n",
      "       [0, 0, 0, 2]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 6, 3],\n",
      "       [1, 4, 5, 2],\n",
      "       [1, 2, 3, 2],\n",
      "       [2, 0, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 6, 0],\n",
      "       [3, 2, 5, 1],\n",
      "       [2, 4, 3, 3],\n",
      "       [2, 2, 1, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(24., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 6, 0],\n",
      "       [0, 2, 5, 1],\n",
      "       [3, 4, 3, 1],\n",
      "       [3, 2, 1, 4]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 6],\n",
      "       [1, 2, 5, 1],\n",
      "       [3, 4, 3, 1],\n",
      "       [3, 2, 1, 4]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[6, 0, 0, 1],\n",
      "       [1, 2, 5, 1],\n",
      "       [3, 4, 3, 1],\n",
      "       [3, 2, 1, 4]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
      "       [6, 2, 5, 1],\n",
      "       [1, 4, 3, 2],\n",
      "       [4, 2, 1, 4]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 2, 0],\n",
      "       [6, 2, 5, 1],\n",
      "       [1, 4, 3, 2],\n",
      "       [4, 2, 1, 4]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 2, 1],\n",
      "       [6, 4, 5, 2],\n",
      "       [1, 2, 3, 4],\n",
      "       [4, 1, 1, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 3, 1],\n",
      "       [6, 4, 5, 2],\n",
      "       [1, 2, 3, 4],\n",
      "       [0, 0, 4, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 3, 1],\n",
      "       [6, 4, 5, 2],\n",
      "       [1, 2, 3, 4],\n",
      "       [0, 1, 4, 2]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 1],\n",
      "       [6, 4, 5, 2],\n",
      "       [1, 2, 3, 4],\n",
      "       [1, 4, 2, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 0],\n",
      "       [2, 4, 5, 1],\n",
      "       [6, 2, 3, 2],\n",
      "       [2, 4, 2, 4]], dtype=int32), action_mask=Array([ True,  True, False, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 3, 1],\n",
      "       [2, 4, 5, 1],\n",
      "       [6, 2, 3, 2],\n",
      "       [2, 4, 2, 4]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 3, 1],\n",
      "       [2, 4, 5, 1],\n",
      "       [6, 2, 3, 2],\n",
      "       [2, 4, 2, 4]], dtype=int32), action_mask=Array([ True, False,  True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 3, 1],\n",
      "       [2, 4, 5, 2],\n",
      "       [6, 2, 3, 2],\n",
      "       [2, 4, 2, 4]], dtype=int32), action_mask=Array([ True, False,  True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "TimeStep(step_type=Array(2, dtype=int8), reward=Array(8., dtype=float32), discount=Array(0., dtype=float32), observation=Observation(board=Array([[1, 2, 3, 1],\n",
      "       [2, 4, 5, 3],\n",
      "       [6, 2, 3, 4],\n",
      "       [2, 4, 2, 1]], dtype=int32), action_mask=Array([False, False, False, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
      "636.0\n"
     ]
    }
   ],
   "source": [
    "randkey = jax.random.PRNGKey(14)\n",
    "jit_policy = jax.jit(random_policy)\n",
    "total_reward = 0\n",
    "state, timestep = jax.jit(env.reset)(randkey )\n",
    "while True:\n",
    "    board, action_mask = timestep[\"observation\"]\n",
    "    action = jit_policy(None, None, timestep[\"observation\"][0].reshape(-1))\n",
    "    score_with_mask = jnp.where(action_mask, action, -jnp.inf)\n",
    "    action = jnp.argmax(score_with_mask)\n",
    "    state, timestep = jit_step(state, action)\n",
    "    done = jnp.all(~timestep[\"observation\"][1])\n",
    "    print(timestep)\n",
    "    total_reward += timestep[\"reward\"]\n",
    "    if done:\n",
    "        break\n",
    "print(total_reward)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-05T08:09:58.242414600Z",
     "start_time": "2024-06-05T08:09:56.452642800Z"
    }
   },
   "id": "8bb888fb742b6b06"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false
   },
   "id": "3d1b5c8c646d4f07"
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}