diff --git a/tensorneat/examples/func_fit/xor_kan.py b/tensorneat/examples/func_fit/xor_kan.py
index 5bdd1ed..60c5feb 100644
--- a/tensorneat/examples/func_fit/xor_kan.py
+++ b/tensorneat/examples/func_fit/xor_kan.py
@@ -16,7 +16,7 @@ if __name__ == "__main__":
                     max_nodes=50,
                     max_conns=100,
                     node_gene=KANNode(),
-                    conn_gene=BSplineConn(grid_cnt=10),
+                    conn_gene=BSplineConn(grid_cnt=6),
                     output_transform=Act.sigmoid,  # the activation function for output node
                     mutation=DefaultMutation(
                         node_add=0.1,
diff --git a/tensorneat/examples/gymnax/cartpole.py b/tensorneat/examples/gymnax/cartpole.py
index 16fcbe5..1368ee8 100644
--- a/tensorneat/examples/gymnax/cartpole.py
+++ b/tensorneat/examples/gymnax/cartpole.py
@@ -5,6 +5,11 @@ from algorithm.neat import *
 
 from problem.rl_env import GymNaxEnv
 
+
+def action_policy(forward_func, obs):  # greedy: index of the largest output, i.e. CartPole action 0 or 1
+    return jnp.argmax(forward_func(obs))
+
+
 if __name__ == "__main__":
     pipeline = Pipeline(
         algorithm=NEAT(
@@ -14,18 +19,15 @@ if __name__ == "__main__":
                     num_outputs=2,
                     max_nodes=50,
                     max_conns=100,
-                    output_transform=lambda out: jnp.argmax(
-                        out
-                    ),  # the action of cartpole is {0, 1}
+                    # output_transform=lambda out: jnp.argmax(
+                    #     out
+                    # ),  # the action of cartpole is {0, 1}
                 ),
                 pop_size=10000,
                 species_size=10,
             ),
         ),
-        problem=GymNaxEnv(
-            env_name="CartPole-v1",
-            repeat_times=5
-        ),
+        problem=GymNaxEnv(env_name="CartPole-v1", repeat_times=5, action_policy=action_policy),
         generation_limit=10000,
         fitness_target=500,
     )
diff --git a/tensorneat/examples/jumanji/2048.py b/tensorneat/examples/jumanji/2048.py
deleted file mode 100644
index 39ecd00..0000000
--- a/tensorneat/examples/jumanji/2048.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import jax.numpy as jnp
-
-from pipeline import Pipeline
-from algorithm.neat import *
-
-from problem.rl_env.jumanji.jumanji_2048 import Jumanji_2048
-from utils import Act, Agg
-
-if __name__ == "__main__":
-    pipeline = Pipeline(
-        algorithm=NEAT(
-            species=DefaultSpecies(
-                genome=DefaultGenome(
-                    num_inputs=16,
-                    num_outputs=4,
-                    max_nodes=100,
-                    max_conns=1000,
-                    node_gene=DefaultNodeGene(
-                        activation_default=Act.sigmoid,
-                        activation_options=(Act.sigmoid, Act.relu, Act.tanh, Act.identity, Act.inv),
-                        aggregation_default=Agg.sum,
-                        aggregation_options=(Agg.sum, Agg.mean, Agg.max, Agg.product),
-                    ),
-                    mutation=DefaultMutation(
-                        node_add=0.03,
-                        conn_add=0.03,
-                    )
-                ),
-                pop_size=10000,
-                species_size=100,
-                survival_threshold=0.01,
-            ),
-        ),
-        problem=Jumanji_2048(
-            max_step=10000,
-            repeat_times=5
-        ),
-        generation_limit=10000,
-        fitness_target=13000,
-    )
-
-    # initialize state
-    state = pipeline.setup()
-    # print(state)
-    # run until terminate
-    state, best = pipeline.auto_run(state)
diff --git a/tensorneat/examples/jumanji/2048_random_policy.py b/tensorneat/examples/jumanji/2048_random_policy.py
new file mode 100644
index 0000000..6f7172b
--- /dev/null
+++ b/tensorneat/examples/jumanji/2048_random_policy.py
@@ -0,0 +1,25 @@
+import jax, jax.numpy as jnp
+import jax.random
+from problem.rl_env.jumanji.jumanji_2048 import Jumanji_2048
+
+
+def random_policy(state, params, obs):
+    """Return the constant array [4, 4, 0, 1]; inputs are ignored (random draft disabled below)."""
+    # key = jax.random.key(obs.sum())
+    # actions = jax.random.normal(key, (4,))
+    # actions = actions.at[2:].set(-9999)
+    # return jnp.array([1, 2, 3, 4])
+    return jnp.array([4, 4, 0, 1])
+
+
+if __name__ == "__main__":
+    # Evaluate the hard-coded policy once under jit and print the result.
+    env = Jumanji_2048(max_step=10000, repeat_times=1000, guarantee_invalid_action=True)
+    init_state = env.setup()
+
+    def _run(s, k):
+        return env.evaluate(s, k, random_policy, None)
+
+    seed_key = jax.random.PRNGKey(0)
+    reward = jax.jit(_run)(init_state, seed_key)
+    print(reward)
diff --git a/tensorneat/examples/jumanji/2048_test.ipynb b/tensorneat/examples/jumanji/2048_test.ipynb
index e779bfd..aec52d5 100644
--- a/tensorneat/examples/jumanji/2048_test.ipynb
+++ b/tensorneat/examples/jumanji/2048_test.ipynb
@@ -2,13 +2,13 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 6,
    "id": "initial_id",
    "metadata": {
     "collapsed": true,
     "ExecuteTime": {
-     "end_time": "2024-06-05T05:07:22.736605400Z",
-     "start_time": "2024-06-05T05:06:39.100164300Z"
+     "end_time": "2024-06-05T07:40:13.841629100Z",
+     "start_time": "2024-06-05T07:40:13.076164500Z"
     }
    },
    "outputs": [
@@ -17,26 +17,7 @@
      "output_type": "stream",
      "text": [
       "initializing\n",
-      "initializing finished\n",
-      "start compile\n",
-      "compile finished, cost time: 18.307454s\n",
-      "Generation: 1.0, Cost time: 4551.03ms\n",
-      " \tnode counts: max: 21, min: 21, mean: 21.00\n",
-      " \tconn counts: max: 20, min: 20, mean: 20.00\n",
-      " \tspecies: 1, [10000]\n",
-      " \tfitness: valid cnt: 10000, max: 10124.0000, min: 44.0000, mean: 1758.1263, std: 1212.6823\n",
-      "Generation: 2.0, Cost time: 4636.33ms\n",
-      " \tnode counts: max: 22, min: 21, mean: 21.03\n",
-      " \tconn counts: max: 22, min: 20, mean: 20.05\n",
-      " \tspecies: 1, [10000]\n",
-      " \tfitness: valid cnt: 10000, max: 11000.0000, min: 48.0000, mean: 1870.1300, std: 1263.3086\n",
-      "Generation: 3.0, Cost time: 6271.12ms\n",
-      " \tnode counts: max: 23, min: 21, mean: 21.03\n",
-      " \tconn counts: max: 22, min: 20, mean: 20.05\n",
-      " \tspecies: 1, [10000]\n",
-      " \tfitness: valid cnt: 10000, max: 14624.0000, min: 28.0000, mean: 1943.9924, std: 1293.7146\n",
-      "\n",
-      "Fitness limit reached!\n"
+      "initializing finished\n"
      ]
     }
    ],
@@ -45,84 +26,100 @@
     "\n",
     "from pipeline import Pipeline\n",
     "from algorithm.neat import *\n",
+    "from algorithm.neat.gene.node.default_without_response import NodeGeneWithoutResponse\n",
     "\n",
     "from problem.rl_env.jumanji.jumanji_2048 import Jumanji_2048\n",
     "from utils import Act, Agg\n",
     "\n",
-    "if __name__ == \"__main__\":\n",
-    "    pipeline = Pipeline(\n",
-    "        algorithm=NEAT(\n",
-    "            species=DefaultSpecies(\n",
-    "                genome=DefaultGenome(\n",
-    "                    num_inputs=16,\n",
-    "                    num_outputs=4,\n",
-    "                    max_nodes=100,\n",
-    "                    max_conns=1000,\n",
-    "                    node_gene=DefaultNodeGene(\n",
-    "                        activation_default=Act.sigmoid,\n",
-    "                        activation_options=(Act.sigmoid, Act.relu, Act.tanh, Act.identity, Act.inv),\n",
-    "                        aggregation_default=Agg.sum,\n",
-    "                        aggregation_options=(Agg.sum, Agg.mean, Agg.max, Agg.product),\n",
+    "pipeline = Pipeline(\n",
+    "    algorithm=NEAT(\n",
+    "        species=DefaultSpecies(\n",
+    "            genome=DefaultGenome(\n",
+    "                num_inputs=16,\n",
+    "                num_outputs=4,\n",
+    "                max_nodes=100,\n",
+    "                max_conns=1000,\n",
+    "                node_gene=NodeGeneWithoutResponse(\n",
+    "                    activation_default=Act.sigmoid,\n",
+    "                    activation_options=(\n",
+    "                        Act.sigmoid,\n",
+    "                        Act.relu,\n",
+    "                        Act.tanh,\n",
+    "                        Act.identity,\n",
     "                    ),\n",
-    "                    mutation=DefaultMutation(\n",
-    "                        node_add=0.03,\n",
-    "                        conn_add=0.03,\n",
-    "                    )\n",
+    "                    aggregation_default=Agg.sum,\n",
+    "                    aggregation_options=(Agg.sum,),\n",
+    "                    activation_replace_rate=0.02,\n",
+    "                    aggregation_replace_rate=0.02,\n",
+    "                    bias_mutate_rate=0.03,\n",
+    "                    bias_init_std=0.5,\n",
+    "                    bias_mutate_power=0.2,\n",
+    "                    bias_replace_rate=0.01,\n",
+    "                ),\n",
+    "                conn_gene=DefaultConnGene(\n",
+    "                    weight_mutate_rate=0.015,\n",
+    "                    weight_replace_rate=0.003,\n",
+    "                    weight_mutate_power=0.5,\n",
+    "                ),\n",
+    "                mutation=DefaultMutation(\n",
+    "                    node_add=0.1, conn_add=0.2, conn_delete=0.2\n",
     "                ),\n",
-    "                pop_size=10000,\n",
-    "                species_size=100,\n",
-    "                survival_threshold=0.01,\n",
     "            ),\n",
+    "            pop_size=1000,\n",
+    "            species_size=5,\n",
+    "            survival_threshold=0.1,\n",
+    "            max_stagnation=7,\n",
+    "            genome_elitism=3,\n",
+    "            compatibility_threshold=1.2,\n",
     "        ),\n",
-    "        problem=Jumanji_2048(\n",
-    "            max_step=1000,\n",
-    "        ),\n",
-    "        generation_limit=10000,\n",
-    "        fitness_target=13000,\n",
-    "    )\n",
-    "\n",
-    "    # initialize state\n",
-    "    state = pipeline.setup()\n",
-    "    # print(state)\n",
-    "    # run until terminate\n",
-    "    state, best = pipeline.auto_run(state)"
+    "    ),\n",
+    "    problem=Jumanji_2048(max_step=10000, repeat_times=5),\n",
+    "    generation_limit=100,\n",
+    "    fitness_target=13000,\n",
+    "    save_path=\"2048.pkl\",\n",
+    ")\n",
+    "state = pipeline.setup()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 7,
    "outputs": [],
    "source": [
-    "genome = pipeline.algorithm.genome"
+    "import numpy as np\n",
+    "\n",
+    "data = np.load('2048.npz')\n",
+    "nodes, conns = data['nodes'], data['conns']"
    ],
    "metadata": {
     "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2024-06-05T05:08:14.332101Z",
-     "start_time": "2024-06-05T05:08:14.324101300Z"
+     "end_time": "2024-06-05T07:40:13.932015100Z",
+     "start_time": "2024-06-05T07:40:13.876631500Z"
     }
    },
    "id": "a0915ecf8179f347"
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 8,
    "outputs": [],
    "source": [
-    "transformed = genome.transform(state, *best)"
+    "genome = pipeline.algorithm.species.genome\n",
+    "transformed = genome.transform(state, nodes, conns)"
    ],
    "metadata": {
     "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2024-06-05T05:08:49.132030500Z",
-     "start_time": "2024-06-05T05:08:48.495809200Z"
+     "end_time": "2024-06-05T07:40:14.585804800Z",
+     "start_time": "2024-06-05T07:40:14.568805Z"
     }
    },
    "id": "cd1fa65e8a9d6e13"
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 9,
    "outputs": [],
    "source": [
     "def policy(board):\n",
@@ -132,1089 +129,1229 @@
    "metadata": {
     "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2024-06-05T05:09:32.355055100Z",
-     "start_time": "2024-06-05T05:09:32.350057Z"
+     "end_time": "2024-06-05T07:40:15.124383600Z",
+     "start_time": "2024-06-05T07:40:15.118384200Z"
     }
    },
    "id": "61bc1895af304651"
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 14,
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 0],\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [0, 0, 0, 0],\n",
+      "       [0, 0, 1, 0],\n",
+      "       [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(2, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
       "       [0, 0, 0, 0],\n",
       "       [0, 0, 0, 0],\n",
-      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(2, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 2],\n",
-      "       [0, 1, 0, 0],\n",
-      "       [0, 0, 0, 0],\n",
-      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(4, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 2],\n",
-      "       [0, 0, 0, 0],\n",
-      "       [0, 0, 0, 0],\n",
-      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(4, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 2, 2],\n",
-      "       [0, 0, 0, 0],\n",
-      "       [0, 0, 0, 0],\n",
-      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(4, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 3],\n",
-      "       [0, 0, 0, 0],\n",
-      "       [2, 0, 0, 0],\n",
-      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 1, 3],\n",
+      "       [1, 0, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(2, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
       "       [0, 0, 0, 0],\n",
       "       [0, 0, 0, 1],\n",
-      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 1, 3],\n",
-      "       [0, 0, 0, 1],\n",
-      "       [0, 0, 0, 0],\n",
-      "       [0, 1, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 1, 3],\n",
-      "       [0, 0, 0, 1],\n",
-      "       [0, 0, 0, 0],\n",
-      "       [0, 0, 2, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 1, 3],\n",
-      "       [0, 0, 2, 1],\n",
-      "       [0, 1, 0, 0],\n",
-      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 1, 3],\n",
-      "       [1, 0, 2, 1],\n",
-      "       [0, 0, 0, 0],\n",
-      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 3, 1, 3],\n",
-      "       [0, 1, 2, 1],\n",
-      "       [0, 1, 0, 0],\n",
-      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 3, 1, 3],\n",
-      "       [0, 2, 2, 1],\n",
+      "       [1, 1, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(2, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
       "       [1, 0, 0, 0],\n",
-      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 3],\n",
-      "       [0, 2, 2, 1],\n",
       "       [0, 0, 0, 0],\n",
-      "       [0, 0, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 3],\n",
-      "       [0, 2, 2, 1],\n",
-      "       [0, 0, 1, 0],\n",
-      "       [0, 0, 2, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 3],\n",
-      "       [0, 0, 3, 1],\n",
-      "       [0, 0, 1, 1],\n",
-      "       [0, 0, 0, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 3],\n",
-      "       [0, 0, 3, 2],\n",
-      "       [0, 0, 1, 2],\n",
-      "       [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 3],\n",
-      "       [0, 0, 3, 3],\n",
-      "       [0, 1, 1, 0],\n",
-      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 4],\n",
-      "       [0, 1, 3, 1],\n",
-      "       [0, 0, 1, 0],\n",
-      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 4],\n",
-      "       [0, 1, 3, 1],\n",
-      "       [0, 0, 0, 1],\n",
-      "       [0, 1, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 4],\n",
-      "       [0, 2, 3, 2],\n",
+      "       [1, 1, 1, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(2, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [0, 1, 0, 0],\n",
       "       [0, 0, 0, 0],\n",
-      "       [0, 0, 0, 1]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 4],\n",
-      "       [0, 2, 3, 2],\n",
+      "       [2, 1, 1, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(4, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
+      "       [0, 0, 0, 0],\n",
+      "       [0, 0, 0, 0],\n",
+      "       [2, 2, 1, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(4, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [0, 2, 0, 0],\n",
+      "       [0, 1, 0, 0],\n",
+      "       [2, 2, 1, 1]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(4, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [0, 0, 1, 2],\n",
       "       [0, 0, 0, 1],\n",
-      "       [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 4],\n",
-      "       [1, 2, 3, 2],\n",
-      "       [0, 0, 0, 1],\n",
-      "       [0, 0, 0, 1]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 4],\n",
-      "       [1, 2, 3, 2],\n",
-      "       [0, 0, 0, 2],\n",
-      "       [0, 0, 0, 1]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 4],\n",
-      "       [1, 2, 3, 3],\n",
-      "       [0, 0, 0, 1],\n",
-      "       [0, 1, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 4],\n",
-      "       [1, 2, 3, 3],\n",
-      "       [0, 1, 0, 1],\n",
-      "       [0, 1, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 4],\n",
-      "       [1, 2, 3, 3],\n",
-      "       [0, 2, 1, 1],\n",
-      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 4],\n",
-      "       [1, 3, 3, 3],\n",
-      "       [0, 0, 1, 1],\n",
-      "       [0, 0, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 4],\n",
-      "       [1, 0, 3, 3],\n",
-      "       [0, 1, 1, 2],\n",
-      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 4],\n",
-      "       [1, 1, 3, 3],\n",
-      "       [0, 0, 1, 2],\n",
-      "       [0, 0, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 4],\n",
-      "       [1, 1, 3, 3],\n",
-      "       [0, 1, 2, 2],\n",
-      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 4],\n",
-      "       [1, 2, 3, 3],\n",
-      "       [0, 1, 2, 2],\n",
-      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(24., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 4],\n",
-      "       [1, 1, 2, 4],\n",
-      "       [0, 0, 1, 3],\n",
-      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 5],\n",
-      "       [1, 1, 2, 3],\n",
-      "       [0, 1, 1, 0],\n",
-      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 5],\n",
-      "       [1, 2, 2, 3],\n",
-      "       [0, 0, 1, 0],\n",
-      "       [0, 1, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 5],\n",
-      "       [1, 2, 2, 3],\n",
-      "       [0, 1, 1, 0],\n",
-      "       [0, 0, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 5],\n",
-      "       [1, 2, 2, 3],\n",
-      "       [0, 1, 2, 1],\n",
-      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 5],\n",
-      "       [1, 2, 3, 3],\n",
-      "       [0, 1, 1, 1],\n",
-      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 5],\n",
-      "       [1, 1, 2, 4],\n",
-      "       [0, 0, 1, 2],\n",
-      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 5],\n",
-      "       [0, 2, 2, 4],\n",
-      "       [0, 0, 1, 2],\n",
-      "       [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 5],\n",
-      "       [1, 2, 2, 4],\n",
-      "       [0, 0, 1, 2],\n",
-      "       [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 5],\n",
-      "       [2, 2, 2, 4],\n",
-      "       [1, 0, 1, 2],\n",
-      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 5],\n",
-      "       [1, 2, 2, 4],\n",
-      "       [0, 0, 1, 2],\n",
-      "       [0, 1, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 5],\n",
-      "       [1, 2, 2, 4],\n",
-      "       [0, 1, 1, 2],\n",
-      "       [0, 0, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 5],\n",
-      "       [1, 2, 2, 4],\n",
-      "       [0, 1, 2, 2],\n",
-      "       [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 5],\n",
-      "       [2, 2, 3, 4],\n",
-      "       [1, 1, 0, 2],\n",
-      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 5],\n",
-      "       [0, 3, 3, 4],\n",
-      "       [0, 0, 2, 2],\n",
-      "       [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 5],\n",
-      "       [1, 3, 3, 4],\n",
-      "       [0, 0, 2, 2],\n",
-      "       [0, 0, 0, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(24., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 5],\n",
-      "       [0, 1, 4, 4],\n",
-      "       [0, 1, 0, 3],\n",
-      "       [0, 0, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 5],\n",
-      "       [0, 2, 4, 4],\n",
-      "       [0, 1, 0, 3],\n",
-      "       [0, 0, 0, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 5],\n",
-      "       [0, 0, 2, 5],\n",
-      "       [1, 0, 1, 3],\n",
-      "       [0, 0, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(64., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
-      "       [1, 0, 2, 3],\n",
-      "       [0, 1, 1, 1],\n",
-      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
-      "       [1, 1, 2, 3],\n",
-      "       [0, 0, 1, 1],\n",
-      "       [2, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
-      "       [1, 1, 2, 3],\n",
-      "       [2, 0, 1, 1],\n",
-      "       [0, 2, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
-      "       [1, 1, 2, 3],\n",
-      "       [2, 2, 1, 1],\n",
-      "       [0, 0, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
-      "       [1, 1, 2, 3],\n",
-      "       [2, 2, 2, 1],\n",
-      "       [0, 0, 2, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
-      "       [1, 1, 3, 3],\n",
-      "       [2, 2, 2, 1],\n",
-      "       [0, 0, 0, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(28., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
-      "       [0, 1, 2, 4],\n",
-      "       [0, 2, 3, 1],\n",
-      "       [0, 0, 0, 2]], dtype=int32), action_mask=Array([False, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
-      "       [1, 2, 4, 0],\n",
-      "       [2, 3, 1, 1],\n",
-      "       [2, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
-      "       [1, 2, 4, 1],\n",
-      "       [3, 3, 1, 0],\n",
-      "       [0, 0, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
-      "       [1, 2, 4, 2],\n",
-      "       [3, 3, 1, 0],\n",
-      "       [0, 0, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
-      "       [1, 2, 4, 2],\n",
-      "       [3, 3, 2, 0],\n",
-      "       [0, 0, 1, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
-      "       [1, 2, 4, 2],\n",
-      "       [0, 0, 4, 2],\n",
-      "       [0, 1, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(40., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
-      "       [1, 2, 5, 3],\n",
-      "       [0, 1, 1, 1],\n",
-      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
-      "       [1, 2, 5, 3],\n",
-      "       [0, 0, 1, 2],\n",
-      "       [0, 0, 0, 1]], dtype=int32), action_mask=Array([False, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
-      "       [1, 2, 5, 3],\n",
-      "       [1, 2, 1, 0],\n",
-      "       [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
-      "       [2, 3, 5, 3],\n",
-      "       [1, 1, 1, 0],\n",
-      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
-      "       [2, 3, 5, 3],\n",
-      "       [0, 0, 1, 2],\n",
-      "       [0, 1, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
-      "       [2, 3, 5, 3],\n",
-      "       [0, 1, 1, 2],\n",
-      "       [0, 1, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
-      "       [2, 3, 5, 3],\n",
-      "       [1, 2, 1, 2],\n",
-      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([False, False,  True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 0, 0],\n",
-      "       [3, 4, 1, 6],\n",
-      "       [2, 3, 5, 3],\n",
-      "       [1, 2, 1, 2]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 1, 6],\n",
-      "       [2, 4, 5, 3],\n",
-      "       [1, 3, 1, 2],\n",
-      "       [0, 2, 0, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 1, 6],\n",
-      "       [2, 4, 5, 3],\n",
-      "       [1, 3, 1, 2],\n",
-      "       [1, 0, 2, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 1, 6],\n",
-      "       [2, 4, 5, 3],\n",
-      "       [2, 3, 1, 2],\n",
-      "       [0, 1, 2, 1]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 1, 6],\n",
-      "       [3, 4, 5, 3],\n",
-      "       [1, 3, 1, 2],\n",
-      "       [0, 1, 2, 1]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
-      "       [1, 4, 5, 3],\n",
-      "       [0, 3, 1, 2],\n",
-      "       [2, 1, 2, 1]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
-      "       [1, 4, 5, 3],\n",
-      "       [2, 3, 1, 2],\n",
-      "       [1, 1, 2, 1]], dtype=int32), action_mask=Array([False,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
-      "       [1, 4, 5, 3],\n",
-      "       [2, 3, 1, 2],\n",
-      "       [2, 2, 1, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
-      "       [1, 4, 5, 3],\n",
-      "       [3, 3, 2, 2],\n",
-      "       [1, 2, 0, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(24., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
-      "       [1, 4, 5, 3],\n",
-      "       [0, 1, 4, 3],\n",
-      "       [0, 1, 2, 1]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
-      "       [1, 4, 5, 4],\n",
-      "       [1, 2, 4, 1],\n",
-      "       [0, 0, 2, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
-      "       [2, 4, 5, 4],\n",
-      "       [0, 2, 4, 1],\n",
-      "       [0, 0, 2, 1]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
-      "       [2, 4, 5, 4],\n",
-      "       [0, 2, 4, 2],\n",
-      "       [0, 0, 2, 1]], dtype=int32), action_mask=Array([False, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
-      "       [2, 4, 5, 4],\n",
-      "       [2, 4, 2, 0],\n",
-      "       [2, 1, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(40., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
-      "       [3, 5, 5, 4],\n",
-      "       [2, 1, 2, 0],\n",
-      "       [0, 1, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
-      "       [3, 5, 5, 4],\n",
-      "       [2, 2, 2, 0],\n",
-      "       [0, 0, 1, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
-      "       [3, 5, 5, 4],\n",
-      "       [2, 2, 2, 1],\n",
-      "       [0, 2, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
-      "       [3, 5, 5, 4],\n",
-      "       [2, 3, 2, 1],\n",
-      "       [0, 1, 1, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(68., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
-      "       [0, 3, 6, 4],\n",
-      "       [2, 3, 2, 1],\n",
-      "       [0, 1, 0, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
-      "       [2, 4, 6, 4],\n",
-      "       [1, 1, 2, 1],\n",
-      "       [0, 0, 0, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
-      "       [2, 4, 6, 4],\n",
-      "       [0, 2, 2, 1],\n",
-      "       [0, 0, 1, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
-      "       [2, 4, 6, 4],\n",
-      "       [1, 0, 3, 1],\n",
-      "       [0, 0, 1, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
-      "       [2, 4, 6, 4],\n",
-      "       [0, 1, 3, 1],\n",
-      "       [0, 1, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
-      "       [2, 4, 6, 4],\n",
-      "       [0, 2, 3, 1],\n",
-      "       [0, 2, 1, 2]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
-      "       [2, 4, 6, 4],\n",
-      "       [0, 3, 3, 1],\n",
-      "       [1, 0, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
-      "       [2, 4, 6, 4],\n",
-      "       [1, 3, 3, 1],\n",
-      "       [0, 1, 1, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
-      "       [2, 4, 6, 4],\n",
-      "       [2, 1, 4, 1],\n",
-      "       [0, 0, 2, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
-      "       [3, 4, 6, 4],\n",
-      "       [1, 1, 4, 1],\n",
-      "       [0, 0, 2, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
-      "       [3, 4, 6, 4],\n",
-      "       [0, 2, 4, 1],\n",
-      "       [0, 0, 1, 3]], dtype=int32), action_mask=Array([False, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
-      "       [3, 4, 6, 4],\n",
-      "       [2, 4, 1, 0],\n",
-      "       [1, 3, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
-      "       [3, 5, 6, 4],\n",
-      "       [2, 3, 1, 1],\n",
-      "       [1, 0, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
-      "       [3, 5, 6, 4],\n",
-      "       [2, 3, 1, 2],\n",
-      "       [1, 0, 0, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
-      "       [3, 5, 6, 4],\n",
-      "       [2, 3, 1, 2],\n",
-      "       [0, 0, 2, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
-      "       [3, 5, 6, 4],\n",
-      "       [2, 3, 1, 3],\n",
-      "       [0, 0, 2, 1]], dtype=int32), action_mask=Array([False, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
-      "       [3, 5, 6, 4],\n",
-      "       [2, 3, 1, 3],\n",
-      "       [2, 1, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
-      "       [3, 5, 6, 4],\n",
-      "       [3, 3, 2, 3],\n",
-      "       [0, 1, 2, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(24., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
-      "       [4, 5, 6, 4],\n",
-      "       [1, 3, 3, 3],\n",
-      "       [0, 1, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 2, 1, 6],\n",
-      "       [1, 5, 6, 4],\n",
-      "       [0, 3, 3, 3],\n",
-      "       [2, 1, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 2, 1, 6],\n",
-      "       [1, 5, 6, 4],\n",
-      "       [2, 3, 3, 3],\n",
-      "       [0, 1, 0, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 2, 1, 6],\n",
-      "       [1, 5, 6, 4],\n",
-      "       [0, 2, 3, 4],\n",
-      "       [1, 0, 0, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 2, 1, 6],\n",
-      "       [2, 5, 6, 5],\n",
-      "       [1, 2, 3, 2],\n",
-      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([False, False,  True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "       [0, 0, 3, 2]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
       "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
-      "       [5, 2, 1, 6],\n",
-      "       [2, 5, 6, 5],\n",
-      "       [1, 2, 3, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 2, 2, 6],\n",
-      "       [2, 5, 6, 5],\n",
-      "       [1, 2, 3, 2],\n",
-      "       [0, 1, 0, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 5, 3, 6],\n",
-      "       [2, 5, 6, 5],\n",
-      "       [1, 2, 3, 2],\n",
-      "       [0, 1, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(64., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 6, 3, 6],\n",
-      "       [1, 2, 6, 5],\n",
-      "       [0, 1, 3, 2],\n",
-      "       [0, 1, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 6, 3, 6],\n",
-      "       [1, 2, 6, 5],\n",
-      "       [0, 2, 3, 2],\n",
-      "       [0, 0, 1, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 6, 3, 6],\n",
-      "       [1, 3, 6, 5],\n",
-      "       [0, 0, 3, 2],\n",
-      "       [1, 0, 1, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 6, 3, 6],\n",
-      "       [2, 3, 6, 5],\n",
-      "       [1, 0, 3, 2],\n",
-      "       [0, 0, 1, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 6, 3, 6],\n",
-      "       [1, 3, 6, 5],\n",
-      "       [0, 0, 3, 2],\n",
-      "       [1, 0, 1, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 6, 3, 6],\n",
-      "       [2, 3, 6, 5],\n",
-      "       [0, 1, 3, 2],\n",
-      "       [0, 0, 1, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 6, 3, 6],\n",
-      "       [2, 3, 6, 5],\n",
-      "       [0, 1, 3, 2],\n",
-      "       [0, 0, 1, 2]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 6, 3, 6],\n",
-      "       [2, 3, 6, 5],\n",
-      "       [1, 1, 3, 3],\n",
-      "       [0, 0, 1, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 6, 3, 6],\n",
-      "       [2, 3, 6, 5],\n",
-      "       [0, 0, 2, 4],\n",
-      "       [0, 1, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 6, 3, 6],\n",
-      "       [2, 3, 6, 5],\n",
-      "       [1, 1, 2, 4],\n",
-      "       [0, 0, 0, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 6, 3, 6],\n",
-      "       [2, 3, 6, 5],\n",
-      "       [0, 2, 2, 4],\n",
-      "       [1, 0, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 6, 3, 6],\n",
-      "       [2, 3, 6, 5],\n",
-      "       [1, 2, 2, 4],\n",
-      "       [0, 1, 0, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 6, 3, 6],\n",
-      "       [2, 3, 6, 5],\n",
-      "       [2, 1, 3, 4],\n",
-      "       [0, 0, 0, 2]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 6, 3, 6],\n",
-      "       [3, 3, 6, 5],\n",
-      "       [0, 1, 3, 4],\n",
-      "       [0, 0, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
-      "       [0, 3, 6, 5],\n",
-      "       [0, 1, 3, 4],\n",
-      "       [0, 1, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
-      "       [0, 3, 6, 5],\n",
-      "       [0, 2, 3, 4],\n",
-      "       [0, 2, 1, 2]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
-      "       [0, 3, 6, 5],\n",
-      "       [0, 3, 3, 4],\n",
-      "       [1, 0, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
-      "       [1, 4, 6, 5],\n",
-      "       [0, 2, 3, 4],\n",
-      "       [0, 0, 1, 2]], dtype=int32), action_mask=Array([False, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
-      "       [1, 4, 6, 5],\n",
-      "       [2, 3, 4, 0],\n",
-      "       [1, 2, 2, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
-      "       [1, 4, 6, 5],\n",
-      "       [0, 2, 3, 4],\n",
-      "       [1, 0, 1, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
-      "       [2, 4, 6, 5],\n",
-      "       [1, 2, 3, 4],\n",
-      "       [0, 0, 1, 3]], dtype=int32), action_mask=Array([False, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
-      "       [2, 4, 6, 5],\n",
-      "       [1, 2, 3, 4],\n",
-      "       [1, 3, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
-      "       [2, 4, 6, 5],\n",
-      "       [2, 2, 3, 4],\n",
-      "       [1, 3, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
-      "       [3, 4, 6, 5],\n",
-      "       [1, 2, 3, 4],\n",
-      "       [1, 3, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
-      "       [3, 4, 6, 5],\n",
-      "       [2, 2, 3, 4],\n",
-      "       [0, 3, 1, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
-      "       [3, 4, 6, 5],\n",
-      "       [0, 3, 3, 4],\n",
-      "       [1, 0, 3, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
-      "       [3, 4, 6, 5],\n",
-      "       [1, 3, 4, 4],\n",
-      "       [0, 0, 1, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
-      "       [3, 4, 6, 5],\n",
-      "       [0, 1, 3, 5],\n",
-      "       [0, 1, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(68., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
-      "       [3, 4, 6, 6],\n",
-      "       [0, 2, 3, 2],\n",
-      "       [2, 0, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(128., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 6, 2],\n",
-      "       [2, 2, 3, 0],\n",
-      "       [0, 1, 1, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 6, 2],\n",
-      "       [0, 2, 3, 3],\n",
-      "       [0, 0, 0, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 6, 2],\n",
-      "       [0, 0, 2, 4],\n",
-      "       [0, 1, 0, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 6, 2],\n",
-      "       [0, 1, 2, 4],\n",
-      "       [0, 1, 0, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 6, 2],\n",
-      "       [0, 2, 2, 4],\n",
-      "       [0, 0, 1, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 6, 2],\n",
-      "       [0, 0, 3, 4],\n",
-      "       [0, 1, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 6, 2],\n",
-      "       [0, 1, 3, 4],\n",
-      "       [1, 0, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 6, 2],\n",
-      "       [1, 1, 3, 4],\n",
-      "       [1, 0, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 6, 2],\n",
-      "       [2, 1, 3, 4],\n",
-      "       [1, 0, 1, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 6, 2],\n",
-      "       [2, 1, 3, 4],\n",
-      "       [0, 1, 2, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 6, 2],\n",
-      "       [2, 2, 3, 4],\n",
-      "       [0, 1, 2, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 6, 2],\n",
-      "       [1, 3, 3, 4],\n",
-      "       [0, 0, 1, 3]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 6, 2],\n",
-      "       [0, 1, 4, 4],\n",
-      "       [0, 1, 1, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 6, 2],\n",
-      "       [0, 2, 4, 4],\n",
-      "       [1, 0, 1, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 6, 2],\n",
-      "       [1, 2, 4, 4],\n",
-      "       [0, 1, 1, 3]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 6, 2],\n",
-      "       [2, 1, 2, 5],\n",
-      "       [0, 0, 2, 3]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 6, 2],\n",
-      "       [2, 1, 3, 5],\n",
-      "       [1, 0, 0, 3]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 6, 2],\n",
-      "       [2, 1, 3, 5],\n",
-      "       [0, 1, 1, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 6, 2],\n",
-      "       [2, 2, 3, 5],\n",
-      "       [0, 1, 1, 3]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 6, 2],\n",
-      "       [1, 3, 3, 5],\n",
-      "       [0, 0, 2, 3]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 6, 2],\n",
-      "       [0, 1, 4, 5],\n",
-      "       [0, 2, 2, 3]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 6, 2],\n",
-      "       [0, 1, 4, 5],\n",
-      "       [1, 0, 3, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 6, 2],\n",
-      "       [1, 1, 4, 5],\n",
-      "       [0, 1, 3, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 6, 2],\n",
-      "       [1, 2, 4, 5],\n",
-      "       [0, 1, 3, 3]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 6, 2],\n",
-      "       [1, 2, 4, 5],\n",
-      "       [0, 1, 1, 4]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 6, 2],\n",
-      "       [1, 2, 4, 5],\n",
-      "       [0, 2, 2, 4]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 6, 2],\n",
-      "       [1, 3, 4, 5],\n",
-      "       [1, 0, 2, 4]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 6, 2],\n",
-      "       [2, 3, 4, 5],\n",
-      "       [0, 1, 2, 4]], dtype=int32), action_mask=Array([False, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 6, 2],\n",
-      "       [2, 3, 4, 5],\n",
-      "       [1, 2, 4, 1]], dtype=int32), action_mask=Array([ True, False,  True, False], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 6, 2],\n",
-      "       [2, 3, 5, 5],\n",
-      "       [1, 2, 1, 1]], dtype=int32), action_mask=Array([False,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(68., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 6, 2],\n",
-      "       [2, 3, 6, 1],\n",
-      "       [1, 2, 2, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(128., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 7, 2],\n",
-      "       [2, 3, 2, 1],\n",
-      "       [1, 2, 2, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 7, 2],\n",
-      "       [2, 3, 3, 1],\n",
-      "       [1, 2, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 7, 2],\n",
-      "       [2, 3, 3, 2],\n",
-      "       [1, 2, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 7, 3],\n",
-      "       [2, 3, 3, 1],\n",
-      "       [1, 2, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 7, 3],\n",
-      "       [2, 3, 3, 2],\n",
-      "       [1, 2, 0, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 7, 3],\n",
-      "       [0, 2, 4, 2],\n",
-      "       [1, 1, 2, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 7, 3],\n",
-      "       [1, 2, 4, 2],\n",
-      "       [1, 1, 2, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 7, 3],\n",
-      "       [2, 2, 4, 2],\n",
-      "       [2, 1, 2, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [3, 4, 7, 3],\n",
-      "       [3, 2, 4, 2],\n",
-      "       [1, 1, 2, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
-      "       [4, 4, 7, 3],\n",
-      "       [1, 2, 4, 2],\n",
-      "       [1, 1, 2, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [2, 4, 7, 3],\n",
-      "       [0, 2, 4, 2],\n",
-      "       [1, 1, 2, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [2, 4, 7, 3],\n",
-      "       [1, 2, 4, 2],\n",
-      "       [1, 1, 2, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [2, 4, 7, 3],\n",
-      "       [2, 2, 4, 2],\n",
-      "       [1, 1, 2, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [3, 4, 7, 3],\n",
-      "       [1, 2, 4, 2],\n",
-      "       [1, 1, 2, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [3, 4, 7, 3],\n",
-      "       [2, 2, 4, 2],\n",
-      "       [1, 1, 2, 1]], dtype=int32), action_mask=Array([False,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [3, 4, 7, 3],\n",
-      "       [3, 4, 2, 0],\n",
-      "       [2, 2, 1, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(48., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 3],\n",
-      "       [2, 2, 2, 1],\n",
-      "       [1, 0, 1, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 3],\n",
-      "       [1, 2, 3, 1],\n",
-      "       [0, 0, 0, 2]], dtype=int32), action_mask=Array([False, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 3],\n",
-      "       [1, 2, 3, 1],\n",
-      "       [2, 0, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 3],\n",
-      "       [1, 2, 3, 2],\n",
-      "       [2, 0, 0, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 3],\n",
-      "       [1, 2, 3, 2],\n",
-      "       [1, 0, 2, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 3],\n",
-      "       [2, 2, 3, 2],\n",
-      "       [1, 0, 2, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 3],\n",
-      "       [0, 3, 3, 2],\n",
-      "       [1, 1, 2, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 3],\n",
-      "       [1, 3, 3, 2],\n",
-      "       [1, 1, 2, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 3],\n",
-      "       [2, 3, 3, 2],\n",
-      "       [1, 1, 2, 1]], dtype=int32), action_mask=Array([False,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 3],\n",
-      "       [2, 4, 2, 1],\n",
-      "       [2, 2, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 3],\n",
-      "       [3, 4, 2, 1],\n",
-      "       [0, 2, 1, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 3],\n",
-      "       [3, 4, 2, 2],\n",
-      "       [0, 2, 1, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 3],\n",
-      "       [1, 3, 4, 3],\n",
-      "       [0, 0, 2, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 4],\n",
-      "       [1, 3, 4, 2],\n",
-      "       [0, 0, 2, 1]], dtype=int32), action_mask=Array([False, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 4],\n",
-      "       [1, 3, 4, 2],\n",
-      "       [2, 1, 0, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 4],\n",
-      "       [1, 3, 4, 2],\n",
-      "       [1, 0, 2, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 4],\n",
-      "       [2, 3, 4, 3],\n",
-      "       [1, 0, 2, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 4],\n",
-      "       [2, 3, 4, 3],\n",
-      "       [1, 0, 1, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 4],\n",
-      "       [2, 3, 4, 3],\n",
-      "       [1, 0, 2, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 4],\n",
-      "       [2, 3, 4, 3],\n",
-      "       [0, 1, 1, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 4],\n",
-      "       [2, 3, 4, 4],\n",
-      "       [0, 1, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 5],\n",
-      "       [2, 3, 4, 2],\n",
-      "       [0, 1, 1, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 5],\n",
-      "       [2, 3, 4, 2],\n",
-      "       [1, 0, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 5],\n",
-      "       [2, 3, 4, 3],\n",
-      "       [1, 0, 1, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 5],\n",
-      "       [2, 3, 4, 3],\n",
-      "       [1, 0, 1, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 5],\n",
-      "       [2, 3, 4, 3],\n",
-      "       [0, 1, 2, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 5],\n",
-      "       [2, 3, 4, 3],\n",
-      "       [1, 0, 1, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 5],\n",
-      "       [2, 3, 4, 4],\n",
-      "       [1, 0, 1, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 5],\n",
-      "       [1, 2, 3, 5],\n",
-      "       [0, 0, 1, 2]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(64., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 6],\n",
-      "       [1, 2, 3, 2],\n",
-      "       [0, 1, 1, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 6],\n",
-      "       [1, 2, 3, 2],\n",
-      "       [1, 0, 0, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 6],\n",
-      "       [2, 2, 3, 3],\n",
-      "       [0, 0, 0, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(24., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 6],\n",
-      "       [0, 0, 3, 4],\n",
-      "       [0, 1, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 6],\n",
-      "       [0, 1, 3, 4],\n",
-      "       [0, 1, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 6],\n",
-      "       [0, 2, 3, 4],\n",
-      "       [0, 1, 0, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 6],\n",
-      "       [0, 2, 3, 4],\n",
-      "       [0, 0, 1, 2]], dtype=int32), action_mask=Array([False, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 6],\n",
-      "       [2, 3, 4, 1],\n",
-      "       [1, 2, 0, 0]], dtype=int32), action_mask=Array([False,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 6],\n",
-      "       [2, 3, 4, 1],\n",
-      "       [0, 1, 1, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 6],\n",
-      "       [2, 3, 4, 1],\n",
-      "       [0, 1, 2, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 6],\n",
-      "       [2, 3, 4, 1],\n",
-      "       [0, 1, 1, 3]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 6],\n",
-      "       [2, 3, 4, 1],\n",
-      "       [1, 0, 2, 3]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 6],\n",
-      "       [2, 3, 4, 1],\n",
-      "       [1, 1, 2, 3]], dtype=int32), action_mask=Array([False,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 6],\n",
-      "       [2, 3, 4, 1],\n",
-      "       [2, 2, 3, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 6],\n",
-      "       [3, 3, 4, 2],\n",
-      "       [1, 2, 3, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 6],\n",
-      "       [0, 4, 4, 2],\n",
-      "       [1, 1, 2, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 6],\n",
-      "       [1, 4, 4, 2],\n",
-      "       [1, 1, 2, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 6],\n",
-      "       [2, 4, 4, 2],\n",
-      "       [1, 1, 2, 3]], dtype=int32), action_mask=Array([False,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 5, 7, 6],\n",
-      "       [2, 5, 2, 0],\n",
-      "       [2, 2, 3, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(72., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
-      "       [4, 6, 7, 6],\n",
-      "       [3, 2, 2, 1],\n",
-      "       [1, 0, 3, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(128., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
-      "       [4, 2, 7, 6],\n",
+      "       [0, 0, 0, 2],\n",
+      "       [0, 0, 1, 1],\n",
+      "       [0, 0, 3, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [0, 0, 0, 2],\n",
+      "       [0, 1, 2, 1],\n",
+      "       [0, 0, 3, 2]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [0, 0, 0, 2],\n",
+      "       [0, 0, 2, 1],\n",
+      "       [0, 1, 3, 2]], dtype=int32), action_mask=Array([ True, False, False,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 0],\n",
+      "       [2, 0, 0, 0],\n",
+      "       [2, 1, 0, 0],\n",
+      "       [1, 3, 2, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [1, 0, 0, 1],\n",
+      "       [3, 2, 0, 0],\n",
+      "       [1, 3, 2, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
+      "       [1, 0, 0, 0],\n",
+      "       [3, 2, 0, 0],\n",
+      "       [1, 3, 2, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [1, 1, 0, 1],\n",
+      "       [3, 2, 0, 0],\n",
+      "       [1, 3, 2, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
+      "       [1, 1, 0, 0],\n",
+      "       [3, 2, 0, 0],\n",
+      "       [1, 3, 2, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [1, 1, 0, 0],\n",
+      "       [3, 2, 1, 0],\n",
+      "       [1, 3, 2, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
+      "       [1, 1, 0, 0],\n",
+      "       [3, 2, 1, 1],\n",
+      "       [1, 3, 2, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [1, 2, 0, 0],\n",
+      "       [3, 2, 1, 1],\n",
+      "       [1, 3, 2, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [1, 0, 0, 1],\n",
+      "       [3, 3, 1, 2],\n",
+      "       [1, 3, 2, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(24., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [1, 0, 1, 0],\n",
+      "       [3, 0, 1, 1],\n",
+      "       [1, 4, 2, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [1, 0, 1, 0],\n",
       "       [3, 0, 2, 1],\n",
-      "       [1, 0, 3, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
-      "       [4, 2, 7, 6],\n",
-      "       [2, 3, 2, 1],\n",
-      "       [0, 1, 3, 2]], dtype=int32), action_mask=Array([False, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
-      "       [4, 2, 7, 6],\n",
-      "       [2, 3, 2, 1],\n",
-      "       [1, 3, 2, 1]], dtype=int32), action_mask=Array([ True, False,  True, False], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(28., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
-      "       [4, 2, 7, 6],\n",
-      "       [2, 4, 3, 2],\n",
-      "       [1, 1, 0, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
-      "       [4, 2, 7, 6],\n",
-      "       [2, 4, 3, 2],\n",
-      "       [0, 0, 2, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
-      "       [4, 2, 7, 6],\n",
-      "       [2, 4, 3, 3],\n",
-      "       [1, 0, 2, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
-      "       [4, 2, 7, 6],\n",
-      "       [2, 2, 4, 4],\n",
-      "       [0, 0, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
-      "       [4, 3, 7, 6],\n",
-      "       [2, 0, 4, 4],\n",
-      "       [1, 0, 1, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
-      "       [4, 3, 7, 6],\n",
-      "       [0, 0, 2, 5],\n",
-      "       [0, 2, 2, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
-      "       [4, 3, 7, 6],\n",
-      "       [0, 2, 3, 5],\n",
-      "       [0, 2, 0, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
-      "       [4, 3, 7, 6],\n",
-      "       [0, 3, 3, 5],\n",
-      "       [0, 1, 0, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
-      "       [4, 4, 7, 6],\n",
+      "       [1, 4, 2, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [1, 0, 0, 1],\n",
+      "       [3, 0, 1, 1],\n",
+      "       [1, 4, 3, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 2],\n",
+      "       [1, 0, 0, 0],\n",
+      "       [3, 0, 1, 2],\n",
+      "       [1, 4, 3, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [1, 0, 1, 0],\n",
+      "       [3, 0, 1, 3],\n",
+      "       [1, 4, 3, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [1, 0, 0, 1],\n",
+      "       [3, 0, 2, 0],\n",
+      "       [1, 4, 3, 4]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [1, 0, 0, 1],\n",
+      "       [3, 0, 2, 1],\n",
+      "       [1, 4, 3, 4]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
+      "       [1, 0, 0, 0],\n",
+      "       [3, 0, 2, 2],\n",
+      "       [1, 4, 3, 4]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [1, 0, 1, 2],\n",
+      "       [3, 0, 2, 2],\n",
+      "       [1, 4, 3, 4]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [1, 1, 1, 0],\n",
+      "       [3, 0, 2, 3],\n",
+      "       [1, 4, 3, 4]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
+      "       [1, 0, 1, 0],\n",
+      "       [3, 1, 2, 3],\n",
+      "       [1, 4, 3, 4]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [1, 0, 2, 0],\n",
+      "       [3, 1, 2, 3],\n",
+      "       [1, 4, 3, 4]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [1, 1, 0, 1],\n",
+      "       [3, 1, 3, 3],\n",
+      "       [1, 4, 3, 4]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [1, 0, 1, 1],\n",
+      "       [3, 2, 0, 3],\n",
+      "       [1, 4, 4, 4]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
+      "       [1, 0, 0, 1],\n",
+      "       [3, 2, 1, 3],\n",
+      "       [1, 4, 4, 4]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [2, 0, 0, 1],\n",
+      "       [3, 2, 1, 3],\n",
+      "       [1, 4, 4, 4]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [2, 1, 0, 2],\n",
+      "       [3, 2, 1, 3],\n",
+      "       [1, 4, 4, 4]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
+      "       [0, 2, 1, 2],\n",
+      "       [3, 2, 1, 3],\n",
+      "       [0, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [0, 1, 0, 2],\n",
+      "       [1, 3, 2, 3],\n",
+      "       [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [1, 0, 1, 2],\n",
+      "       [1, 3, 2, 3],\n",
+      "       [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
+      "       [0, 0, 1, 2],\n",
+      "       [2, 3, 2, 3],\n",
+      "       [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [1, 1, 1, 2],\n",
+      "       [2, 3, 2, 3],\n",
+      "       [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [1, 1, 2, 2],\n",
+      "       [2, 3, 2, 3],\n",
+      "       [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [1, 1, 0, 2],\n",
+      "       [2, 3, 3, 3],\n",
+      "       [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 0, 1],\n",
+      "       [0, 0, 2, 2],\n",
+      "       [0, 2, 3, 4],\n",
+      "       [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n",
+      "       [0, 0, 2, 2],\n",
+      "       [0, 3, 3, 4],\n",
+      "       [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [1, 0, 2, 2],\n",
+      "       [1, 3, 3, 4],\n",
+      "       [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
+      "       [0, 0, 2, 2],\n",
+      "       [2, 3, 3, 4],\n",
+      "       [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n",
+      "       [0, 1, 2, 2],\n",
+      "       [2, 3, 3, 4],\n",
+      "       [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
+      "       [1, 1, 2, 2],\n",
+      "       [2, 3, 3, 4],\n",
+      "       [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n",
+      "       [1, 2, 2, 2],\n",
+      "       [2, 3, 3, 4],\n",
+      "       [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n",
+      "       [2, 2, 2, 2],\n",
+      "       [2, 3, 3, 4],\n",
+      "       [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 1],\n",
+      "       [1, 2, 2, 2],\n",
+      "       [3, 3, 3, 4],\n",
+      "       [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 1],\n",
+      "       [0, 2, 2, 2],\n",
+      "       [1, 3, 3, 4],\n",
+      "       [4, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(28., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 0],\n",
+      "       [3, 2, 0, 2],\n",
+      "       [1, 4, 4, 0],\n",
+      "       [4, 1, 4, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 0],\n",
+      "       [3, 2, 1, 0],\n",
+      "       [1, 4, 0, 2],\n",
+      "       [4, 1, 5, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 1],\n",
+      "       [3, 2, 0, 0],\n",
+      "       [1, 4, 1, 2],\n",
+      "       [4, 1, 5, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 0],\n",
+      "       [3, 2, 1, 1],\n",
+      "       [1, 4, 1, 2],\n",
+      "       [4, 1, 5, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 1, 0],\n",
+      "       [3, 2, 0, 1],\n",
+      "       [1, 4, 2, 2],\n",
+      "       [4, 1, 5, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 2, 0],\n",
+      "       [3, 2, 1, 1],\n",
+      "       [1, 4, 2, 2],\n",
+      "       [4, 1, 5, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(76., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 2],\n",
+      "       [1, 3, 2, 2],\n",
+      "       [0, 1, 4, 3],\n",
+      "       [0, 4, 1, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 1, 0],\n",
+      "       [0, 3, 2, 3],\n",
+      "       [0, 1, 4, 3],\n",
+      "       [1, 4, 1, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 1, 0],\n",
+      "       [0, 3, 2, 0],\n",
+      "       [0, 1, 4, 4],\n",
+      "       [2, 4, 1, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 0],\n",
+      "       [0, 3, 2, 2],\n",
+      "       [0, 1, 4, 4],\n",
+      "       [3, 4, 1, 6]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(40., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 2, 1],\n",
+      "       [1, 0, 3, 3],\n",
+      "       [0, 0, 1, 5],\n",
+      "       [3, 4, 1, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [0, 1, 2, 3],\n",
+      "       [1, 0, 3, 5],\n",
+      "       [3, 4, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [0, 2, 2, 3],\n",
       "       [1, 1, 3, 5],\n",
-      "       [0, 0, 0, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
-      "       [0, 5, 7, 6],\n",
-      "       [0, 2, 3, 5],\n",
-      "       [0, 1, 0, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
-      "       [0, 5, 7, 6],\n",
+      "       [3, 4, 2, 6]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [0, 0, 3, 3],\n",
       "       [1, 2, 3, 5],\n",
-      "       [0, 0, 1, 2]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
-      "       [1, 5, 7, 6],\n",
-      "       [0, 2, 3, 5],\n",
-      "       [1, 0, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
-      "       [2, 5, 7, 6],\n",
-      "       [0, 2, 3, 5],\n",
-      "       [1, 0, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
-      "       [2, 5, 7, 6],\n",
-      "       [1, 2, 3, 5],\n",
-      "       [1, 0, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
-      "       [2, 5, 7, 6],\n",
-      "       [2, 2, 3, 5],\n",
-      "       [0, 1, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
-      "       [3, 5, 7, 6],\n",
-      "       [0, 2, 3, 5],\n",
-      "       [1, 1, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
-      "       [3, 5, 7, 6],\n",
-      "       [1, 2, 3, 5],\n",
-      "       [1, 1, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
-      "       [3, 5, 7, 6],\n",
-      "       [2, 2, 3, 5],\n",
-      "       [1, 1, 1, 2]], dtype=int32), action_mask=Array([False,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
-      "       [3, 5, 7, 6],\n",
-      "       [3, 3, 5, 1],\n",
-      "       [2, 1, 2, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
-      "       [4, 5, 7, 6],\n",
-      "       [2, 3, 5, 1],\n",
-      "       [1, 1, 2, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
-      "       [4, 5, 7, 6],\n",
-      "       [2, 3, 5, 1],\n",
-      "       [1, 0, 2, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
-      "       [4, 5, 7, 6],\n",
-      "       [2, 3, 5, 1],\n",
-      "       [0, 1, 1, 3]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
-      "       [4, 5, 7, 6],\n",
-      "       [2, 3, 5, 1],\n",
-      "       [2, 0, 2, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
-      "       [4, 5, 7, 6],\n",
-      "       [3, 3, 5, 1],\n",
-      "       [1, 0, 2, 3]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
-      "       [4, 5, 7, 6],\n",
-      "       [0, 4, 5, 1],\n",
-      "       [1, 1, 2, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "TimeStep(step_type=Array(2, dtype=int8), reward=Array(0., dtype=float32), discount=Array(0., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
-      "       [4, 5, 7, 6],\n",
-      "       [1, 4, 5, 1],\n",
-      "       [2, 1, 2, 3]], dtype=int32), action_mask=Array([False, False, False, False], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
-      "3004.0\n"
+      "       [3, 4, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 1],\n",
+      "       [0, 0, 0, 3],\n",
+      "       [1, 2, 4, 5],\n",
+      "       [3, 4, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 2, 1],\n",
+      "       [0, 0, 1, 3],\n",
+      "       [1, 2, 4, 5],\n",
+      "       [3, 4, 2, 6]], dtype=int32), action_mask=Array([ True, False, False,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 0],\n",
+      "       [1, 3, 0, 1],\n",
+      "       [1, 2, 4, 5],\n",
+      "       [3, 4, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 0],\n",
+      "       [2, 3, 0, 1],\n",
+      "       [2, 2, 4, 5],\n",
+      "       [3, 4, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 0],\n",
+      "       [1, 3, 0, 1],\n",
+      "       [3, 2, 4, 5],\n",
+      "       [3, 4, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
+      "       [1, 3, 1, 1],\n",
+      "       [1, 2, 4, 5],\n",
+      "       [4, 4, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
+      "       [1, 3, 1, 1],\n",
+      "       [2, 2, 4, 5],\n",
+      "       [4, 4, 2, 6]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(44., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [1, 1, 3, 2],\n",
+      "       [0, 3, 4, 5],\n",
+      "       [0, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
+      "       [0, 1, 3, 2],\n",
+      "       [0, 3, 4, 5],\n",
+      "       [1, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [1, 2, 3, 2],\n",
+      "       [0, 3, 4, 5],\n",
+      "       [1, 5, 2, 6]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 1],\n",
+      "       [0, 2, 3, 2],\n",
+      "       [0, 3, 4, 5],\n",
+      "       [2, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 2],\n",
+      "       [2, 2, 3, 2],\n",
+      "       [0, 3, 4, 5],\n",
+      "       [2, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [0, 2, 3, 3],\n",
+      "       [1, 3, 4, 5],\n",
+      "       [3, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [1, 0, 2, 4],\n",
+      "       [1, 3, 4, 5],\n",
+      "       [3, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [0, 0, 2, 4],\n",
+      "       [2, 3, 4, 5],\n",
+      "       [3, 5, 2, 6]], dtype=int32), action_mask=Array([ True, False, False,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 0],\n",
+      "       [2, 4, 0, 0],\n",
+      "       [2, 3, 4, 5],\n",
+      "       [3, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
+      "       [1, 4, 1, 0],\n",
+      "       [3, 3, 4, 5],\n",
+      "       [3, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
+      "       [1, 4, 1, 0],\n",
+      "       [1, 3, 4, 5],\n",
+      "       [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
+      "       [0, 4, 1, 0],\n",
+      "       [2, 3, 4, 5],\n",
+      "       [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 0],\n",
+      "       [0, 4, 1, 1],\n",
+      "       [2, 3, 4, 5],\n",
+      "       [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 2, 0],\n",
+      "       [1, 4, 1, 1],\n",
+      "       [2, 3, 4, 5],\n",
+      "       [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 0, 0],\n",
+      "       [1, 4, 2, 1],\n",
+      "       [2, 3, 4, 5],\n",
+      "       [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 0, 1],\n",
+      "       [2, 4, 2, 1],\n",
+      "       [2, 3, 4, 5],\n",
+      "       [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 0, 0],\n",
+      "       [0, 4, 2, 2],\n",
+      "       [3, 3, 4, 5],\n",
+      "       [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 2, 0],\n",
+      "       [1, 4, 2, 2],\n",
+      "       [3, 3, 4, 5],\n",
+      "       [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 0],\n",
+      "       [1, 4, 3, 2],\n",
+      "       [3, 3, 4, 5],\n",
+      "       [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 0],\n",
+      "       [1, 4, 3, 2],\n",
+      "       [4, 4, 5, 2],\n",
+      "       [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(72., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
+      "       [2, 1, 3, 0],\n",
+      "       [1, 5, 5, 3],\n",
+      "       [5, 5, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(64., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 1],\n",
+      "       [2, 0, 3, 0],\n",
+      "       [1, 1, 5, 3],\n",
+      "       [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 0],\n",
+      "       [2, 0, 3, 1],\n",
+      "       [1, 1, 5, 3],\n",
+      "       [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 1],\n",
+      "       [2, 2, 3, 1],\n",
+      "       [1, 1, 5, 3],\n",
+      "       [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 0],\n",
+      "       [2, 2, 3, 2],\n",
+      "       [1, 1, 5, 3],\n",
+      "       [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 2],\n",
+      "       [1, 3, 3, 2],\n",
+      "       [0, 2, 5, 3],\n",
+      "       [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
+      "       [0, 3, 3, 3],\n",
+      "       [1, 2, 5, 3],\n",
+      "       [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 0],\n",
+      "       [0, 3, 3, 0],\n",
+      "       [1, 2, 5, 4],\n",
+      "       [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 2],\n",
+      "       [0, 0, 1, 4],\n",
+      "       [1, 2, 5, 4],\n",
+      "       [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
+      "       [0, 0, 1, 2],\n",
+      "       [1, 2, 5, 5],\n",
+      "       [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
+      "       [0, 1, 1, 2],\n",
+      "       [1, 2, 5, 5],\n",
+      "       [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [2, 1, 1, 2],\n",
+      "       [2, 2, 5, 5],\n",
+      "       [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [1, 1, 1, 2],\n",
+      "       [3, 2, 5, 5],\n",
+      "       [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(68., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [0, 1, 2, 2],\n",
+      "       [1, 3, 2, 6],\n",
+      "       [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(136., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [0, 1, 0, 0],\n",
+      "       [1, 3, 2, 2],\n",
+      "       [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [0, 1, 0, 1],\n",
+      "       [1, 3, 2, 2],\n",
+      "       [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [1, 1, 0, 2],\n",
+      "       [1, 3, 2, 2],\n",
+      "       [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [0, 1, 0, 1],\n",
+      "       [2, 3, 2, 3],\n",
+      "       [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [1, 0, 0, 2],\n",
+      "       [2, 3, 2, 3],\n",
+      "       [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
+      "       [0, 0, 1, 2],\n",
+      "       [2, 3, 2, 3],\n",
+      "       [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
+      "       [1, 0, 1, 2],\n",
+      "       [2, 3, 2, 3],\n",
+      "       [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
+      "       [1, 1, 1, 2],\n",
+      "       [2, 3, 2, 3],\n",
+      "       [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [1, 2, 1, 2],\n",
+      "       [2, 3, 2, 3],\n",
+      "       [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True, False, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 0],\n",
+      "       [1, 2, 1, 2],\n",
+      "       [2, 3, 2, 3],\n",
+      "       [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 0],\n",
+      "       [2, 2, 1, 2],\n",
+      "       [2, 3, 2, 3],\n",
+      "       [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
+      "       [0, 2, 2, 2],\n",
+      "       [3, 3, 2, 3],\n",
+      "       [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 1],\n",
+      "       [0, 2, 0, 2],\n",
+      "       [3, 3, 3, 3],\n",
+      "       [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 1],\n",
+      "       [1, 2, 0, 2],\n",
+      "       [3, 3, 0, 3],\n",
+      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
+      "       [1, 2, 2, 2],\n",
+      "       [3, 3, 1, 3],\n",
+      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(28., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 2],\n",
+      "       [0, 1, 2, 3],\n",
+      "       [0, 4, 1, 3],\n",
+      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 2, 0],\n",
+      "       [0, 2, 2, 2],\n",
+      "       [0, 4, 1, 4],\n",
+      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [0, 2, 3, 2],\n",
+      "       [1, 4, 1, 4],\n",
+      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, False, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
+      "       [2, 3, 2, 0],\n",
+      "       [1, 4, 1, 4],\n",
+      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
+      "       [0, 2, 3, 2],\n",
+      "       [1, 4, 1, 4],\n",
+      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 2],\n",
+      "       [1, 2, 3, 2],\n",
+      "       [1, 4, 1, 4],\n",
+      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [1, 2, 3, 3],\n",
+      "       [2, 4, 1, 4],\n",
+      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
+      "       [0, 1, 2, 4],\n",
+      "       [2, 4, 1, 4],\n",
+      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [1, 1, 2, 1],\n",
+      "       [2, 4, 1, 5],\n",
+      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
+      "       [0, 2, 2, 1],\n",
+      "       [2, 4, 1, 5],\n",
+      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [0, 2, 3, 1],\n",
+      "       [2, 4, 1, 5],\n",
+      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [0, 2, 3, 2],\n",
+      "       [2, 4, 1, 5],\n",
+      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, False, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 1, 0],\n",
+      "       [2, 3, 2, 0],\n",
+      "       [2, 4, 1, 5],\n",
+      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 1, 0],\n",
+      "       [1, 3, 2, 0],\n",
+      "       [3, 4, 1, 5],\n",
+      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 1, 0],\n",
+      "       [2, 3, 2, 0],\n",
+      "       [3, 4, 1, 5],\n",
+      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 2],\n",
+      "       [0, 2, 3, 2],\n",
+      "       [3, 4, 1, 5],\n",
+      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [1, 2, 3, 3],\n",
+      "       [3, 4, 1, 5],\n",
+      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 0],\n",
+      "       [1, 2, 4, 0],\n",
+      "       [3, 4, 1, 5],\n",
+      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 0],\n",
+      "       [2, 2, 4, 0],\n",
+      "       [3, 4, 1, 5],\n",
+      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 0, 0],\n",
+      "       [3, 4, 0, 1],\n",
+      "       [3, 4, 1, 5],\n",
+      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(48., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [2, 0, 1, 1],\n",
+      "       [4, 5, 1, 5],\n",
+      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
+      "       [2, 0, 0, 1],\n",
+      "       [4, 5, 2, 5],\n",
+      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
+      "       [2, 0, 1, 1],\n",
+      "       [4, 5, 2, 5],\n",
+      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [1, 0, 2, 2],\n",
+      "       [4, 5, 2, 5],\n",
+      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 0, 1],\n",
+      "       [1, 0, 0, 2],\n",
+      "       [4, 5, 3, 5],\n",
+      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n",
+      "       [1, 2, 0, 2],\n",
+      "       [4, 5, 3, 5],\n",
+      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 0, 1],\n",
+      "       [2, 2, 0, 2],\n",
+      "       [4, 5, 3, 5],\n",
+      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 1],\n",
+      "       [3, 2, 0, 2],\n",
+      "       [4, 5, 3, 5],\n",
+      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
+      "       [3, 2, 1, 2],\n",
+      "       [4, 5, 3, 5],\n",
+      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 2],\n",
+      "       [3, 2, 1, 2],\n",
+      "       [4, 5, 3, 5],\n",
+      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [3, 2, 2, 3],\n",
+      "       [4, 5, 3, 5],\n",
+      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 0],\n",
+      "       [3, 3, 3, 0],\n",
+      "       [4, 5, 3, 5],\n",
+      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 0],\n",
+      "       [3, 3, 1, 0],\n",
+      "       [4, 5, 4, 5],\n",
+      "       [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 1, 0],\n",
+      "       [3, 3, 0, 0],\n",
+      "       [4, 5, 1, 5],\n",
+      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 1, 0],\n",
+      "       [3, 3, 0, 0],\n",
+      "       [4, 5, 2, 5],\n",
+      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 1],\n",
+      "       [3, 3, 1, 0],\n",
+      "       [4, 5, 2, 5],\n",
+      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 1, 0],\n",
+      "       [3, 3, 1, 1],\n",
+      "       [4, 5, 2, 5],\n",
+      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 1],\n",
+      "       [3, 3, 2, 1],\n",
+      "       [4, 5, 2, 5],\n",
+      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 1, 0],\n",
+      "       [3, 3, 0, 2],\n",
+      "       [4, 5, 3, 5],\n",
+      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 1],\n",
+      "       [3, 3, 1, 2],\n",
+      "       [4, 5, 3, 5],\n",
+      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 2],\n",
+      "       [1, 4, 1, 2],\n",
+      "       [4, 5, 3, 5],\n",
+      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
+      "       [1, 4, 2, 3],\n",
+      "       [4, 5, 3, 5],\n",
+      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 0, 1],\n",
+      "       [1, 4, 2, 3],\n",
+      "       [4, 5, 3, 5],\n",
+      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 2, 0],\n",
+      "       [1, 4, 2, 3],\n",
+      "       [4, 5, 3, 5],\n",
+      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 1, 0],\n",
+      "       [1, 4, 3, 3],\n",
+      "       [4, 5, 3, 5],\n",
+      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 1],\n",
+      "       [1, 4, 1, 3],\n",
+      "       [4, 5, 4, 5],\n",
+      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 2, 2],\n",
+      "       [1, 4, 1, 3],\n",
+      "       [4, 5, 4, 5],\n",
+      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 0],\n",
+      "       [1, 4, 1, 3],\n",
+      "       [4, 5, 4, 5],\n",
+      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 3, 1, 0],\n",
+      "       [2, 4, 2, 3],\n",
+      "       [4, 5, 4, 5],\n",
+      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 3, 1],\n",
+      "       [2, 4, 2, 3],\n",
+      "       [4, 5, 4, 5],\n",
+      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 1],\n",
+      "       [2, 4, 2, 3],\n",
+      "       [4, 5, 4, 5],\n",
+      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([False,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 3, 2],\n",
+      "       [2, 4, 2, 3],\n",
+      "       [4, 5, 4, 5],\n",
+      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([False,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 2, 1],\n",
+      "       [2, 4, 2, 3],\n",
+      "       [4, 5, 4, 5],\n",
+      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, False,  True, False], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 0, 1],\n",
+      "       [3, 4, 3, 3],\n",
+      "       [4, 5, 4, 5],\n",
+      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 3, 1],\n",
+      "       [0, 3, 4, 4],\n",
+      "       [4, 5, 4, 5],\n",
+      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 1],\n",
+      "       [1, 3, 3, 4],\n",
+      "       [4, 5, 5, 5],\n",
+      "       [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(64., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 2, 1],\n",
+      "       [1, 3, 1, 4],\n",
+      "       [4, 5, 3, 5],\n",
+      "       [5, 6, 6, 7]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(128., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 1, 0],\n",
+      "       [1, 3, 1, 4],\n",
+      "       [4, 5, 3, 5],\n",
+      "       [5, 7, 7, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 0, 0],\n",
+      "       [2, 3, 2, 4],\n",
+      "       [4, 5, 3, 5],\n",
+      "       [5, 7, 7, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 0],\n",
+      "       [3, 3, 2, 4],\n",
+      "       [4, 5, 3, 5],\n",
+      "       [5, 7, 7, 1]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(272., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 2, 0],\n",
+      "       [4, 2, 4, 0],\n",
+      "       [4, 5, 3, 5],\n",
+      "       [5, 8, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 2, 0],\n",
+      "       [2, 2, 4, 0],\n",
+      "       [5, 5, 3, 1],\n",
+      "       [5, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(64., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 2, 1],\n",
+      "       [0, 2, 4, 0],\n",
+      "       [2, 5, 3, 1],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 2, 1],\n",
+      "       [0, 2, 4, 0],\n",
+      "       [2, 5, 3, 2],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 2, 0],\n",
+      "       [1, 2, 4, 1],\n",
+      "       [2, 5, 3, 2],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 0, 1],\n",
+      "       [1, 2, 4, 1],\n",
+      "       [2, 5, 3, 2],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
+      "       [2, 3, 4, 2],\n",
+      "       [2, 5, 3, 2],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 1],\n",
+      "       [0, 3, 4, 0],\n",
+      "       [3, 5, 3, 3],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 0],\n",
+      "       [0, 3, 4, 1],\n",
+      "       [3, 5, 3, 3],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 0, 0],\n",
+      "       [3, 4, 1, 0],\n",
+      "       [3, 5, 4, 1],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [2, 4, 1, 0],\n",
+      "       [4, 5, 4, 1],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [2, 4, 1, 0],\n",
+      "       [4, 5, 4, 2],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
+      "       [2, 4, 1, 1],\n",
+      "       [4, 5, 4, 2],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
+      "       [0, 2, 4, 2],\n",
+      "       [4, 5, 4, 2],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(40., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
+      "       [1, 2, 0, 1],\n",
+      "       [4, 5, 5, 3],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(64., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [0, 1, 2, 1],\n",
+      "       [1, 4, 6, 3],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 2, 0],\n",
+      "       [0, 1, 2, 2],\n",
+      "       [1, 4, 6, 3],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [1, 1, 3, 2],\n",
+      "       [1, 4, 6, 3],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
+      "       [0, 1, 3, 2],\n",
+      "       [2, 4, 6, 3],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [1, 1, 3, 2],\n",
+      "       [2, 4, 6, 3],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
+      "       [2, 3, 2, 1],\n",
+      "       [2, 4, 6, 3],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 2, 0],\n",
+      "       [1, 3, 2, 1],\n",
+      "       [3, 4, 6, 3],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
+      "       [1, 3, 3, 1],\n",
+      "       [3, 4, 6, 3],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n",
+      "       [1, 4, 1, 0],\n",
+      "       [3, 4, 6, 3],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
+      "       [2, 0, 1, 1],\n",
+      "       [3, 5, 6, 3],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [2, 0, 2, 1],\n",
+      "       [3, 5, 6, 3],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [2, 0, 2, 2],\n",
+      "       [3, 5, 6, 3],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 1, 0],\n",
+      "       [3, 2, 0, 0],\n",
+      "       [3, 5, 6, 3],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 2],\n",
+      "       [1, 2, 1, 0],\n",
+      "       [4, 5, 6, 3],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [1, 2, 1, 2],\n",
+      "       [4, 5, 6, 3],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n",
+      "       [1, 2, 1, 2],\n",
+      "       [4, 5, 6, 3],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
+      "       [2, 2, 1, 2],\n",
+      "       [4, 5, 6, 3],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 1, 0],\n",
+      "       [3, 1, 2, 0],\n",
+      "       [4, 5, 6, 3],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 1],\n",
+      "       [3, 1, 2, 0],\n",
+      "       [4, 5, 6, 3],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 0],\n",
+      "       [3, 2, 2, 1],\n",
+      "       [4, 5, 6, 3],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 1],\n",
+      "       [3, 3, 1, 0],\n",
+      "       [4, 5, 6, 3],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 1, 0],\n",
+      "       [3, 3, 1, 1],\n",
+      "       [4, 5, 6, 3],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 1],\n",
+      "       [3, 3, 2, 1],\n",
+      "       [4, 5, 6, 3],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 1],\n",
+      "       [3, 3, 2, 2],\n",
+      "       [4, 5, 6, 3],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(28., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 1, 0],\n",
+      "       [4, 3, 0, 0],\n",
+      "       [4, 5, 6, 3],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 0],\n",
+      "       [2, 3, 1, 0],\n",
+      "       [5, 5, 6, 3],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 0],\n",
+      "       [2, 3, 2, 0],\n",
+      "       [5, 5, 6, 3],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(64., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 0],\n",
+      "       [2, 3, 2, 0],\n",
+      "       [6, 6, 3, 1],\n",
+      "       [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(136., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
+      "       [0, 3, 2, 1],\n",
+      "       [3, 6, 3, 1],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 0],\n",
+      "       [0, 3, 2, 0],\n",
+      "       [3, 6, 3, 2],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 0, 1],\n",
+      "       [3, 2, 0, 0],\n",
+      "       [3, 6, 3, 2],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [2, 2, 0, 1],\n",
+      "       [4, 6, 3, 2],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
+      "       [2, 2, 0, 2],\n",
+      "       [4, 6, 3, 2],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
+      "       [2, 2, 1, 0],\n",
+      "       [4, 6, 3, 3],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(24., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [1, 0, 3, 1],\n",
+      "       [0, 4, 6, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [0, 1, 3, 2],\n",
+      "       [1, 4, 6, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
+      "       [1, 3, 2, 0],\n",
+      "       [1, 4, 6, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
+      "       [1, 3, 2, 0],\n",
+      "       [2, 4, 6, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [2, 3, 2, 1],\n",
+      "       [2, 4, 6, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False,  True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [0, 3, 2, 1],\n",
+      "       [3, 4, 6, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
+      "       [0, 3, 2, 2],\n",
+      "       [3, 4, 6, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 0, 0],\n",
+      "       [1, 3, 2, 2],\n",
+      "       [3, 4, 6, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 2],\n",
+      "       [0, 1, 3, 3],\n",
+      "       [3, 4, 6, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 0, 0],\n",
+      "       [1, 4, 1, 0],\n",
+      "       [3, 4, 6, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
+      "       [2, 2, 1, 0],\n",
+      "       [3, 5, 6, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n",
+      "       [0, 0, 3, 1],\n",
+      "       [3, 5, 6, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
+      "       [1, 0, 3, 2],\n",
+      "       [3, 5, 6, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 0, 0],\n",
+      "       [1, 3, 2, 0],\n",
+      "       [3, 5, 6, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 0, 1],\n",
+      "       [2, 3, 2, 0],\n",
+      "       [3, 5, 6, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 0, 0],\n",
+      "       [2, 3, 2, 1],\n",
+      "       [3, 5, 6, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 1, 2],\n",
+      "       [2, 3, 2, 1],\n",
+      "       [3, 5, 6, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 0, 1],\n",
+      "       [2, 3, 2, 1],\n",
+      "       [3, 5, 6, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 0, 1],\n",
+      "       [3, 3, 2, 2],\n",
+      "       [3, 5, 6, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 1],\n",
+      "       [0, 3, 2, 2],\n",
+      "       [4, 5, 6, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 0, 0],\n",
+      "       [3, 3, 1, 0],\n",
+      "       [4, 5, 6, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(24., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 3],\n",
+      "       [0, 0, 4, 1],\n",
+      "       [4, 5, 6, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 0, 0],\n",
+      "       [4, 1, 1, 0],\n",
+      "       [4, 5, 6, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 3, 1, 0],\n",
+      "       [1, 1, 1, 0],\n",
+      "       [5, 5, 6, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 3, 0, 1],\n",
+      "       [1, 1, 2, 0],\n",
+      "       [5, 5, 6, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 3, 0, 1],\n",
+      "       [1, 1, 2, 1],\n",
+      "       [5, 5, 6, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 0, 0],\n",
+      "       [1, 1, 2, 2],\n",
+      "       [5, 5, 6, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 3, 0, 1],\n",
+      "       [2, 1, 2, 2],\n",
+      "       [5, 5, 6, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(72., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 3, 1],\n",
+      "       [0, 2, 1, 3],\n",
+      "       [0, 6, 6, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 3, 1],\n",
+      "       [2, 2, 1, 3],\n",
+      "       [1, 6, 6, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(136., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 1, 0, 1],\n",
+      "       [3, 1, 3, 0],\n",
+      "       [1, 7, 4, 0],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
+      "       [4, 2, 3, 0],\n",
+      "       [1, 7, 4, 1],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n",
+      "       [0, 4, 2, 3],\n",
+      "       [1, 7, 4, 1],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n",
+      "       [0, 4, 2, 3],\n",
+      "       [2, 7, 4, 1],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
+      "       [1, 4, 2, 3],\n",
+      "       [2, 7, 4, 1],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 2],\n",
+      "       [1, 4, 2, 3],\n",
+      "       [2, 7, 4, 1],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 0, 2],\n",
+      "       [1, 4, 2, 3],\n",
+      "       [2, 7, 4, 1],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 0, 2],\n",
+      "       [2, 4, 2, 3],\n",
+      "       [2, 7, 4, 1],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 2],\n",
+      "       [1, 4, 2, 3],\n",
+      "       [3, 7, 4, 1],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 2, 1],\n",
+      "       [1, 4, 2, 3],\n",
+      "       [3, 7, 4, 1],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False,  True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 1, 1],\n",
+      "       [1, 4, 3, 3],\n",
+      "       [3, 7, 4, 1],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([False,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 1, 1],\n",
+      "       [1, 4, 4, 0],\n",
+      "       [3, 7, 4, 1],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 2, 0],\n",
+      "       [1, 4, 1, 0],\n",
+      "       [3, 7, 5, 2],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 1, 0],\n",
+      "       [1, 4, 1, 0],\n",
+      "       [3, 7, 5, 2],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 0, 0],\n",
+      "       [1, 4, 2, 1],\n",
+      "       [3, 7, 5, 2],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 3, 2],\n",
+      "       [1, 4, 2, 1],\n",
+      "       [3, 7, 5, 2],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 3, 2],\n",
+      "       [2, 4, 2, 1],\n",
+      "       [3, 7, 5, 2],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 2, 1],\n",
+      "       [2, 4, 2, 1],\n",
+      "       [3, 7, 5, 2],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False,  True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 0],\n",
+      "       [2, 4, 3, 2],\n",
+      "       [3, 7, 5, 2],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 1],\n",
+      "       [2, 4, 3, 0],\n",
+      "       [3, 7, 5, 3],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 1],\n",
+      "       [2, 4, 3, 1],\n",
+      "       [3, 7, 5, 3],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 1],\n",
+      "       [2, 4, 3, 2],\n",
+      "       [3, 7, 5, 3],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([False,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 3, 2],\n",
+      "       [2, 4, 3, 2],\n",
+      "       [3, 7, 5, 3],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(24., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 1],\n",
+      "       [2, 4, 4, 3],\n",
+      "       [3, 7, 5, 3],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 1],\n",
+      "       [2, 4, 4, 1],\n",
+      "       [3, 7, 5, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 1],\n",
+      "       [2, 4, 4, 2],\n",
+      "       [3, 7, 5, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 1],\n",
+      "       [2, 5, 2, 0],\n",
+      "       [3, 7, 5, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
+      "       [3, 5, 2, 1],\n",
+      "       [3, 7, 5, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 0],\n",
+      "       [0, 5, 2, 2],\n",
+      "       [4, 7, 5, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 0, 0],\n",
+      "       [5, 3, 0, 1],\n",
+      "       [4, 7, 5, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 2],\n",
+      "       [1, 5, 3, 1],\n",
+      "       [4, 7, 5, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 1, 0],\n",
+      "       [1, 5, 3, 1],\n",
+      "       [4, 7, 5, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 1, 0],\n",
+      "       [1, 5, 3, 1],\n",
+      "       [4, 7, 5, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 1, 0],\n",
+      "       [1, 5, 3, 1],\n",
+      "       [4, 7, 5, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 1, 0, 1],\n",
+      "       [1, 5, 3, 1],\n",
+      "       [4, 7, 5, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 1, 1, 0],\n",
+      "       [1, 5, 3, 2],\n",
+      "       [4, 7, 5, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 1, 0],\n",
+      "       [1, 5, 3, 2],\n",
+      "       [4, 7, 5, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True,  True, False, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 2, 1],\n",
+      "       [1, 5, 3, 2],\n",
+      "       [4, 7, 5, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False,  True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "TimeStep(step_type=Array(2, dtype=int8), reward=Array(4., dtype=float32), discount=Array(0., dtype=float32), observation=Observation(board=Array([[1, 3, 2, 1],\n",
+      "       [2, 5, 3, 2],\n",
+      "       [4, 7, 5, 4],\n",
+      "       [7, 8, 1, 5]], dtype=int32), action_mask=Array([False, False, False, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
+      "3716.0\n"
      ]
     }
    ],
@@ -1222,7 +1359,7 @@
     "import jax, jumanji\n",
     "\n",
     "env = jumanji.make(\"Game2048-v1\")\n",
-    "key = jax.random.PRNGKey(48)\n",
+    "key = jax.random.PRNGKey(0)\n",
     "jit_reset = jax.jit(env.reset)\n",
     "jit_step = jax.jit(env.step)\n",
     "state, timestep = jax.jit(env.reset)(key)\n",
@@ -1244,12 +1381,464 @@
    "metadata": {
     "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2024-06-05T05:15:43.041491500Z",
-     "start_time": "2024-06-05T05:15:37.325953600Z"
+     "end_time": "2024-06-05T07:41:33.703431900Z",
+     "start_time": "2024-06-05T07:41:26.102578200Z"
     }
    },
    "id": "f166e09c5be1a8fb"
   },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "outputs": [],
+   "source": [
+    "import jax.random\n",
+    "from problem.rl_env.jumanji.jumanji_2048 import Jumanji_2048\n",
+    "\n",
+    "\n",
+    "def random_policy(state, params, obs):\n",
+    "    key = jax.random.key(obs.sum())\n",
+    "    actions = jax.random.normal(key, (4,))\n",
+    "    return actions\n",
+    "\n",
+    "problem = Jumanji_2048(max_step=10000, repeat_times=10, guarantee_invalid_action=True)\n",
+    "state = problem.setup()\n",
+    "jit_evaluate = jax.jit(lambda state, randkey: problem.evaluate(state, randkey, random_policy, None))"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2024-06-05T08:06:59.491563700Z",
+     "start_time": "2024-06-05T08:06:59.465404900Z"
+    }
+   },
+   "id": "187326d08ac1eeb4"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1193.2001\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "reward = jit_evaluate(state, randkey)\n",
+    "print(reward)"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2024-06-05T08:07:21.630420300Z",
+     "start_time": "2024-06-05T08:07:21.107419400Z"
+    }
+   },
+   "id": "4b3506db87568d81"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [0, 0, 0, 0],\n",
+      "       [0, 0, 0, 0],\n",
+      "       [0, 1, 0, 1]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(2, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
+      "       [1, 0, 0, 0],\n",
+      "       [0, 0, 0, 0],\n",
+      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(2, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [0, 0, 0, 0],\n",
+      "       [0, 0, 0, 0],\n",
+      "       [1, 1, 1, 1]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(2, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [0, 1, 0, 0],\n",
+      "       [0, 0, 0, 0],\n",
+      "       [2, 2, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(4, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [1, 0, 0, 0],\n",
+      "       [0, 0, 0, 0],\n",
+      "       [3, 0, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [1, 0, 0, 0],\n",
+      "       [0, 0, 0, 0],\n",
+      "       [3, 1, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
+      "       [1, 0, 0, 0],\n",
+      "       [0, 0, 0, 0],\n",
+      "       [3, 2, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [0, 0, 0, 0],\n",
+      "       [2, 0, 1, 0],\n",
+      "       [3, 2, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [0, 0, 0, 0],\n",
+      "       [2, 1, 0, 0],\n",
+      "       [3, 2, 0, 1]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
+      "       [0, 0, 0, 0],\n",
+      "       [2, 1, 0, 0],\n",
+      "       [3, 2, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 1, 0],\n",
+      "       [2, 2, 0, 0],\n",
+      "       [3, 0, 0, 1],\n",
+      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 1, 1],\n",
+      "       [2, 2, 0, 0],\n",
+      "       [3, 0, 0, 0],\n",
+      "       [0, 1, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
+      "       [1, 1, 0, 0],\n",
+      "       [2, 2, 0, 0],\n",
+      "       [3, 1, 1, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 1, 1],\n",
+      "       [2, 2, 0, 0],\n",
+      "       [3, 1, 0, 2],\n",
+      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [1, 0, 0, 0],\n",
+      "       [2, 3, 1, 1],\n",
+      "       [3, 1, 1, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [1, 0, 0, 0],\n",
+      "       [2, 3, 0, 1],\n",
+      "       [3, 1, 2, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [0, 0, 2, 1],\n",
+      "       [0, 2, 3, 1],\n",
+      "       [0, 3, 1, 3]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [0, 0, 2, 1],\n",
+      "       [0, 2, 3, 2],\n",
+      "       [1, 3, 1, 3]], dtype=int32), action_mask=Array([ True, False, False,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 2, 1],\n",
+      "       [0, 3, 3, 2],\n",
+      "       [1, 0, 1, 3],\n",
+      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [0, 0, 2, 1],\n",
+      "       [1, 2, 3, 2],\n",
+      "       [2, 3, 1, 3]], dtype=int32), action_mask=Array([ True, False, False,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [2, 1, 1, 0],\n",
+      "       [1, 2, 3, 2],\n",
+      "       [2, 3, 1, 3]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [0, 0, 2, 2],\n",
+      "       [1, 2, 3, 2],\n",
+      "       [2, 3, 1, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 2, 1],\n",
+      "       [2, 3, 3, 3],\n",
+      "       [1, 0, 1, 3],\n",
+      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 2, 1],\n",
+      "       [2, 3, 3, 4],\n",
+      "       [1, 0, 1, 0],\n",
+      "       [0, 0, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(28., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 3, 1],\n",
+      "       [0, 2, 4, 4],\n",
+      "       [0, 0, 0, 2],\n",
+      "       [1, 0, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [1, 0, 0, 4],\n",
+      "       [0, 1, 3, 2],\n",
+      "       [1, 2, 4, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 1, 0],\n",
+      "       [1, 4, 0, 0],\n",
+      "       [1, 3, 2, 0],\n",
+      "       [1, 2, 4, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 2],\n",
+      "       [0, 0, 1, 4],\n",
+      "       [0, 1, 3, 2],\n",
+      "       [1, 2, 4, 1]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 2, 2],\n",
+      "       [0, 2, 3, 4],\n",
+      "       [0, 0, 4, 2],\n",
+      "       [1, 0, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 2],\n",
+      "       [1, 0, 2, 4],\n",
+      "       [0, 1, 3, 2],\n",
+      "       [2, 2, 4, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 2, 2],\n",
+      "       [2, 2, 3, 4],\n",
+      "       [0, 1, 4, 2],\n",
+      "       [0, 0, 0, 1]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 2, 3],\n",
+      "       [0, 3, 3, 4],\n",
+      "       [1, 1, 4, 2],\n",
+      "       [0, 0, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 2, 3],\n",
+      "       [0, 1, 3, 4],\n",
+      "       [0, 0, 4, 2],\n",
+      "       [1, 0, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 2, 3],\n",
+      "       [1, 1, 3, 4],\n",
+      "       [0, 0, 4, 2],\n",
+      "       [0, 0, 0, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 2, 3],\n",
+      "       [0, 1, 3, 4],\n",
+      "       [0, 1, 4, 3],\n",
+      "       [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 2, 3],\n",
+      "       [0, 2, 3, 4],\n",
+      "       [0, 0, 4, 3],\n",
+      "       [0, 0, 1, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 2, 3],\n",
+      "       [0, 2, 3, 4],\n",
+      "       [2, 0, 4, 3],\n",
+      "       [0, 0, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 3, 2, 3],\n",
+      "       [0, 2, 3, 4],\n",
+      "       [0, 0, 4, 3],\n",
+      "       [0, 1, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 3, 2, 3],\n",
+      "       [0, 2, 3, 4],\n",
+      "       [0, 1, 4, 3],\n",
+      "       [1, 0, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n",
+      "       [2, 3, 4, 0],\n",
+      "       [1, 4, 3, 0],\n",
+      "       [2, 0, 0, 0]], dtype=int32), action_mask=Array([False,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n",
+      "       [0, 2, 3, 4],\n",
+      "       [0, 1, 4, 3],\n",
+      "       [0, 0, 1, 2]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n",
+      "       [2, 3, 4, 0],\n",
+      "       [1, 4, 3, 0],\n",
+      "       [1, 2, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n",
+      "       [2, 3, 4, 0],\n",
+      "       [2, 4, 3, 1],\n",
+      "       [0, 2, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n",
+      "       [2, 3, 4, 0],\n",
+      "       [2, 4, 3, 1],\n",
+      "       [2, 1, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n",
+      "       [2, 3, 4, 0],\n",
+      "       [2, 4, 3, 1],\n",
+      "       [2, 2, 0, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n",
+      "       [0, 2, 3, 4],\n",
+      "       [2, 4, 3, 1],\n",
+      "       [0, 1, 2, 3]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n",
+      "       [2, 3, 4, 0],\n",
+      "       [2, 4, 3, 1],\n",
+      "       [1, 2, 3, 1]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n",
+      "       [1, 2, 3, 4],\n",
+      "       [2, 4, 3, 1],\n",
+      "       [1, 2, 3, 1]], dtype=int32), action_mask=Array([ True, False,  True, False], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(44., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 0, 0, 1],\n",
+      "       [1, 3, 0, 1],\n",
+      "       [2, 4, 4, 4],\n",
+      "       [1, 2, 4, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 1, 0, 0],\n",
+      "       [1, 3, 1, 0],\n",
+      "       [2, 5, 4, 1],\n",
+      "       [1, 2, 4, 2]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 1, 1, 1],\n",
+      "       [1, 3, 5, 2],\n",
+      "       [2, 5, 0, 1],\n",
+      "       [1, 2, 0, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 4, 1, 2],\n",
+      "       [1, 3, 5, 2],\n",
+      "       [1, 2, 5, 1],\n",
+      "       [0, 0, 1, 2]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 1, 2, 0],\n",
+      "       [1, 3, 5, 2],\n",
+      "       [1, 2, 5, 1],\n",
+      "       [1, 2, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(80., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 1, 2, 2],\n",
+      "       [2, 3, 6, 2],\n",
+      "       [1, 3, 0, 0],\n",
+      "       [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 4, 1, 3],\n",
+      "       [2, 3, 6, 2],\n",
+      "       [0, 0, 1, 3],\n",
+      "       [0, 0, 1, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 3],\n",
+      "       [2, 3, 6, 2],\n",
+      "       [0, 0, 1, 3],\n",
+      "       [0, 0, 0, 2]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 3],\n",
+      "       [0, 3, 6, 2],\n",
+      "       [0, 0, 1, 3],\n",
+      "       [1, 0, 0, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 3],\n",
+      "       [1, 3, 6, 2],\n",
+      "       [0, 0, 1, 3],\n",
+      "       [0, 0, 1, 2]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 3],\n",
+      "       [1, 3, 6, 2],\n",
+      "       [1, 3, 1, 0],\n",
+      "       [1, 2, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 3],\n",
+      "       [2, 4, 6, 2],\n",
+      "       [1, 2, 1, 0],\n",
+      "       [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 5, 1, 3],\n",
+      "       [2, 2, 6, 2],\n",
+      "       [2, 0, 1, 0],\n",
+      "       [0, 0, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [0, 1, 1, 0],\n",
+      "       [3, 5, 6, 3],\n",
+      "       [3, 2, 2, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [1, 1, 1, 0],\n",
+      "       [0, 5, 6, 3],\n",
+      "       [4, 2, 2, 2]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
+      "       [0, 0, 1, 2],\n",
+      "       [1, 5, 6, 3],\n",
+      "       [0, 4, 2, 3]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
+      "       [1, 2, 0, 0],\n",
+      "       [1, 5, 6, 3],\n",
+      "       [4, 2, 3, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
+      "       [0, 2, 1, 0],\n",
+      "       [2, 5, 6, 0],\n",
+      "       [4, 2, 3, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 1, 1],\n",
+      "       [4, 5, 6, 3],\n",
+      "       [0, 2, 3, 0],\n",
+      "       [0, 0, 1, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 0, 0],\n",
+      "       [4, 5, 6, 3],\n",
+      "       [2, 3, 1, 0],\n",
+      "       [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 6, 3],\n",
+      "       [4, 5, 1, 0],\n",
+      "       [2, 3, 1, 0],\n",
+      "       [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 6, 3],\n",
+      "       [4, 5, 2, 0],\n",
+      "       [2, 3, 0, 0],\n",
+      "       [1, 0, 0, 1]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 6, 3],\n",
+      "       [4, 5, 2, 0],\n",
+      "       [2, 3, 1, 0],\n",
+      "       [2, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 6, 3],\n",
+      "       [1, 4, 5, 2],\n",
+      "       [0, 2, 3, 1],\n",
+      "       [0, 0, 0, 2]], dtype=int32), action_mask=Array([False, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 6, 3],\n",
+      "       [1, 4, 5, 2],\n",
+      "       [2, 3, 1, 1],\n",
+      "       [2, 0, 0, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 6, 3],\n",
+      "       [1, 4, 5, 2],\n",
+      "       [1, 2, 3, 2],\n",
+      "       [0, 0, 0, 2]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 6, 3],\n",
+      "       [1, 4, 5, 2],\n",
+      "       [1, 2, 3, 2],\n",
+      "       [2, 0, 1, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 6, 0],\n",
+      "       [3, 2, 5, 1],\n",
+      "       [2, 4, 3, 3],\n",
+      "       [2, 2, 1, 3]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(24., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 6, 0],\n",
+      "       [0, 2, 5, 1],\n",
+      "       [3, 4, 3, 1],\n",
+      "       [3, 2, 1, 4]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 6],\n",
+      "       [1, 2, 5, 1],\n",
+      "       [3, 4, 3, 1],\n",
+      "       [3, 2, 1, 4]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[6, 0, 0, 1],\n",
+      "       [1, 2, 5, 1],\n",
+      "       [3, 4, 3, 1],\n",
+      "       [3, 2, 1, 4]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
+      "       [6, 2, 5, 1],\n",
+      "       [1, 4, 3, 2],\n",
+      "       [4, 2, 1, 4]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 2, 0],\n",
+      "       [6, 2, 5, 1],\n",
+      "       [1, 4, 3, 2],\n",
+      "       [4, 2, 1, 4]], dtype=int32), action_mask=Array([ True,  True, False,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 2, 1],\n",
+      "       [6, 4, 5, 2],\n",
+      "       [1, 2, 3, 4],\n",
+      "       [4, 1, 1, 0]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 3, 1],\n",
+      "       [6, 4, 5, 2],\n",
+      "       [1, 2, 3, 4],\n",
+      "       [0, 0, 4, 2]], dtype=int32), action_mask=Array([False,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 3, 1],\n",
+      "       [6, 4, 5, 2],\n",
+      "       [1, 2, 3, 4],\n",
+      "       [0, 1, 4, 2]], dtype=int32), action_mask=Array([ True, False,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 1],\n",
+      "       [6, 4, 5, 2],\n",
+      "       [1, 2, 3, 4],\n",
+      "       [1, 4, 2, 0]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 0],\n",
+      "       [2, 4, 5, 1],\n",
+      "       [6, 2, 3, 2],\n",
+      "       [2, 4, 2, 4]], dtype=int32), action_mask=Array([ True,  True, False, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 3, 1],\n",
+      "       [2, 4, 5, 1],\n",
+      "       [6, 2, 3, 2],\n",
+      "       [2, 4, 2, 4]], dtype=int32), action_mask=Array([ True,  True,  True,  True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 3, 1],\n",
+      "       [2, 4, 5, 1],\n",
+      "       [6, 2, 3, 2],\n",
+      "       [2, 4, 2, 4]], dtype=int32), action_mask=Array([ True, False,  True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 3, 1],\n",
+      "       [2, 4, 5, 2],\n",
+      "       [6, 2, 3, 2],\n",
+      "       [2, 4, 2, 4]], dtype=int32), action_mask=Array([ True, False,  True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "TimeStep(step_type=Array(2, dtype=int8), reward=Array(8., dtype=float32), discount=Array(0., dtype=float32), observation=Observation(board=Array([[1, 2, 3, 1],\n",
+      "       [2, 4, 5, 3],\n",
+      "       [6, 2, 3, 4],\n",
+      "       [2, 4, 2, 1]], dtype=int32), action_mask=Array([False, False, False, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
+      "636.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "randkey = jax.random.PRNGKey(14)\n",
+    "jit_policy = jax.jit(random_policy)\n",
+    "total_reward = 0\n",
+    "state, timestep = jax.jit(env.reset)(randkey )\n",
+    "while True:\n",
+    "    board, action_mask = timestep[\"observation\"]\n",
+    "    action = jit_policy(None, None, timestep[\"observation\"][0].reshape(-1))\n",
+    "    score_with_mask = jnp.where(action_mask, action, -jnp.inf)\n",
+    "    action = jnp.argmax(score_with_mask)\n",
+    "    state, timestep = jit_step(state, action)\n",
+    "    done = jnp.all(~timestep[\"observation\"][1])\n",
+    "    print(timestep)\n",
+    "    total_reward += timestep[\"reward\"]\n",
+    "    if done:\n",
+    "        break\n",
+    "print(total_reward)"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2024-06-05T08:09:58.242414600Z",
+     "start_time": "2024-06-05T08:09:56.452642800Z"
+    }
+   },
+   "id": "8bb888fb742b6b06"
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -1258,7 +1847,7 @@
    "metadata": {
     "collapsed": false
    },
-   "id": "187326d08ac1eeb4"
+   "id": "3d1b5c8c646d4f07"
   }
  ],
  "metadata": {
diff --git a/tensorneat/examples/jumanji/train_2048.py b/tensorneat/examples/jumanji/train_2048.py
new file mode 100644
index 0000000..336f07c
--- /dev/null
+++ b/tensorneat/examples/jumanji/train_2048.py
@@ -0,0 +1,119 @@
+import jax, jax.numpy as jnp
+
+from pipeline import Pipeline
+from algorithm.neat import *
+from algorithm.neat.gene.node.default_without_response import NodeGeneWithoutResponse
+from problem.rl_env.jumanji.jumanji_2048 import Jumanji_2048
+from utils import Act, Agg
+
+
+def rot_li(li):  # rotate a list left by one position: [a, b, c] -> [b, c, a]
+    return li[1:] + li[:1]  # slice (not li[0]) so an empty list is returned unchanged
+
+
+def rot_boards(board):
+    """Return four successive ``jnp.rot90`` turns of ``board``, stacked on a new leading axis."""
+
+    def _turn(current, _step):
+        turned = jnp.rot90(current)
+        return turned, turned  # (carry for the next step, rotation emitted by this step)
+
+    return jax.lax.scan(_turn, board, jnp.arange(4, dtype=jnp.int32))[1]
+
+
+direction = ["up", "right", "down", "left"]
+lr_flip_direction = ["up", "left", "down", "right"]
+
+# Entry k stores the label order after k + 1 calls to rot_li; the 4th call restores the start order.
+directions, lr_flip_directions = [], []
+for _ in range(4):
+    direction, lr_flip_direction = rot_li(direction), rot_li(lr_flip_direction)
+    directions.append(list(direction))
+    lr_flip_directions.append(list(lr_flip_direction))
+
+# Plain orderings first, then the orderings for the left-right flipped board.
+full_directions = directions + lr_flip_directions
+
+
+def action_policy(forward_func, obs):
+    """Score the four moves by summing network outputs over eight views of the board.
+
+    ``obs`` is reshaped to 4x4; the rotations of the board and of its left-right
+    flip are each passed through ``forward_func``, and every output is added to
+    the move named at the same position in the matching ``full_directions`` entry.
+    Returns an array ordered (up, right, down, left).
+    """
+    grid = obs.reshape(4, 4)
+    mirrored = jnp.fliplr(grid)
+
+    # Four rotations of the board followed by four rotations of its mirror image.
+    views = jnp.concatenate([rot_boards(grid), rot_boards(mirrored)], axis=0)
+    outputs = jax.vmap(forward_func)(views.reshape(8, -1))
+
+    # Accumulate view by view, output by output, crediting the labelled move.
+    tally = {"up": 0, "right": 0, "down": 0, "left": 0}
+    for view_idx, labels in enumerate(full_directions):
+        for out_idx, label in enumerate(labels):
+            tally[label] += outputs[view_idx, out_idx]
+
+    ordered = [tally[name] for name in ("up", "right", "down", "left")]
+    return jnp.array(ordered)
+
+
+if __name__ == "__main__":
+    pipeline = Pipeline(
+        algorithm=NEAT(
+            species=DefaultSpecies(
+                genome=DefaultGenome(
+                    num_inputs=16,  # action_policy feeds the network flattened 4x4 boards
+                    num_outputs=4,  # action_policy reads outputs 0..3, one per move label
+                    max_nodes=100,
+                    max_conns=1000,
+                    node_gene=NodeGeneWithoutResponse(
+                        activation_default=Act.sigmoid,
+                        activation_options=(
+                            Act.sigmoid,
+                            Act.relu,
+                            Act.tanh,
+                            Act.identity,
+                        ),
+                        aggregation_default=Agg.sum,
+                        aggregation_options=(Agg.sum,),  # sum is the only aggregation offered
+                        activation_replace_rate=0.02,
+                        aggregation_replace_rate=0.02,
+                        bias_mutate_rate=0.03,
+                        bias_init_std=0.5,
+                        bias_mutate_power=0.2,
+                        bias_replace_rate=0.01,
+                    ),
+                    conn_gene=DefaultConnGene(
+                        weight_mutate_rate=0.015,
+                        weight_replace_rate=0.003,
+                        weight_mutate_power=0.5,
+                    ),
+                    mutation=DefaultMutation(node_add=0.001, conn_add=0.002),
+                ),
+                pop_size=1000,
+                species_size=5,
+                survival_threshold=0.1,
+                max_stagnation=7,
+                genome_elitism=3,
+                compatibility_threshold=1.2,
+            ),
+        ),
+        problem=Jumanji_2048(
+            max_step=10000,
+            repeat_times=10,
+            guarantee_invalid_action=True,
+            action_policy=action_policy,  # scores moves over 8 rotated / flipped boards
+        ),
+        generation_limit=1000,
+        fitness_target=13000,
+        save_path="2048.npz",  # Pipeline saves the best genome (nodes, conns, fitness) here
+    )
+
+    # Build the initial pipeline state.
+    state = pipeline.setup()
+    # print(state)  # uncomment to inspect the initial state
+    # Run the evolution loop; presumably stops at fitness_target or generation_limit -- confirm in Pipeline.
+    state, best = pipeline.auto_run(state)
diff --git a/tensorneat/pipeline.py b/tensorneat/pipeline.py
index edaead7..a31808b 100644
--- a/tensorneat/pipeline.py
+++ b/tensorneat/pipeline.py
@@ -19,6 +19,7 @@ class Pipeline:
         generation_limit: int = 1000,
         pre_update: bool = False,
         update_batch_size: int = 10000,
+        save_path=None,
     ):
         assert problem.jitable, "Currently, problem must be jitable"
 
@@ -55,6 +56,7 @@ class Pipeline:
                 assert not problem.record_episode, "record_episode must be False"
             elif isinstance(problem, FuncFit):
                 assert not problem.return_data, "return_data must be False"
+        self.save_path = save_path
 
     def setup(self, state=State()):
         print("initializing")
@@ -181,6 +183,17 @@ class Pipeline:
             self.best_fitness = fitnesses[max_idx]
             self.best_genome = pop[0][max_idx], pop[1][max_idx]
 
+        # save best if save path is not None
+        if self.save_path is not None:
+            best_genome = jax.device_get(self.best_genome)
+            with open(self.save_path, "wb") as f:
+                np.savez(
+                    f,
+                    nodes=best_genome[0],
+                    conns=best_genome[1],
+                    fitness=self.best_fitness,
+                )
+
         member_count = jax.device_get(self.algorithm.member_count(state))
         species_sizes = [int(i) for i in member_count if i > 0]
 
diff --git a/tensorneat/problem/rl_env/brax_env.py b/tensorneat/problem/rl_env/brax_env.py
index 7df8040..f3adb15 100644
--- a/tensorneat/problem/rl_env/brax_env.py
+++ b/tensorneat/problem/rl_env/brax_env.py
@@ -5,8 +5,10 @@ from .rl_jit import RLEnv
 
 
 class BraxEnv(RLEnv):
-    def __init__(self, max_step=1000, repeat_times=1, record_episode=False, env_name: str = "ant", backend: str = "generalized"):
-        super().__init__(max_step, repeat_times, record_episode)
+    def __init__(
+        self, env_name: str = "ant", backend: str = "generalized", *args, **kwargs
+    ):
+        super().__init__(*args, **kwargs)
         self.env = envs.create(env_name=env_name, backend=backend)
 
     def env_step(self, randkey, env_state, action):
diff --git a/tensorneat/problem/rl_env/gymnax_env.py b/tensorneat/problem/rl_env/gymnax_env.py
index af75d60..da15122 100644
--- a/tensorneat/problem/rl_env/gymnax_env.py
+++ b/tensorneat/problem/rl_env/gymnax_env.py
@@ -4,8 +4,8 @@ from .rl_jit import RLEnv
 
 
 class GymNaxEnv(RLEnv):
-    def __init__(self, env_name, max_step=1000, repeat_times=1, record_episode=False):
-        super().__init__(max_step, repeat_times, record_episode)
+    def __init__(self, env_name, *args, **kwargs):
+        super().__init__(*args, **kwargs)
         assert env_name in gymnax.registered_envs, f"Env {env_name} not registered"
         self.env, self.env_params = gymnax.make(env_name)
 
diff --git a/tensorneat/problem/rl_env/jumanji/jumanji_2048.py b/tensorneat/problem/rl_env/jumanji/jumanji_2048.py
index e9b7274..8d23fca 100644
--- a/tensorneat/problem/rl_env/jumanji/jumanji_2048.py
+++ b/tensorneat/problem/rl_env/jumanji/jumanji_2048.py
@@ -7,14 +7,21 @@ from ..rl_jit import RLEnv
 
 class Jumanji_2048(RLEnv):
     def __init__(
-        self, max_step=1000, repeat_times=1, record_episode=False, guarantee_invalid_action=True
+        self, guarantee_invalid_action=True, *args, **kwargs
     ):
-        super().__init__(max_step, repeat_times, record_episode)
+        super().__init__(*args, **kwargs)
         self.guarantee_invalid_action = guarantee_invalid_action
         self.env = jumanji.make("Game2048-v1")
 
     def env_step(self, randkey, env_state, action):
         action_mask = env_state["action_mask"]
+
+        ###################################################################
+
+        action = jnp.concatenate([action, jnp.full((4 - action.shape[0], ), -99999)])
+        action = (action - 1) / 15
+
+        ###################################################################
         if self.guarantee_invalid_action:
             score_with_mask = jnp.where(action_mask, action, -jnp.inf)
             action = jnp.argmax(score_with_mask)
diff --git a/tensorneat/problem/rl_env/rl_jit.py b/tensorneat/problem/rl_env/rl_jit.py
index 285a9a6..c5d54c8 100644
--- a/tensorneat/problem/rl_env/rl_jit.py
+++ b/tensorneat/problem/rl_env/rl_jit.py
@@ -11,11 +11,18 @@ from .. import BaseProblem
 class RLEnv(BaseProblem):
     jitable = True
 
-    def __init__(self, max_step=1000, repeat_times=1, record_episode=False):
+    def __init__(
+        self,
+        max_step=1000,
+        repeat_times=1,
+        record_episode=False,
+        action_policy: Callable = None,
+    ):
         super().__init__()
         self.max_step = max_step
         self.record_episode = record_episode
         self.repeat_times = repeat_times
+        self.action_policy = action_policy
 
     def evaluate(self, state: State, randkey, act_func: Callable, params):
         keys = jax.random.split(randkey, self.repeat_times)
@@ -63,7 +70,11 @@ class RLEnv(BaseProblem):
 
         def body_func(carry):
             obs, env_state, rng, done, tr, count, epis = carry  # tr -> total reward
-            action = act_func(state, params, obs)
+            if self.action_policy is not None:
+                forward_func = lambda obs: act_func(state, params, obs)
+                action = self.action_policy(forward_func, obs)
+            else:
+                action = act_func(state, params, obs)
             next_obs, next_env_state, reward, done, _ = self.step(
                 rng, env_state, action
             )
diff --git a/tensorneat/test/test_efficient_b_spline.ipynb b/tensorneat/test/test_efficient_b_spline.ipynb
new file mode 100644
index 0000000..5c62a9c
--- /dev/null
+++ b/tensorneat/test/test_efficient_b_spline.ipynb
@@ -0,0 +1,283 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "initial_id",
+   "metadata": {
+    "collapsed": true,
+    "ExecuteTime": {
+     "end_time": "2024-06-03T03:34:30.787475200Z",
+     "start_time": "2024-06-03T03:34:28.159120700Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from algorithm.neat.gene.conn.bspline import BSplineConn\n",
+    "from algorithm.neat.gene.conn.cache_bspline import CacheBSplineConn\n",
+    "import jax, jax.numpy as jnp"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "outputs": [],
+   "source": [
+    "normal_gene = BSplineConn(grid_cnt=6, spline_order=3, grid_init_range=[-1, 1])\n",
+    "cache_gene = CacheBSplineConn(grid_cnt=6, spline_order=3, grid_range=[-1, 1], cache_num=1000000)"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2024-06-03T03:41:35.807924900Z",
+     "start_time": "2024-06-03T03:41:35.793415500Z"
+    }
+   },
+   "id": "a09d6ccf956606b3"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "State ({'kan_initial_grids': Array([-1.        , -0.6       , -0.20000002,  0.20000005,  0.6       ,\n        1.        ], dtype=float32), 'bspline_cache': Array([[0.16666666, 0.6666666 , 0.16666667, ..., 0.        , 0.        ,\n        0.        ],\n       [0.16666412, 0.6666665 , 0.1666692 , ..., 0.        , 0.        ,\n        0.        ],\n       [0.16666159, 0.66666657, 0.16667175, ..., 0.        , 0.        ,\n        0.        ],\n       ...,\n       [0.        , 0.        , 0.        , ..., 0.1666717 , 0.6666666 ,\n        0.16666172],\n       [0.        , 0.        , 0.        , ..., 0.16666915, 0.6666666 ,\n        0.16666426],\n       [0.        , 0.        , 0.        , ..., 0.16666675, 0.6666665 ,\n        0.16666663]], dtype=float32)})"
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "state = normal_gene.setup()\n",
+    "state = cache_gene.setup(state)\n",
+    "state"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2024-06-03T03:41:37.462678200Z",
+     "start_time": "2024-06-03T03:41:36.459771900Z"
+    }
+   },
+   "id": "57fbeab2e4d4c511"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "Array([ 0.08086783, -0.38624713, -0.37565565,  1.6689739 , -1.2758198 ,\n        2.1192005 , -0.85821223,  1.1305932 ], dtype=float32)"
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "key = jax.random.PRNGKey(0)\n",
+    "normal_attrs = normal_gene.new_random_attrs(state, key)\n",
+    "normal_attrs\n",
+    "weights = normal_attrs[normal_gene.grid_cnt:]\n",
+    "weights"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2024-06-03T03:41:38.867812Z",
+     "start_time": "2024-06-03T03:41:38.789154200Z"
+    }
+   },
+   "id": "9d9cacf5af5f38c3"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "(Array(-0.0304966, dtype=float32), Array(-0.03049916, dtype=float32))"
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "t = 0.99999\n",
+    "normal_res = normal_gene.forward(state, normal_attrs, t)\n",
+    "cache_res = cache_gene.forward(state, weights, t)\n",
+    "normal_res, cache_res"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2024-06-03T03:43:23.083384100Z",
+     "start_time": "2024-06-03T03:43:23.002384100Z"
+    }
+   },
+   "id": "9177b012b7ab25cd"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "outputs": [],
+   "source": [
+    "batch = 100000\n",
+    "t = jnp.linspace(-1, 1, batch)\n",
+    "batch_normal_forward = jax.jit(jax.vmap(normal_gene.forward, in_axes=(None, None, 0)))\n",
+    "batch_cache_forward = jax.jit(jax.vmap(cache_gene.forward, in_axes=(None, None, 0)))"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2024-06-03T03:50:05.806318700Z",
+     "start_time": "2024-06-03T03:50:05.785312900Z"
+    }
+   },
+   "id": "878f5eda35df17f7"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1.34 ms ± 99.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%timeit batch_normal_forward(state, normal_attrs, t).block_until_ready()"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2024-06-03T03:52:23.070679300Z",
+     "start_time": "2024-06-03T03:52:21.768451Z"
+    }
+   },
+   "id": "2f649d81e1945757"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "134 µs ± 28.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%timeit batch_cache_forward(state, weights, t).block_until_ready()"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2024-06-03T03:52:43.066619300Z",
+     "start_time": "2024-06-03T03:52:42.599706100Z"
+    }
+   },
+   "id": "cac106d6792ea53b"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 57,
+   "outputs": [],
+   "source": [
+    "normal_res = batch_normal_forward(state, normal_attrs, t).block_until_ready()\n",
+    "cache_res = batch_cache_forward(state, weights, t).block_until_ready()"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2024-06-03T03:53:15.153532800Z",
+     "start_time": "2024-06-03T03:53:15.145531800Z"
+    }
+   },
+   "id": "27bc7e5abf9b65d4"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "1b25ee5a2fe153c4"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "[<matplotlib.lines.Line2D at 0x7f5b32908a30>]"
+     },
+     "execution_count": 58,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "data": {
+      "text/plain": "<Figure size 640x480 with 1 Axes>",
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAi8AAAGdCAYAAADaPpOnAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAA9hAAAPYQGoP6dpAABr50lEQVR4nO3deVxU9f7H8dfMsCubsiuCCKi4gCvikprkklm22qqZ1W1fvHXL9lvdrF+2X8uyLLs306w0KzPLpVzIBUVcAEVBcAFFlF22Ob8/TnKjUAGZ+c4Mn+fjMY97G87MvMcReHs+33OOQdM0DSGEEEIIO2FUHUAIIYQQoimkvAghhBDCrkh5EUIIIYRdkfIihBBCCLsi5UUIIYQQdkXKixBCCCHsipQXIYQQQtgVKS9CCCGEsCtOqgO0NLPZzJEjR/D09MRgMKiOI4QQQohG0DSNkpISQkJCMBrPvW/F4crLkSNHCA0NVR1DCCGEEM2Qm5tLx44dz7mNw5UXT09PQH/zXl5eitMIIYQQojGKi4sJDQ2t+z1+Lg5XXs6Miry8vKS8CCGEEHamMUs+ZMGuEEIIIeyKlBchhBBC2BUpL0IIIYSwK1JehBBCCGFXpLwIIYQQwq5IeRFCCCGEXZHyIoQQQgi7IuVFCCGEEHZFyosQQggh7IqUFyGEEELYFSkvQgghhLArUl6EEEIIYVcc7sKMQgghWifNrLHzgyROfrsejh9H8/HFa/Qget93EU5u8uvOkcinKYQQwu5tfX45vv96hN5VafW/8BMcfjyU/bc8x9APp2I0nf+KxcL2SXkRQghht2ora1iX8A9GbH8DgBLasqvDWCqDwnAuOEK3nJV0qM2lwyfTSF62kM6bFtIusp3i1OJCyZoXIYQQdqnmdA2butxYV1x+6fMQWs4hEg4tZsTWWQzJXkCbwkP8ctmrlONOv8KfOBWTwJGkg4qTiwsl5UUIIYTd0cwav/W+k8GHF1OFMxunL2b4tjfwCvWut52bjxvDv32EQ19t5rCpExHVe6kankh+ar6i5KIlSHkRQghhd365bjZD931MDSa2/WMRg1+75pzbR1/VE0PSRnKdwgmvzqRw8Hgqi05bKa1oaVJehBBC2JU987cw+KvpAGyYOItBr1zZqMeFDOiAecVPnDC0p3tZMr/FP2DJmMKCpLwIIYSwG5UlVTj/bSouVJMUcjUXffVgkx4fNiqSrBcXYMbA8Iy5JD240EJJhSVJeRFCCGE3kia+QlTlbgoM/nT75X0MxqYf+tz/idH8MuxpAKLfuY9ju461dExhYVJehBBC2IWDq/eTsPpFAPbe+xa+ke2b/VxDfniKDLdY2msn2D/uvpaKKKxEyosQQgi7cOjWJ3GliuR2l5Dw1vUX9FwubZzhY33Bb8KhxaS8tqqFUgprkPIihBDC5u2Zv4UhuYswY8Dr/VebNS76s67X92F9r3sAaPPkQ9Scrrng5xTWYdHy8uuvvzJhwgRCQkIwGAwsXbr0vI9Zu3Ytffv2xdXVlcjISD755BNLRhRCCGEHKh9+HICNETcTdU1siz1v7JLnOGnwJapyFxtv+7DFnldYlkXLS1lZGbGxscyePbtR22dlZTF+/HhGjhxJSkoKDz30ELfffjs//vijJWMKIYSwYSnvbqTPydVU4UzY/Bda9Ll9u7Qj9ap/AhC96HlOn6xo0ecXlmHQNE2zygsZDCxZsoSJEyeedZvHHnuM77//nl27dtXdd/3113Pq1ClWrFjRqNcpLi7G29uboqIivLy8LjS2EEIIxTYHTmDgse9Y33UaQ9Nbfu9IZUkVx32j6Fibwy8T32D4koda/DXE+TXl97dNrXlJSkoiMTGx3n1jxowhKSnprI+prKykuLi43k0IIYRj2Pf1TgYe+w4zBjq+/Q+LvIarpwsHbngKgO7LXqbseLlFXke0HJsqL3l5eQQGBta7LzAwkOLiYioqGt6VN3PmTLy9vet
uoaGh1ogqhBDCCo4/8goAv3W8hvDR0RZ7nYT3byXXKZwAcz7Jd8yx2OuIlmFT5aU5ZsyYQVFRUd0tNzdXdSQhhBAtIG/7UQZkLQLA+1+PWfS1nD2cOTDpCQC6fPcmNRXVFn09cWFsqrwEBQWRn1//Sp/5+fl4eXnh7u7e4GNcXV3x8vKqdxNCCGH/0v8+F2dqSPUcTI/J/Sz+egPfuYXjhgA61Oay+R9fWvz1RPPZVHlJSEhg1ar6Jwr66aefSEhIUJRICCGECjUV1XT75X0ASm651yqv6e7rxq4R+tl2fT+ahWa2yvEsohksWl5KS0tJSUkhJSUF0A+FTklJIScnB9BHPpMnT67b/q677uLAgQP84x//ID09nXfffZcvvviChx9+2JIxhRBC2Jhtz31DkPkIxwwB9J95tdVet/d7d1OOO90rtrH9zV+s9rqiaSxaXrZu3UqfPn3o06cPANOnT6dPnz4888wzABw9erSuyAB07tyZ77//np9++onY2Fhee+01PvzwQ8aMGWPJmEIIIWyM84f6otmdg+7A1cvVaq/bvqsfW2OmAFD12ttWe13RNFY7z4u1yHlehBDCvuWsO0jHizpjRCPnlyw6XRRu1dfft3Q3UVf2pAYTBck5BPUNserrt1Z2e54XIYQQIvPZ/2BEI8V3pNWLC0DUxB7s8BqKE7WkPTrP6q8vzk/KixBCCJuhmTUi1s8HoOy6W5XlKLvpbwBE//IBNZW1ynKIhkl5EUIIYTN2fbCR8OpMSmhL3PPWW6j7Z/1mXkOhoR0danPZ+mLjLk8jrEfKixBCCJtR9NYnAKR0uYY2AW2U5XD1dmNX/6kAGD+QM+7aGikvQgghbEJl0Wl6p38BgMc9t6oNA3T4550A9Du2nOM78xSnEX8k5UUIIYRN2DbzR7wo5oipI3H3D1Mdhy7jotnRdjAmzOx5aoHqOOIPpLwIIYSwCbWf63tdMuOuxeRsG7+eTl12CwBBKz9VnET8kW387RBCCNGqlZ+oIDZnGQD+916nOM3/9H7xOipxoevpHWQsTlUdR/xOyosQQgjlUv9vBZ6UcsjUiW5T4lXHqePbpR3bQy4D4Mgr/1GcRpwh5UUIIYRy5oW/j4z6XIfBaFCcpj7jrfo1+Lpv/0zO+WIjpLwIIYRQ6nRhOb1zvgWg/d22MzI6o88T4zhhaE+Q+Sipb6xSHUcg5UUIIYRiO19ZTlvKyDGF02NKf9Vx/sK5jQs7u+ulqvzjRYrTCJDyIoQQQrHKxd8AsD/2aowm2xoZneE1TS8vPfYtobqsSnEaIeVFCCGEMlXlNfTI/h6A9rddoTjN2fW+dxjHjIH4aifZ8bqMjlST8iKEEEKZnXM24Kud5IShPT3vSFAd56ycXE2kxVwDQMWnXyhOI6S8CCGEUKbkv/rIKK3LZRhdnBSnOTefO/XRUe/MJVSWyOhIJSkvQgghlNDMGp136iemc77qcsVpzq/XXUPIMwbjTREpr/6kOk6rJuVFCCGEEge+TyOsZj+VuNDj4dGq45yX0dnE3l5XA1D12WLFaVo3KS9CCCGUOPSuvtdlh98o2ga1VZymcbxv/310lLVUjjpSSMqLEEIIJdpv0MtLeaLtHmX0Zz3vHMwxQwDeWhG7Zv+iOk6rJeVFCCGE1RWmHyOm5DcAIh+6THGaxjO5mEiPmgD8b7GxsD4pL0IIIawu7a2VGNFIc+tDx/gOquM0idskfU9Rlz3L0Mya4jStk5QXIYQQVmf+YQUA+X3HKk7SdL0eSqQMDzrU5pL2+XbVcVolKS9CCCGsqrbaTLeclQD43ThGcZqmc2/nzq4QPXfeHBkdqSDlRQghhFWlLdiOv3acEtrSbartnlX3XGov00dHIVulvKhg26czFEI0iabBkQ1ZHFm7l5rCYtw7tKPj+Fj8uvmpjiZEnWP/+RGAtOBRDPRwUZymeWIeHU/tB0a6nd5B1ppsOo8MVx2pVZHyIoQDOJl5gu13vU+XXz4irOYA9ZY
/PgK72sRz6uZ7GfTWjTi5mlTFFAKAdlv08lI50v5GRmf4RPqR6jOU3qd+JfvtZXQe+YDqSK2KjI2EsGNarZm1178H0VFcvOpJwmoOUI0Te916scNrKFnOUQD0LNvE0Pcns893AGkLZIGhUKf4UDE9ijcCEH6n/ZYXgJPD9dGR9y8yOrI2KS9C2KnC/SfZGjyBEYvuwVc7SYZbb9b/7VOq8k4SXZFKbNE6Olft5diOo6y95F+cMvjQvWI74TcNZv2dn6qOL1qptNmrcaaGLOcoQodHqI5zQcLu18tL75O/UHzwpOI0rYuUFyHs0OGkHIq7D2TA8eVU4MYvV79F5Klkhs65hTaB9U+zHtA7iBErn6B6ZwZbAsbjzmmGzp3CL9e8oyi9aM2qlumHSOd0s++9LgDho7qwzyUGJ2pJe1su1GhNUl6EsDMH1xzAPOwiwqszyTWFkbtwI8O/fACT67mXsPn3CKDf4WX8MuARAIZ/9QC/Tv7QGpGF0Gka4Xv19S5uE+3v/C4Nye05DoCaZcsVJ2ldpLwIYUeO78rHOHoUobUHyXKJxrRxPdGT+jT68UYnIxf99n+sGfQ4AAn/uZuUN9daKK0Q9eWs2kdoTbZ+Fel7R6iO0yK8Jl0KQNSBFZhrzIrTtB5SXoSwE+UnKsiLv4LQmmyynSLx2LSWkIEdm/w8BqOBERteYkPYDThTQ8fp15K/85gFEgtRX/aH+mhll89Q2ga2UZymZfS6eygltCXAnE/GIlkMby1SXoSwA5pZY8eA2+lVvomTBl8My78nMC642c9nMBrou+0j9rr1xk8rIGP0fWhyiRZhYU7r1gBQMmCU4iQtx9XThd1BiQDkzftBcZrWQ8qLEHbg16kfk5C1gBpMZM36mrBLoi/4Od3bueP030+owcRFeYtZeefiFkgqRMNqq810O6qXl4BJIxWnaVlVl+ijo/abZd2LtUh5EcLGHfg+jf6f3g/A+rEv0nf6iBZ77oir+5A8+gkA+n50L8d2H2+x5xbijzIWp9JOK6SEtkTf2F91nBYVdb++aLdn6W8UZJxQnKZ1kPIihA2rrayh6rqbaUM529onctG3/2jx1+i/9Cky3Xvirx0nc8JDLf78QgDkL9T3umT4D8PJ3VlxmpYVPKAje916YURjz5srVcdpFSxeXmbPnk14eDhubm7Ex8ezefPmc27/5ptv0rVrV9zd3QkNDeXhhx/m9OnTlo4phE1ad9UbdCvfxkl8CfrxU4xOLf8ta3J3ofK9jzFjYHDWAnbMPff3qBDN4bFpNQBl8RcrTmIZR+P0vS/GFTI6sgaLlpdFixYxffp0nn32WbZt20ZsbCxjxozh2LGGj2xYsGABjz/+OM8++yxpaWl89NFHLFq0iCeeeMKSMYWwSdmr9jNw+bMA7Jr6GiH9mr9A93x6TOlPUtRkALRHHkEzy+pd0XJqTtfQ7divAATe4Jjlxft6fd1Lt4MrMFfXKk7j+CxaXl5//XXuuOMOpk6dSkxMDHPmzMHDw4N58+Y1uP3GjRsZMmQIN954I+Hh4YwePZobbrjhvHtrhHA4mkbh9XfjQQXb213M0A9vtfhLRnz2IhW4EVe8juRnl1n89UTrkb5gG94Uc8rgQ/S1sarjWETM7YMpwgs/rYB9X8gh05ZmsfJSVVVFcnIyiYmJ/3sxo5HExESSkpIafMzgwYNJTk6uKysHDhxg+fLlXHrppWd9ncrKSoqLi+vdhLB3W575lr4FP1GJCz6fz8FgNFj8NYMHdGRj/MMAeM16Rk64JVrM8cX6epf0oBEYnR3zquYubZxJC9SPosr7r1wqwNIsVl4KCgqora0lMDCw3v2BgYHk5eU1+Jgbb7yR559/nqFDh+Ls7EyXLl0YMWLEOcdGM2fOxNvbu+4WGhraou9DCGurLKnC7xX9FP4b4x+m8+goq712n/8+QjGeRJ9O5benvrPa6wrH5rlFX+9SmeBYh0j/2elhlwDg/Zss2rU0mzraaO3atbz
00ku8++67bNu2ja+//prvv/+eF1544ayPmTFjBkVFRXW33NxcKyYWouVtuOU9Olfv47gxgP5fWXe9V7vIdqQMvhcAn3dekLUv4oJVlVbR/cR6AIJvcsz1Lmd0un00ADGnNlB+vExxGsdmsfLi5+eHyWQiPz+/3v35+fkEBQU1+Jinn36aW265hdtvv51evXpx5ZVX8tJLLzFz5kzM5oZ3Ybu6uuLl5VXvJoS9Kj54kj7L/gnA3ptfwLOD9f8+d//gYcpxJ6Z8K6mv/mj11xeOJf3TzbShnOMGfyKv6KE6jkV1viSSXFMYLlSzZ86vquM4NIuVFxcXF/r168eqVavq7jObzaxatYqEhIQGH1NeXo7RWD+SyaTPRzU5d7loBbbf8jq+2kn2uvYk/oNpSjL49wjgt95/A8D8f7OUZBCOo/Arfb3Lvg4jMZosv3ZLJYPRQFakPjoqXSrrXizJomOj6dOnM3fuXObPn09aWhp33303ZWVlTJ06FYDJkyczY8aMuu0nTJjAe++9x8KFC8nKyuKnn37i6aefZsKECXUlRghHdWLvCfquexOAgvufx8lV3d/5yH8/RC1G+hSuIv3LXcpyCPvnvU1f71I11LFHRmc4X6qPjjrslnUvluRkySefNGkSx48f55lnniEvL4+4uDhWrFhRt4g3Jyen3p6Wp556CoPBwFNPPcXhw4fx9/dnwoQJ/Otf/7JkTCFsQsotsxhFKenufRj08kSlWToNC2NTxyuJP/QV+U++TbdrPlCaR9in06dO0/2UfnRp6GTHXqx7Rrd7Lsb8hoGoyt0cTT5CcL8Q1ZEckkFzsHlMcXEx3t7eFBUVyfoXYTcK0o7jFtOZtpSR/Owy+j03QXUkdr23jp73XEQFblRk5NIu2k91JGFnUt7+lbgHh5NnDCaw+rBVDvm3BXvaDiCmbCvrbp/PsLmTVcexG035/W1TRxsJ0Vrtuf012lLGbo/+9H3mMtVxAOjxt6GkuffBndPsemiu6jjCDp1ctg6AAx2GtZriAnA8Vl/3ws+y7sVSpLwIoVhxbhFxG98DoOiBZ2zmh7zBaODY9Q8CEPHT+3LSOtFknil6eamOH6Y4iXX5TtLXvXQ9+BPmWocabtgMKS9CKLbtrg/wophMlxgGvTBedZx6+v/fdZzCh441B0l+5WfVcYQdqamsJfrERgCCrhmqOI11db8tgTI8CNDyyfhyp+o4DknKixAKnS6uInrFWwDk3fyIRa4afSHa+LmzM/ZmAKreldGRaLy9X6biRQlFeBF5ZS/VcazKua0raf7DATj2mYyOLMG2flIK0cokPfg5IebD5BuDGfjmjarjNCj4mTsAGHDkG47vbviK8EL82bGv9JHRXv8hmFxa36kuSgbro6O2SVJeLEHKixCKaGaNjp+/CsDecQ/i4umqOFHDIq/qza42A/Wzhj7+qeo4wk64btbLS1mf1rXe5Yzgm0YB0K1gHdVlVYrTOB4pL0IosuOVFURV7qaEtvSe/TfVcc6p4IrbAei08kO53pE4L82sEXFUv55Ru8tb13qXM6Kv7MFxgz9tKCfjv1tUx3E4Ul6EUKT2jbcB2BJ7B95hPmrDnEfszOspw4POVRmk/3er6jjCxmWv2k+gOY9KXOh68wDVcZQwOhnZFzICgBOLV6sN44CkvAihwMFVmfQ7vgIzBsJfvVd1nPPy7eRJSqcrAMh77b+K0whbd+hzfWSU7jUQV283xWnUqR6mXxLBK3mN4iSOR8qLEAoceGwOAMl+Y4m4pIviNI3jdrt+1FHPnZ9TWVqtOI2waev08nKyR+tc73LGmUsidD+1kYqTpxWncSxSXoSwspL8cuKS5wFguPcexWkaL+7RSygw+uOvHSdllpzzRZxdx4P6epe2Y1vnepczOo+J5qgxBDcqSZuXpDqOQ5HyIoSVJf9jEb6c5JBTOH2fHKc6TqOZ3JzZ0/t6AGo+kdGRaFj+jjw6V+/DjIGoKYNVx1HKYDRwoJO+96V
oiax7aUlSXoSwJk0j4MvZAGSNuQujs32d/6Ld/froqM/BJZTnlyhOI2zR/k/1vS773Hrb/EJ0a9BG6ute2qdKeWlJUl6EsKL0/2whpjyZ07gS89o01XGarMetA8hyisKDCna+sFR1HGGDqlbp613yolr3epczOt/2+7qXks2UHC1VnMZxSHkRwopOzHwfgC1h19K+q5/iNE1nMBo4EH8DAMavFitOI2xR4F69vLhc3LrXu5zRYWhncp3CcaaGtLnrVcdxGFJehLCS0rxSYtMXAeDx0J2K0zRfx4evBaB33o8U5RYrTiNsSVFuMdEVOwDoPFn2vJxxMELf+1L6rRwy3VKkvAhhJdufWExbysh2jqTP/fb7r9LoK3twwKUrrlSx48VvVccRNmTfp0mYMHPQKYKgviGq49gMU6K+7iVwt6x7aSlSXoSwkrZffgzAwYtvw2gyKE7TfAajgcMJ+t4Xp2++VJxG2JKSHzcCkNtpiOIktqXL7fqel24V2zix/5TaMA5CyosQVrD3u730KVlHLUZiXp6sOs4F6/DgNQD0zf+BokNy1JHQee3Wz2VSO7B1HyL9ZwF9OpDlEo0JM3s//FV1HIcg5UUIK8h98RMAtgeOxT+ug9owLSBiYm+ynaNwo5Jdr3yvOo6wAbVVtUQV/gZA4MQExWlsz6FIfe9L5QpZ99ISpLwIYWE1lbXEbJkPgHnKbYrTtBCDgez++t4X0xIZHQnY/+0evCihhLZETuypOo7NMV2ir3sJTpd1Ly1ByosQFrbj1ZUEm49QYPAj7ukJquO0GP97fj/q6PByTp8oU5xGqJa3RB8Z7fMdiJOrfZ180Roibx8BQNfTqZzad1xtGAcg5UUIC6v5QL+O0c5eN+HS1kVxmpYTc2Mc2aYueFDBrlkrVMcRihk26eWlOEZGRg0J6BnAPpcYAPbOk/O9XCgpL0JYUHHOKeJylwHg9+hUxWlalsFo4ECvKwCoXLxMcRqhWoccvby0SZTycjaHuwwH4PSPvyhOYv+kvAhhQanPfoUrVex16UnPm2JVx2lxvlMuB6Dbge+pOV2jOI1QpXDfCSKqMgDoctMgxWlsl/OoiwAISJcjji6UlBchLMhj6WcAHB5xEwb7PbXLWfW6awiFhna0106QOmej6jhCkf2f6UcZZTlH0y6qveI0titiqr7nJboihaKDp9SGsXNSXoSwkEObDhN3ai0A0c/eoDaMhTi5OZEeMR6Ak5/K6Ki1KvtZHxkd6iTndzmX4L7BZDlHYURj78cbVMexa1JehLCQ9OcWYkQj1XsoHQaHqY5jMU5X66OjLju/QTNritMIFbz26OVFGyTrXc4nt7M+OipbLuteLoSUFyEsJGStPjIqufwmxUksK+ahMVTiQnhNJgeWp6uOI6ystqqWqJObAQi6UsrL+RhH6KMjvz2y7uVCSHkRwgL2LUsj5vR2qnGi53PXqo5jUW2DPUn100/AdehdGR21NplLd+FJKcV40mVCjOo4Ni98sr7npVvZVkqOlipOY7+kvAhhAYf/T9/rsi1wHN4Rjr+AsexifXTUboOUl9Ymf+nvJ6drF4/JRU5Odz4dh4RxyBSGE7VkfJKkOo7dkvIiRAvTzBpdNi0AoObaGxWnsY6Ih/Ty0qM4iVN7jylOI6zJtEk/ykxOTtd4B8P0vS8l38m6l+aS8iJEC9v76W+E1mRRQltin75cdRyr6JTQgTS3OIxopL/1o+o4woo65Op7D9peIuWlsbSL9HUvvrtk3UtzSXkRooUd+/ciALZ3uoK2AR6K01jP0T6XAmD+/gfFSYS1nEg/Tnh1JgBRt8jJ6Rqr0836npfuxZsoK6hQnMY+SXkRogVptWYiU/SrLDvdcJ3iNNbld/M4ALrl/EhNZa3iNMIa9i/QT06336UbPp19FaexH6EjI8k3BuNKFWnzN6uOY5ekvAjRgnZ99BvBtYcpxpM+j41WHceqYm4bxCmDD+20QnZ/LD+QW4PyVfrI6EiYjIy
awmA0kBWq730pWibrXprD4uVl9uzZhIeH4+bmRnx8PJs3n/uH2qlTp7j33nsJDg7G1dWV6Oholi9fbumYQrSIE+8tBmBn+OW4+7opTmNdTm5OpIfqha3gPzI6ag28605OJ2fWbaraofq6F+9UWffSHBYtL4sWLWL69Ok8++yzbNu2jdjYWMaMGcOxYw0fjVBVVcUll1xCdnY2X375JRkZGcydO5cOHTpYMqYQLaK22kx0qj4ycrnJsc/tcjbaWH10FLRdyoujqzldQ/Qp/R+jwVfJnpem6nijvucl5tRGKoqqFKexPwZN0yx2Pu/4+HgGDBjAv//9bwDMZjOhoaHcf//9PP7443/Zfs6cObz66qukp6fj7OzcrNcsLi7G29uboqIivLy8Lii/EE2x7d3f6HtvAsV44lZ0DBev1rXnBaBgVx5+vYIBOJaaR0CvQMWJhKVkLNxO1xv6cgpvvKoLMTrJKoSm0Mwahc4BtDcXsO3fG+l7rxTApvz+ttjftqqqKpKTk0lMTPzfixmNJCYmkpTU8Il5li1bRkJCAvfeey+BgYH07NmTl156idrasy/+q6yspLi4uN5NCBVOzdVHRrsjJrTK4gLg1zOINI++AOx9Rw6ZdmT5yzYBcKDdACkuzWAwGtgfMgyAk0tl3UtTWexvXEFBAbW1tQQG1v+XV2BgIHl5eQ0+5sCBA3z55ZfU1tayfPlynn76aV577TVefPHFs77OzJkz8fb2rruFhoa26PsQojHMtRrRO/WRkfMNrXNkdEZ+H310ZPpR1qo5MsMWvbwUd49XnMR+VQ3S17203S7rXprKpuqy2WwmICCADz74gH79+jFp0iSefPJJ5syZc9bHzJgxg6Kiorpbbm6uFRMLodvzyWY61uZQQlt6/n2M6jhK+d74+yHTuSsxV9UoTiMsJThXX+/iMVLKS3MFXff7+V5OrKe6Qr5XmsJi5cXPzw+TyUR+fn69+/Pz8wkKCmrwMcHBwURHR2My/e/6GN27dycvL4+qqoYXNLm6uuLl5VXvJoS1nZijj4xSO03AzdddcRq1YqbGcxJffLWTZPxHDpl2RMWHiomoTAOg83UDFKexXxETe1Nk8MaLEjK+2KE6jl2xWHlxcXGhX79+rFq1qu4+s9nMqlWrSEhoeGHSkCFDyMzMxGw21923d+9egoODcXFxsVRUIS6IZtbo8vuJ6QzXte6REYCzuxO7O1wCwPHPVipOIyxh/6KtGNE4ZOqEf6+G/zEqzs/obGJvwFAAjn8p616awqJjo+nTpzN37lzmz59PWload999N2VlZUydOhWAyZMnM2PGjLrt7777bgoLC3nwwQfZu3cv33//PS+99BL33nuvJWMKcUEyP99Cx5qDlNCW3v8YqzqOTagZoZcX3+SfFCcRllD0k75HLTdYRkYXqry/vu7Ffause2kKJ0s++aRJkzh+/DjPPPMMeXl5xMXFsWLFirpFvDk5ORiN/+tPoaGh/Pjjjzz88MP07t2bDh068OCDD/LYY49ZMqYQFyTv3a+JAlI6jGeYf+seGZ3R5a5L4DP92i3FuUV4hXqrjiRakFuqXl6q4gYqTmL//K66CL6H6Px1mGvMcuRWI1n0PC8qyHlehLVluXajc1UGv96zkItmT1Idx2ZkuXSlc/VeNj2+hPiZE1XHES3oqKkDweYj7Hj7F2Lvv0h1HLtWXV5NZRtf2lLGvq9Sibqql+pIytjEeV6EaA2yfkinc1UGVTjT6x/jVMexKbnd9NHR6W9ldORIjm49TLD5CLUYiZzUT3Ucu+fs4UxGO/3yCnmLZN1LY0l5EeICHHxrKQA72o/CN0z29P2RxxV6eemUsRLH2r/buh1crI+MMt160iagjeI0jqEoTl/34pwk5aWxpLwIcQHar18KQMXYK9UGsUHd7xlJDSY612SStSZbdRzRQk7/op+cLj9cFuu2FJ/L9dFbxOF1aGZp+o0h5UWIZjq85Qi9yjZhxkD3xy5XHcfmtAn
2Is17EAA5H8noyFF4Z+h7XgwDZbFuS+l2ywBO40qAOZ9Da/apjmMXpLwI0UwZry4DYI/nIDnXxVmc7KePjlx/kfO9OILaqloiT20BIOhyKS8txaOdG2me+p6s3M/kkOnGkPIiRDO1+XkpACeHT1Saw5a1u340AN2OrKK26uwXWBX2IeuHdDwppZQ2REzooTqOQyns+ftRW+ukvDSGlBchmqEwq4g+J1cDEDF9otowNqzbLQM4hTe+2kn2fp6sOo64QEe/+X2xrnc/TC6m82wtmqLtpXp56ZQt5aUxpLwI0Qy7X12OC9Xsd+1Oh5HRquPYLCc3J9KDRgJw/HNZ92LvtE36Yt2iaBkZtbSutyZQg4mONQc5tuWg6jg2T8qLEM1g+nYpAIf6TVSawx5UDNNHR96bpbzYO/8sfc+LyzA50qil+XRsS5qHft6crE/XKU5j+6S8CNFElcWV9Dq0HAD/OyaqDWMHOt6aCEC3k0lUnixXnEY0V0VhBVEVqQB0ukb2vFhCfld9dFSzRkZH5yPlRYgm2v3Oajwp5agxhG4391cdx+ZFjo3ksLEjrlSRPm+j6jiimTK/2IYTteQbgwiJD1UdxyG5XaKXl5BMKS/nI+VFiCYqX7AUgIxuV8hF1BrBYDRwIOxiAE59vVpxGtFcJ1boI6PsgIEYjAbFaRxT1NShmDHQuTKDor35quPYNPnJK0QTaLVmotO/AcBt0kS1YezJxXp5aZ8q5cVeuWzXy0tFTxkZWUpgN18yXPQLM+7/RNa9nIuUFyGaYO/CZALM+RTjSeyDI1THsRsR0/QjjrqXbqE4t0hxGtEcHY/oRxp5JspiXUs60kUfHVWslNHRuUh5EaIJ8j78HoBdwaNx93ZRnMZ+dEjoRLZzJCbMpM2Vf1HamxPpx+lUkwVA5PWyzsuSjCP18hKQJuXlXKS8CNEEfpv1o4xqx1yqOIn9yYnUR0cV38noyN4cWKRfEmC/Sze8w3zUhnFwEZOHAdClPJWKIycVp7FdUl6EaKSjKfn0KNd/iHd7eJziNPbHZYxeXoLSpLzYm7LV+sjoaEdZ72JpnQYGsd8UjRGNzPkbVMexWVJehGik9DdX6P/bpi/+vYMVp7E/0XeOAKDb6R3k7y5QG0Y0Sds9+mLd2n5SXizNYIDsMH10VPK9jI7ORsqLEI1kWqmPjE4MlJFRc7TrHsg+t54A7P1grdowotE0s0bECb28+I2XxbrWoA3Vy4vvTikvZyPlRYhGKCuqoffRHwEIniblpbnyYvTRUc1KGR3Zi5w1+2mnFVKJC5FX9VYdp1XoeKNeXiKLk6k5Vao4jW2S8iJEI2z790Z8KKLQ2J7Ok2TXeXN5XKofMt1pv5QXe3F4ib7XZV+bPrh6yhF21hB9SRi5xk44U8P+z35THccmSXkRohFKFukjowNRYzE4mRSnsV9Rtw/HjIEu1Rkc2XJYdRzRCNUb9MW6BZEyMrIWoxEyg/W9L4VLZXTUECkvQpxHbS2E79HLi/vVMjK6EF5hvmR49AUga94axWlEY7TP1Pe8OCXIHkdrqhqklxfP7VJeGiLlRYjzSFmWQ0ztTmoxEn3/GNVx7F5+T33di7ZKRke2rrqsisjS7QB0vFr2vFhT0HW/r3s58RvmikrFaWyPlBchzuPQ3B8AyGw/COeg9orT2L82l+nlpXPWKtA0xWnEuWR+nYoblZw0+BJ2cRfVcVqVmInRHCMANyo5+OUW1XFsjpQXIc7De6M+MiofISOjltB12lCqcaJDTQ5HNmSpjiPO4fj3+shofzu5krS1ObsYSAvQ974c+1JGR38m5UWIcziadZoBRT8DEHbPeMVpHINXSFv2tNVHEAc/ltGRLTNu1ctLSYyMjFQo76eXF7fNUl7+TMqLEOew451faUM5x52DaTcyVnUch3Gi53D9//zyi9og4pyCc/UjjdqMlMW6KrS/Ui8vXfI3oFXXKE5jW6S8CHEONcv0kVFuz0v183aLFtH2shEAhGX
/IutebFRRThFdqtIBCL9OyosKPa/vyUl8aKuVcmR5iuo4NkXKixBnUVUF3Q98D4Dn9TIyakndpw2mGidCanM5vCFbdRzRgDNXks5x6kxAD3/FaVonD08Tu32GAnBk0TrFaWyLlBchzmLbon100TKpwpkuf0tUHceheAa1Ib1tfwCy58voyBYV/ayXl0PBAxQnad2KYvXRkdNGWffyR1JehDiLvI/1kdG+4IswensqTuN4CnqOAEBbu1ZpDtEwt536Yt3KWFmsq5L3BL28hOeuA7NZcRrbIeVFiLPw36yPjKoT5RBpS/Acry/aDcuWPS+2KCxfLy/txsp6F5V63NKXMjzwNZ+gYF2a6jg2Q8qLEA3I3lVK/zL9l2rEfVJeLKHrbUOowURoTTaHNuaojiP+4OjWwwSbj1CDichr+6iO06r5BjiT2nYwADn/ldHRGVJehGjA7ndW40oVh90i8BrQVXUch+QZ4klGm9+vc/SJ7H2xJTlf6ntdMt160iagjeI04kSMPjrSfpXycoaUFyEaYPxBHxnl9ZFDpC1J1r3Ypopf9fJyLEwW69qCNuP08hKa9aucWuB3Ul6E+JPyMo1eufpi3fa3yMjIktpeqq976ZQle15siVe6Xl4YIOtdbEG3yQOpxIWA6iMUpxxQHccmWKW8zJ49m/DwcNzc3IiPj2fz5s2NetzChQsxGAxMnDjRsgGF+IPk+bvoyCEqDO6ETRmhOo5D6zptKLUYCa/ZT+5vh1XHEYC5xkyXk1sBCLhMyostCI5wZ6eb/lkcmC+jI7BCeVm0aBHTp0/n2WefZdu2bcTGxjJmzBiOHTt2zsdlZ2fzyCOPMGzYMEtHFKKeE//R97pkhl6MwcNdcRrH1raDN3vb6AtCD3wse19sQfbKvXhTTDnudLm8h+o44nf5XfXRUfUqKS9ghfLy+uuvc8cddzB16lRiYmKYM2cOHh4ezJs376yPqa2t5aabbuKf//wnERERlo4oRB1Ng5Dt+noX7VIZGVlDQYw+OtLWrFUbRABwZKm+Z3yfVz+c3Z0UpxFnuF6il5fgfVJewMLlpaqqiuTkZBIT/3d2UqPRSGJiIklJSWd93PPPP09AQADTpk0772tUVlZSXFxc7yZEc+3ddJK+lRsBiLxfyos1tPl93UuorHuxCebf9PJyMlJGRrak69TB1GKkY+UBStMPqY6jnEXLS0FBAbW1tQQGBta7PzAwkLy8vAYfs379ej766CPmzp3bqNeYOXMm3t7edbfQ0NALzi1ar72zf8KJWg62icEjJlx1nFYhetowzBjoUrOX3M1HVcdp9dof0MuL81ApL7YkNMaTXS76qQX2fyLXObKpo41KSkq45ZZbmDt3Ln5+fo16zIwZMygqKqq75ebmWjilcGQuP+sjo4J42etiLW1Dfcn0iAXgwCeyS1ylyuJKospSAAi9Ug6TtjVHI/XR0emV8n1i0YGmn58fJpOJ/Pz8evfn5+cTFBT0l+33799PdnY2EyZMqLvP/Pu1HJycnMjIyKBLly71HuPq6oqrq6sF0ovWpviUmT55PwAQdJtcRdqajnUfTnRyCrWr1wKTVMdptfYvSSWGak4Y2hN6UWfVccSfOI+6CPa8TkC6lBeL7nlxcXGhX79+rFq1qu4+s9nMqlWrSEhI+Mv23bp1Y+fOnaSkpNTdLr/8ckaOHElKSoqMhIRFbX0/mQCOU2r0pMN1Q1THaVU8xv2+7uWArHtR6fj3+sjoQPuBGIxyckZb02XKUAA6V+yh/OBxxWnUsvjYaPr06cydO5f58+eTlpbG3XffTVlZGVOnTgVg8uTJzJgxAwA3Nzd69uxZ7+bj44Onpyc9e/bExcXF0nFFK1ayUB8Z7Y8YDc7OitO0LtHT9FMiRFWnkZt87tMoCMsxJevlpTRG1rvYorC+7Ul36glA5vz1itOoZfHyMmnSJGbNmsUzzzxDXFwcKSkprFixom4Rb05ODkePyiI9oZbZDJ126+d3cbpCRkb
W1jbcj33uvQDYL+telAk+pJeXNiOlvNgigwFyO+vrXsqWt+7vE4OmOdaFEoqLi/H29qaoqAgvLy/VcYSdSP0pn96j9XVYldlHcQ3765osYVkb+t3PkG3/ZlX3exm159+q47Q6RTlFeIf5AHB89zH8Y/zVBhIN+vmORSR+eD0ZbfvStSRZdZwW1ZTf3zZ1tJEQqmTPWQFApnc/KS6KuI/R1710zJR1Lyoc+EK/JECOU2cpLjYs/BZ9xBpZmsLp/CLFadSR8iIE4LFWHxkVD5VDpFWJmqbvDu9avYvDOwoUp2l9in7eAsChIDlE2pZ1GRbCAVMkJszsm79RdRxlpLyIVq8gr4b+hT8C0PFOKS+qeHYJ4IBbDACZH8tJuKzNLVVf71IVJ+tdbJnBANmd9KJf/F3rXfci5UW0ettnb8SHIgpNfgSMl391qpTXVR8dVf+8Vm2QVigsXy8vPqOlvNg6baheXrxTpbwI0WpVfK2PjA52Gwsmk+I0rZvr7+teOuyTdS/WlJd8mGDzYWoxEnldX9VxxHmE3qSXl+iiLVSdKlecRg0pL6JVq62FLhl6eXG7SkZGqkXeppeXrlWpHNl9UnGa1uPgl/p6l0y3nrQNbKM4jTifqEvCOWzsiAvV7PvvJtVxlJDyIlq1Hd/m0KN2p/4vznvHqI7T6nl3DSLbtStGNPbNk3Uv1lLxqz4yyu8kIyN7YDAa2B+i730pXNo6R0dSXkSrdvhD/VpG+9on4BzYTnEaAXA0Wt/7UrVyrdogrYhXul5etAFSXuxFdYJeXjy3S3kRotXx2qCPjMqGy8jIVriO1stLsKx7sQpzjZkuhfo5XgIvkwXr9iLk+t/XvRQmUVNepTiN9Ul5Ea3WsZzT9D/1MwCd7pZLAtiKLr+ve+lemUJeRus9CZe1ZP+0D2+KKMedLpf3UB1HNFL05d0oMPjhQQV7P3esM+02hpQX0Wql/vtX2lDOMecQ/Ef1Vh1H/M47pgO5Ll0wYSbjo9Z98TlrOPKNPjLa59kXZw+5IKm9MDkZ2Buo730p+Lr1jY6kvIhWq/qb3w+R7nGpfuYnYTOOROl7XypXyujI0mo36uXlZKSsd7E3p+P18uK+VcqLEK1CbS1EZ34PQJtrZWRka5wT9fISmCHlxdL8DujlxXmIlBd7E3jt7+tejq2ntqpWcRrrkvIiWqUdX+6jizmTKpyJvnuU6jjiTyKm6uWlx+lk8jNLFKdxXFWlVUSWpQDQ8SopL/am6zW9KcILb4rZ+0WK6jhWJeVFtEpH5+kjo4yAi3Dy9VScRvyZT2wYR5zDcKKW9Hmt9+Jzlpb5dSquVHHC0J5OwzurjiOayMnVREagXvTzP1+tOI11SXkRrVL7JH1kdHqUjIxs1aFI/YdyxQoZHVnK8eX6yOhA+wEYjLLuyx5VJFwMQJvNUl6EcGj5+0vpU6L/Qux8r5zfxVY5XayXl4B0KS+WYtqql5fS7jIyslchN+vlpXvBOqpKW8/5XqS8iFZn9zurcaWKQy4R+A2OVh1HnEXnW/Xy0rNiC8eyW+fF5ywtOFcvL21GSnmxV12u6Mlxgz9tKSPt0y2q41iNlBfR6pi/1UdGubHj5RBpG+bbL4J85w64UE3avCTVcRxOcW4RnavSAQi/Vs6sa6+MTkb2dRwJQOGXrWd0JOVFtCo11Rrds/TFut7Xy8jIphkM5ESMAKDsBxkdtbT9XyRjRCPXFE5AzwDVccQFqBmmj458t61SnMR6pLyIViV1wS46aIcox52ufxuhOo44D9NIfXTkv0fKS0s79bM+YsgNlpGRvet06+/rXoqSKC9oHSNWKS+iVTn+iT4ySgsZhamNm+I04nzCf1/30qt8E8dzTytO41jcd2wCoDpORkb2LmxUJEdMHXGlqtWcWkDKi2hV/LfoI6OaS2RkZA/aDYziuFMQblSy5+NNquM4DM2sEZ7/GwC+4wYpTiMulMFoYH+4frLN4qWtY92LlBfRauSlnaR3mf6vki7
3S3mxCwYDOZ31vS+l38voqKUc3nSIIPNRajAReV1f1XFES7hYHx35pUp5EcKhpL+9Eidq2e/eA79+YarjiEYyDNfLi99uKS8tJecLfa/LPo9YPPw8FKcRLSHiNv2Io+5lWyjKKVKcxvKkvIjW4wd9ZHQ0Tva62JOwyb+veylLouBI6zkJlyVV/aqXl2MRMjJyFB0GhZLlHIUJMxlzHf8q01JeRKtQU2WmR84PAPjeLJcEsCfth3bnhMkfDyrY9UnrOQmXJfnu09cPmYZIeXEkOZH66Kjie8cfHUl5Ea3Czo+34q8dp9jgRbfbBquOI5rCYCAn7CIASr+T0dGFqiqtIrokGYCOV8UrTiNaktMYfdFuUJqUFyEcwon/6COjtI6jMbk5K04jmuz3dS/tdkp5uVD7vkrFndOcNPgSlhilOo5oQdF3jACg6+lUCvYcUxvGwqS8iFYhaJteXszjZGRkjzrdopeX3qUbKMyvVpzGvhV8p693yfQbJFeSdjD+Mf5kuPUGYO/ctWrDWJiUF+Hwjqbk07NCXysR/cBYxWlEc7Qf3pNTpna0pYyd87epjmPXTFv19S5lPWW9iyM62k1f91Kz4mfFSSxLyotweHvfXgFAWpt+tO8RpDiNaBajkezQYQAUfyujowvR8bC+56XtKFnv4ojaTLwEgIh9K9HMmuI0liPlRTg855X6JQGO9ZeRkT3ThumjI59UKS/NdSKjgPDqTAAib5RrGjmimLuHU4kLHWsPkrVyn+o4FiPlRTi06vJqehz+EQD/W6W82LPQm39f91K8npMFtYrT2KcDn+sjo/0u3fDp7Ks4jbCENgFt2O07FIDcD39UnMZypLwIh7bng/V4U8xxQwDdbu6vOo64AH6jYik2euNNMamfpqiOY5fKVuvl5UiorHdxZEWDxgDgvm6l4iSWI+VFOLSiBfrIKL3zOIxO8tfdrplMZHfU/0V56hsZHTWH5259vYs2UNa7OLKQKaMBiDm2hqpSxzwrtfw0Fw6tY6peXgyXycjIEdQO0UdH3jukvDSVucZMZKG+5yXwCtnz4siiru7NMWMgbSljz9wNquNYhFXKy+zZswkPD8fNzY34+Hg2b9581m3nzp3LsGHD8PX1xdfXl8TExHNuL8TZHFm3n4jKdKpxIuah0arjiBbQ8abf170UreNUoVlxGvuStSIDb4opw4MuV/RUHUdYkNHJSEaY/jPv1GLHHB1ZvLwsWrSI6dOn8+yzz7Jt2zZiY2MZM2YMx441fPa/tWvXcsMNN7BmzRqSkpIIDQ1l9OjRHD582NJRhYM58I6+1yXVexjtOnsrTiNagv+YvpQa2tKOk6T+N1V1HLty5Gt9ZLTXewBObk6K0whLM4zWy0tQimMu2rV4eXn99de54447mDp1KjExMcyZMwcPDw/mzZvX4PafffYZ99xzD3FxcXTr1o0PP/wQs9nMqlWrLB1VOBiPNXp5KRoiIyOH4eREdochABQuldFRU2i/6eWlqKusd2kNou7Rz/fSrWI7BbvzFadpeRYtL1VVVSQnJ5OYmPi/FzQaSUxMJCkpqVHPUV5eTnV1Ne3atWvw65WVlRQXF9e7CVF+rJQeBWsB6Pg3KS+OpGawPjry2i7lpSkCs/Ty4jZC1ru0BoG9A0l3iwNg77uOd7Zdi5aXgoICamtrCQwMrHd/YGAgeXl5jXqOxx57jJCQkHoF6I9mzpyJt7d33S00NPSCcwv7t+edVbhSxUGnCKIu66o6jmhBHW78fd3LqV8pOinrXhqjNK+UyNO7AAifJHteWoujvfVDprUfHW90ZNNHG7388sssXLiQJUuW4Obm1uA2M2bMoKioqO6Wm5tr5ZTCFlV8pY+MsrqPl4vPORj/cf2pMLjjxwlSFuxRHccuZC7cigkzh0ydCOobojqOsBKva/XyEpW1Eq3WsYq+RcuLn58fJpOJ/Pz687b8/HyCgs59jZlZs2bx8ssvs3LlSnr37n3W7VxdXfHy8qp3E62
bZtaIytDLi8d1lylOI1qciwtZwYMBKFwio6PGOPWjPjLKCZa9Lq1JjzsGU4YHAeZ89i/dqTpOi7JoeXFxcaFfv371FtueWXybkJBw1sf93//9Hy+88AIrVqygf385K6pomn2LUwgyH6GUNvS6b7jqOMICqhP0z9Vzm5SXxnBL0ctLdV8pL62Jm7cru/1HAnDoI8caHVl8bDR9+nTmzp3L/PnzSUtL4+6776asrIypU6cCMHnyZGbMmFG3/SuvvMLTTz/NvHnzCA8PJy8vj7y8PEpLSy0dVTiIox/qe112BSbi7uOqOI2whODr9fLS6+QvFBc57pVzW4Jm1uiSvxEAv8uHKE4jrK1yuH7ItNfGHxQnaVkWLy+TJk1i1qxZPPPMM8TFxZGSksKKFSvqFvHm5ORw9OjRuu3fe+89qqqquOaaawgODq67zZo1y9JRhYNo/5teXioT5SgjRxVw2UBOG9wI5BgpizJUx7Fp2T/tw187zmlcibquj+o4wso636f/HOxVtJ5T2afUhmlBBk3THOqfLcXFxXh7e1NUVCTrX1qhE+nH8e0eiBGNI5sPETKgg+pIwkLSg0fSLW8tX1/yHletvEt1HJu1btonDJs3lR1eQ4ktWqc6jlDggGt3IqrS2fDAQoa8NUl1nLNqyu9vmz7aSIimSntzBUY00t3ipLg4uNMJ+izfK3mN4iS2TdugX9vmZHcZGbVWOXETANCWfac4ScuR8iIcimm5/s15tJ8cZeTogm68GIDehWsoKXKsw0BbUocsvby4jxqsOIlQxfdm/edhzMHl1FbWKE7TMqS8CIdRXV5NTK6+ot5viqx3cXRBlw+k3OBBAMdJ/Xy36jg26eT+QrpUpQEQOVnKS2vV447BnDT40k4rZM+831THaRFSXoTD2PXBRrwposDgR8yUAarjCEtzcWF/8DAACr9crTiMbcr8j34ZlgPOXWnf1U9xGqGKk5sTuzuNA6Dw028Vp2kZUl6EwyhaoB9llN55HCYXk+I0whoqh+qjI59tUl4aUv6TPjI6HCZ7XVo7bby+7qXjdsdY9yLlRTiMjjv08mKaICOj1iLkpt/XvZxcS+kpx5jltyTv3fr5XbTBsli3tYt5eAw1mOhSuYcj6w+ojnPBpLwIh3Dwl2wiq/ZQg4mYh8eojiOsJGR8H4qMPnhTzK5Pt6mOY1Oqy6uJLtoMQMg1sueltWsf6Uuqlz5mPfC2/e99kfIiHEL228sA2O09BO8wH7VhhPWYTGR2GAFA0RIZHf1R5uLteFBBoaEdEePkyuoCTg7Rjzpqs9r+171IeREOwWvNNwAUDrtCcRJhbdXD9NGR73YpL390/Bt9ZLTPfzBGJ/lRLyD8fn3dS48Tv1CUW6w4zYWRv9HC7hVln6TXSf0CfZ0fkvLS2nScrJeXnkXrKSusVJzGdrhs0RfrlsfKyEjouoyLJts5Cheq2TnLvi/UKOVF2L3ds37AiVr2ufYgfFQX1XGElXW4JIbjxgA8qCDtk02q49gGTaPzEb28+IyXxbrif3L76v/AM3z9leIkF0bKi7B7hmX6yOhQX9nr0hoZjAYyO+l7X4qXyugI4EjSQQLNR6nGieib5JxH4n/877oagN6Hvqfi5GnFaZpPyouwa1UllfTI1S/17jdNyktrVXuRXl7a7ZDyApDzub7XJcOjL2383BWnEbak6y0DOWLqiCelpM5aqTpOs0l5EXZt5ztr8aKEPGMwPab0Vx1HKBI6RS8vMcW/UX68THEa9ap/0cvL8WgZGYn6DCYj+3pdBUD1IvsdHUl5EXat7HN9ZJQRfbkcUdGKdRoRwSFTJ1yoZu/HG1THUS5gn36kkevFUl7EX3lP1UdHPQ8so7qsSnGa5pGf9sJuaWaNqDT9/C7uky5XnEaoZDAaOBCm730pauXrXk4eLCby9E5ALsYoGtbrriEcNwTgo51i1ztrVMdpFikvwm5lLEgmuPYwpbSh14MXq44jVBul/x3wS23d5WXvxxs
wYeagcxcCYoNVxxE2yORiYk/XKwEonW+foyMpL8Ju5X+gj4x2hozF3ddNcRqhWtQdIwHoVpbMyaxTasMoVL5CP+dRbueLFCcRtqztFH101C1jKdUV9nddMCkvwm4Fb9HLS814OcpIQPCAjmS5RGPCzJ45v6iOo4zf7l8BMAwfrjiJsGWxD47ghKE9/tpxtr9uf6MjKS/CLh1al0X06Z36hRgflatIC92hbokAVH33k+IkapQdL6db6RYAwifLnhdxdk7uzuzpcR0AlfM+U5ym6aS8CLu0/w19oe5O72G0j2qnOI2wFe6XjwYgfK/9nr/iQmR8koQzNRwxdaTDkHDVcYSN87nnRgBiD3xN+YkKxWmaRsqLsEtnLsR4aoSMjMT/dLt7JDWY6Fyzj4Nrs1THsbri7/SRUVbocDAYFKcRtq7nnYM5ZArDixJ2/Os71XGaRMqLsDunDhTS65T+QzpCLsQo/qBtiBd7vAYBkDW39Y2OfHboa31qh8jISJyfwWRk34AbADAuXKA4TdNIeRF2Z+cry3Gilr2uvQgb0Vl1HGFjTvbXR0eua1vX6KiyuJJuRb8B0PEmWawrGifkkZsAiDu6nKKDJxWnaTwpL8LuOC/Tz0twNH6i2iDCJvndqJeX7kdWUVNZqziN9WT8dwtuVHLcEEDnMdGq4wg7EX1VTzJce+FKFTue/lJ1nEaT8iLsSsnRUmLzVgAQ8sA1itMIW9Tt5v6cMvjgwynS/7tVdRyrOfmNPjLKDLkIg1HWu4jGMRggb9TNAPgs+URtmCaQ8iLsys5XluPOabKdI4m8spfqOMIGmVydSAsZBUDBgtYzOmq7TV8HVjlIRkaiaWJm3kINJnqXbiTz2zTVcRpFyouwK4av9N2a2f2uln9dirOqHqGPjny3to7yUltZQ9cC/YKUQdfJYl3RNP69g0kO0s+Xdfj5jxSnaRwpL8JulJ+ooNeh5QAE3C0jI3F24XdcAkCP4iRKDhcrTmN5exduoy1lnDT4EnVlT9VxhD2adjsAPZI/tYsrTUt5EXYj9dUfaUsZh0xhdL+5n+o4woZ1Gt6ZLOconKgl7V37O/V5U+Uv1kdGewOHYXKWH+ui6fo+OY48YzB+2nG2P/+t6jjnJX/Lhd2oXaSPjDJjZWQkzu9gtD46Ov2t44+O2mzSr6RdGS/rXUTzOLs7sWfArQAYPrb90ZGUF2EXKosr6Zmt/2ug3R1XK04j7IHb75cK6JTu2OWlqrSK7gX6npfgm0cpTiPsWZd/3QZAv+MryF59QHGac5PyIuxC6us/400xR40h9Lx9kOo4wg70uHcE1TgRXp1J1irb/kF8IdI+3UJbyigw+NFlohyBJ5ovbFQkW/3GYEQj+5F/q45zTlJehF2oXKCfmC6jx1UYneSvrTg/zw5e7PEZDED2ez8oTmM5J79cBcC+DiPle0NcuPsfAKDP9o8oPVqiOMzZyd90YfNqKqrpkbkUAO+pMjISjVc0RD/8s80v3ytOYjk+2/TyUn2RjIzEhev7xFgOOEfjTTHbH5qvOs5ZSXkRNi/ljTX4aic5bvCn1z3DVMcRdqTDnXp56VWwhrLj5YrTtLzygnJiipIA6HTrxYrTCEdgdDJy8PL7AQhd8jbmGrPiRA2T8iJsXsXHCwFI6341Tq4mxWmEPYm4LIZDpjDcOc2ud1arjtPi0j9cjwvVHDaFEjYqUnUc4SD6vTWFIrwIr97Hpqe/Ux2nQVYpL7NnzyY8PBw3Nzfi4+PZvHnzObdfvHgx3bp1w83NjV69erF8+XJrxBQ2qLK4kl6ZXwPgfdcNitMIe2MwGtjfXd/7UvmV442OSr7RR0YHwkfJ6QNEi/Hq4ElKwt36/3/7BTSzpjjRX1m8vCxatIjp06fz7LPPsm3bNmJjYxkzZgzHjh1rcPuNGzdyww03MG3aNLZv387EiROZOHEiu3btsnRUYYNSXl6BD0UcNXag191DVccRdsj9ar28dMn43iZ/CF8I/52/702
6WEZGomX1+HA65bjTo3wr22b+qDrOXxg0TbPod3N8fDwDBgzg3//WD7sym82EhoZy//338/jjj/9l+0mTJlFWVsZ33/1vV9WgQYOIi4tjzpw553294uJivL29KSoqwsvLq+XeiFBiY9j1DM5ZxNp+0xmx9TXVcYQdKi8ox+DfHndOs39JqsMcTlx08CSe4e0xonF482E6DAhRHUk4mLV9pzNi+xvs9BpMz5PrLb53rym/vy2656Wqqork5GQSExP/94JGI4mJiSQlJTX4mKSkpHrbA4wZM+as21dWVlJcXFzvJhxD2bEyYnP0E9MF3H+94jTCXnn4ebDTX98zcfgDxxkdpb+3FiMa+527SXERFtH1w0c5jSu9ijey9eWfVcepx6LlpaCggNraWgIDA+vdHxgYSF5eXoOPycvLa9L2M2fOxNvbu+4WGhraMuGFcjteWEYbyjno1IXut/RXHUfYsbLh+ujIZ6PjlJeK7/T1Loe7yshIWEZw32A297kLAK8XHqG2qlZxov+x+6ONZsyYQVFRUd0tNzdXdSTRQpy+/ByArPjrZTGiuCCd79PLS4+ijRRnFypOc+E0DcIy9MseuE9IPM/WQjRfry+e5pTBh66nU/ntrk9Ux6lj0fLi5+eHyWQiPz+/3v35+fkEBQU1+JigoKAmbe/q6oqXl1e9m7B/RdknictbAUCHR+QoI3FhwoeHsdelBybM7HnD9hYfNtXB1fvpXLOPapzofp+cnE5Yjm9ke7Zf9gwAkfOfoviwbZx116LlxcXFhX79+rFq1aq6+8xmM6tWrSIhIaHBxyQkJNTbHuCnn3466/bCMe16/mtcqGava0+iJvZQHUc4gMOx+t6X2mX2PzrKfl8vYLu9h9A2RP7BJiwr4b/3ctC5C4HmPJIvfUp1HMAKY6Pp06czd+5c5s+fT1paGnfffTdlZWVMnToVgMmTJzNjxoy67R988EFWrFjBa6+9Rnp6Os899xxbt27lvvvus3RUYUPcl+ojo8PDZK+LaBl+U/Ty0v3gD1Sftp3ZfXO4/6rvlTw1aKziJKI1cPNyofCFdwEYnvoOu+asV5zICuVl0qRJzJo1i2eeeYa4uDhSUlJYsWJF3aLcnJwcjh49Wrf94MGDWbBgAR988AGxsbF8+eWXLF26lJ49e1o6qrARRzYfIu6kfv6KqKflKCPRMmJuH8xJgy/ttEJ2zFb/w7e5qkoq6ZGvf38ET5XyIqyjz2Oj+TXyNoxotHngNkrzy5Tmsfh5XqxNzvNi/1aP+z8uXvEYqd5D6X1qneo4woFsjJrC4MxPWRP7ICNT3lQdp1m2v7aaPo+MIt8YhH/VEYwmWcwurKPwwCmqonoQZD7CuqjbGLb3oxZ9fps5z4sQTaWZNTqu+Q8AJVfcojiNcDTO110JQNSuJXZ7tt3iL/SR0d7wMVJchFW1i/Ah77XPqMWIU3kx1eXVyrJIeRE2ZfeCHURX7qISF3o9f63qOMLB9Jw+mjI86FibQ/rn21XHaZbgVL28GMbJyEhYX9xDI8iYv4lBOV/g7OGsLIeUF2FTjs2aD8CO0Al4hfkqTiMcjXt7D3aG6L/08+csUZym6fK3HSb69E7MGOh63yWq44hWKmZyf+Xn3pLyImxGVWkVsan6yMjpjqmK0whHVTNBHx113GJ/5SX9Lf0Q6T1tBuLfrb3iNEKoI+VF2Iztz39Le+0EecZgYv8xRnUc4aB6/GM81TgRWbmbnFX7VMdpEpeV+rW+CuPHKU4ihFpSXoTNcJqvr1zfM/BWTK5OitMIR+Ub4csO35EAZL1uP3tfKgor6J2nXxIg6M7LFacRQi0pL8Im5CcfIu6Yvks89GkZGQnLKh+jj47a/WI/5WXnW6tpQzmHTaFEXRunOo4QSkl5ETYh7YlPMWFmu9dFRF0apTqOcHAxT0zEjIFeZb9xcH2O6jiNUvnFNwBkxlyufLGkEKpJeRHKmWs1Oq+eB0DptbcpTiNaA79ewaT6DAdg/0t
fKE5zfuYaM9F79fUuba6XkZEQUl6EctvfXkdYzX6K8aTvS9eojiNaiZLx+qUngtYuVJzk/DI+20qgOY9iPOl133DVcYRQTsqLUK7srQ8BSO1+PW0C2ihOI1qLmKevpgYTMRXJZP9k20cdHftwGQC7OozF1ctVcRoh1JPyIpQqSC9g4EF9t33AE7crTiNak/Zd/UhpnwhA9iuLFKc5t+AtenmpHS8jIyFAyotQbPcjH+NGJXs8+hN980DVcUQrU3GFPjrqsN52R0f7f9hLdOVOqnGi9+OXqo4jhE2Q8iKU0WrNRKx8D4Dj19ytOI1ojXo9PZFKXIiq3E3m0l2q4zQo57XFAKT6jcK7czvFaYSwDVJehDLbX/6R0OosTuJD3/+7XnUc0Qr5hPuwPUg/W23OKwsUp2lYyAZ9rFox4TrFSYSwHVJehDI177wLwPbYqXgGeihOI1orw403AtBt83+orapVnKa+rBUZdD2dSjVO9Hhyouo4QtgMKS9CiZxfs+mf/z0A4S/fpTiNaM3inrmcUwYfQsyH2P76GtVx6jkzMkrxS8S3i4yMhDhDyotQYu+jH2BEY1v7RCLGRquOI1oxV283UnvcAMDp9z9RG+ZPQtb/PjK6TEZGQvyRlBdhdWUFFcRtmQuA9jdZqCvU83tEv55W3+yvKc4tUpxGd+CHDKJO76QKZxkZCfEnUl6E1SU/9B/8tAJynDrT57krVMcRgu639CfTJQYPKtjx1GLVcQDI/tdnAOzwv4T2kb6K0whhW6S8CKvSas10XPw6AAcuexCjs0lxIiHAYDSQO+pWALyXfKw2DPq1jKJ++xSAmhtuUZxGCNsj5UVYVcrMH4ioyqAIL+LeloswCtsR89LN1GCid8lG9n6t9pwvqbPXEVp7UP8+eVb2TgrxZ1JehHW9ru91Se57Jz6hnorDCPE/gXHBbO2gF4Wjz7ynNEvp7PkA7Ii+Dvd27kqzCGGLpLwIq0n7PIU+J1dTg4nod+5XHUeIv3B9+F4A+u7+lJIjJUoyVBSUEbtPX3fjdd9kJRmEsHVSXoTVHH/iDQC2hF1Lx8GdFKcR4q/iHh7JAZeueFLK9un/UZIh+akleFLKQacIet8zVEkGIWydlBdhFTnrc0jI1k+/3v7FhxWnEaJhBqOBnPH3ABCy9F00s2b1DJ4L3gcg+6IpGE0Gq7++EPZAyouwiux7/g9natje7mK5erSwaXFvTKEMDyIrd5Py1i9Wfe29X+0ktmQ9NZjoNut2q762EPZEyouwuBO78xi480MAtBlPKk4jxLn5hHmzrYe+1qR65iyrvnbec/pC4a0dJhLYJ8Sqry2EPZHyIixu97TXcaOSHW0S6DN9pOo4QpxXp7f+Ti1GBh7/nr1f7bTKa5blldBnl77OxuWhe6zymkLYKykvwqJO7D1B30361aMrpj+JwSgzfGH7wkZFsin0GgCOP/p/VnnNrQ98iielHHDuStzDUvKFOBcpL8KiUm57m7aUke4eR/xzl6qOI0Sj+bz0GADxWZ9zaH22RV+r5nQNnZe8BsDhiffKQl0hzkPKi7CYwgOn6LvhbQBK7ntC9roIuxJzc1+2+l6CE7Xsv+Nli77Wlse+pFNNFgUGP/q9O82iryWEI5DyIixmxy2z8OUUmW496P/SVarjCNFkLi88DcDg9I/I/jnTIq+hmTXazX0FgJ3D78fDz8MiryOEI5HyIiyiMC2fARvfBKDgwRcxOMkFGIX96X3vMLb4X4ozNRy+/RmLvMaWF3+ka0UKZXjQa869FnkNIRyNlBdhEbtunklbytjlPoCB/5ILywn75fnOSwAMOfg5GQu3t+hzm2vMeL6snz5ga7+78OvavkWfXwhHJeVFtLhDG3OI36afr6L8yX/J4kNh17pNimV92I0AnP7bAy161t1Njy6me8U2ivGk538fb7HnFcLRWay8FBYWctNNN+Hl5YWPjw/Tpk2jtLT0nNvff//9dO3aFXd3dzp16sQDDzxAUVG
RpSIKC8mc8gKuVLHdZwQDZiSqjiPEBYtY9DKltCG2eD0b7vq0RZ6zsriS4NlPAZA84hHad/NvkecVojWwWHm56aab2L17Nz/99BPfffcdv/76K3feeedZtz9y5AhHjhxh1qxZ7Nq1i08++YQVK1YwbZqsvLcnGV/tYljmPADcZv1LjjASDiEkPpQt454FoNuHj1C478QFP2fS1bMIr87kmDGQfp9Nv+DnE6I1MWia1uJXHktLSyMmJoYtW7bQv39/AFasWMGll17KoUOHCAlp3GmvFy9ezM0330xZWRlOTk6NekxxcTHe3t4UFRXh5eXV7PcgmkHTSA4YS7+ClWzqcBXxh75SnUiIFlNVVs3B9n2IqtxNUodrGJTzRbPLee7a/fiN7Ik7p9lwz2cMmX1jC6cVwv405fe3Rfa8JCUl4ePjU1dcABITEzEajWzatKnRz3PmDZyruFRWVlJcXFzvJtTY/tIP9CtYSSUuBM63zllJhbAWlzbO1Mz9hGqcSDj8JRtu/7hZz1NbVcuJK27DndMk+4xi8Ds3tHBSIRyfRcpLXl4eAQEB9e5zcnKiXbt25OXlNeo5CgoKeOGFF845agKYOXMm3t7edbfQ0NBm5xbNV11ejffz+q7v9X0fJHxUF8WJhGh53W/pz4YxLwDQ9+P72DN/S5OfY/3YF4kr/pUS2uL/1RwZrQrRDE0qL48//jgGg+Gct/T09AsOVVxczPjx44mJieG5554757YzZsygqKio7pabm3vBry+abt1Nc4ioyuC4wZ9+X8uVo4XjGrbsUbb6j8WDCtrfdnmTLh3w2+NLGbrmeQBS7nyPThdHWiilEI6tcQtJfvf3v/+dW2+99ZzbREREEBQUxLFjx+rdX1NTQ2FhIUFBQed8fElJCWPHjsXT05MlS5bg7Ox8zu1dXV1xdXVtVH5hGcd2HaPPUn0x496bnmdImLfiREJYjsnFRPS2ReyNHEp05U4OjRhO9g8/E35J1Dkfl/zqamJfuQETZtbF/I2hc262UmIhHE+Tyou/vz/+/uc/nC8hIYFTp06RnJxMv379AFi9ejVms5n4+PizPq64uJgxY8bg6urKsmXLcHNza0o8oci+iY8whJNkuMeR8NHtquMIYXFeHb0oW7ecrCGj6Fy9l8Ixg9g8Y16DJ2TUzBrrp80j/pO7caGazQGXkZD8bwwyLRKi2SxytBHAuHHjyM/PZ86cOVRXVzN16lT69+/PggULADh8+DCjRo3i008/ZeDAgRQXFzN69GjKy8tZsmQJbdq0qXsuf39/TKbGnV5ejjayrtQ3V9P74VGYMbD7w9/oNW2g6khCWM3x3cc4NvAyepTra1+2+o3FfOddhF2rfx9kL9qE0/v/pt/JVQAkdbiGvrv/g6u3/MNMiD9ryu9vi5WXwsJC7rvvPr799luMRiNXX301b7/9Nm3btgUgOzubzp07s2bNGkaMGMHatWsZOXJkg8+VlZVFeHh4o15Xyov1nC6q5Kh/bzpX72V1zL1cvPvfqiMJYXVVpVVsHPkkQ7e+gRO1DW5TiQtJY//JsG8exeQi1/kSoiE2UV5UkfJiPauGP8+oX58l3xiE64F0fGSti2jFDq7eT/bf3yF813d0rMkCINc5guxeE4h66346DO2sOKEQtk3Ki5QXi9u7eAfh1w3AhWo2TV9I/GuTVEcSwmbUnK7BYDTIXhYhmqApv7+btGBXCIDqsiqYMkUvLsETiZ91nepIQtgUJzf50SqEJclVpUWTbRz3AtEVOzhhaE/YD3OQwyaEEEJYk5QX0SSpH21hyLqZAKQ/+B5BsYGKEwkhhGhtpLyIRis5XIzXXTfgRC2/hU1iyBvXqo4khBCiFZLyIhpFM2vsHvo3wmv2c8jUiZi176qOJIQQopWS8iIa5ddbP2JQ9kJqMFHwzkK8wtupjiSEEKKVkvIizmv3f7cz4D8PALB+7L+IuztBcSIhhBCtmZQXcU4n0o7hfetEPKggOXAcw797VHU
kIYQQrZyUF3FWVaVVHEq4ho61OWQ5RxG5aQEGk/yVEUIIoZb8JhIN0swaG/veR2zROorwovrLZXiH+aiOJYQQQkh5EQ1bm/giI/bNxYyB/f/8jOjLu6mOJIQQQgBSXkQD1t/6ISPXPAPAxhv+Td9nLlOcSAghhPgfKS+invUPfUnC/L8BsCrhSYYuuEdxIiGEEKI+KS+izuZHFzPoresxYWZDt9u4eP0LqiMJIYQQfyHlRQCw4eHF9J2ln/p/fcRkElI/wGCUCy4KIYSwPVJeBL/e9D6D3rweJ2r5pfMUBu2Zh9HZpDqWEEII0SApL02w5Z/LqSqtUh2jxWhmjV8veoqLFtz1+6hoGkPTP8LJVYqLEEII2yXlpZF2f7yZAc+N50i7Hmx6chmaWVMd6YKUHStjQ/hNXLTuXwCsGvosg3fPxeQixUUIIYRtk/LSSOU5BeQbgwivziT+pSvY5j+afUt2qY7VLAd/3seRsEEMzf2cGkysmzyXUeuekzUuQggh7IKUl0Ya8OyleOTuZe2gx6nEhX6FPxNxVSy/xtzF0S2HVMdrFM2sse5v/8Xnkv5End5FvjGIPe+sZtj821VHE0IIIRpNyksTeIZ4MiJpJvlr0vgt5CpMmLko7X3aDezCL3EPcGzHUdURzypvRz5bQq9k2Ae34E0xqV5Dqdm0jd73XaQ6mhBCCNEkUl6aodOICAYd/oodb/9CivdFuFLF8B3v4BkXwS+xD3Bw9X7VEevUnK5h7XXv4hbXjYFHvqEKZ9Ze8i96HFtDh/7BquMJIYQQTSbl5QLE3n8RsYVr2fbqKlI9B+POaYanvkPoqCh+C7mSHW//omxhr7nGzMaHvyDXqwcjFt+LD6dI9+jD/oVbGbHyCUyuTkpyCSGEEBfKoGmafR828yfFxcV4e3tTVFSEl5eX1V5XM2tsn7WK2lmvM+D4D3X3H3COJmf4ZKKev4UOCZ0snqP8RAXJ0z8jaNFbRFXqC4pPGNqz+9p/MuQ/d8nRREIIIWxSU35/S3mxgP3fpXHo0bfon/4f2lAOgBkDu7wGUzhkAqH3XE7Epd1a7Ogec42Zne+t59TcL+i5ayHttRMAFOPJtpGP0PfTh/DqqObPQgghhGgMKS+Ky8sZpUdLSHn6K9p8NZ8+p9bW+9phUygHOwyheuAQ/C4dSMdRXfHu5N2o5z196jQ5P2WQt2wzpnVriMpdTYA5v+7ruU7h7B97H3HvTMMn3KcF35EQQghhGVJebKS8/NHh33LJfONbPFZ/S++C1bjy1zP1HjMGku/RmQqP9lS1bU+tmwcGsxnQcC49hXvJMbzLj9Cpej8mzPUeW4Q3O7tMxHXyJPo+PlrGQ0IIIeyKlBcbLC9/VJpXyt7/bqZo+QY8UzfQ8WQqQeamHWZ9Ch+yvWM5FTscnytH0v22BFy9XC2UWAghhLAsKS82Xl4aUpxbxKHVeylJO0TV0RPUHDuBobISDAYwGDD4eOPUIQCP8EBCRnYloHeQnBFXCCGEw2jK7285XtZGeIV6EzNlADBAdRQhhBDCpsl5XoQQQghhV6S8CCGEEMKuSHkRQgghhF2R8iKEEEIIuyLlRQghhBB2RcqLEEIIIeyKxcpLYWEhN910E15eXvj4+DBt2jRKS0sb9VhN0xg3bhwGg4GlS5daKqIQQggh7JDFystNN93E7t27+emnn/juu+/49ddfufPOOxv12DfffBODQU7AJoQQQoi/sshJ6tLS0lixYgVbtmyhf//+ALzzzjtceumlzJo1i5CQkLM+NiUlhddee42tW7cSHBxsiXhCCCGEsGMW2fOSlJSEj49PXXEBSExMxGg0smnTprM+rry8nBtvvJHZs2cTFBTUqNeqrKykuLi43k0IIYQQjssi5SUvL4+AgIB69zk5OdGuXTvy8vLO+riHH36YwYMHc8UVVzT6tWbOnIm3t3fdLTQ0tNm5hRBCCGH7mlReHn/8cQwGwzlv6enpzQqybNkyVq9ezZtvvtm
kx82YMYOioqK6W25ubrNeXwghhBD2oUlrXv7+979z6623nnObiIgIgoKCOHbsWL37a2pqKCwsPOs4aPXq1ezfvx8fH59691999dUMGzaMtWvXNvg4V1dXXF1dG/sWhBBCCGHnmlRe/P398ff3P+92CQkJnDp1iuTkZPr16wfo5cRsNhMfH9/gYx5//HFuv/32evf16tWLN954gwkTJjQ6o6ZpALL2RQghhLAjZ35vn/k9fk6ahYwdO1br06ePtmnTJm39+vVaVFSUdsMNN9R9/dChQ1rXrl21TZs2nfU5AG3JkiVNet3c3FwNkJvc5CY3uclNbnZ4y83NPe/veoscKg3w2Wefcd999zFq1CiMRiNXX301b7/9dt3Xq6urycjIoLy8vEVfNyQkhNzcXDw9PVv8XDHFxcWEhoaSm5uLl5dXiz63LZD3Z/8c/T06+vsDx3+P8v7sn6Xeo6ZplJSUnPN0KmdYrLy0a9eOBQsWnPXr4eHh5901dL6vN8RoNNKxY8cmP64pvLy8HPYvJcj7cwSO/h4d/f2B479HeX/2zxLv0dvbu1HbybWNhBBCCGFXpLwIIYQQwq5IeWkCV1dXnn32WYc9NFven/1z9Pfo6O8PHP89yvuzf7bwHg1acxaWCCGEEEIoIntehBBCCGFXpLwIIYQQwq5IeRFCCCGEXZHyIoQQQgi7IuXlD/71r38xePBgPDw8/nKByLPRNI1nnnmG4OBg3N3dSUxMZN++ffW2KSws5KabbsLLywsfHx+mTZtGaWmpBd7BuTU1R3Z29lmvHr548eK67Rr6+sKFC63xlv6iOX/WI0aM+Ev+u+66q942OTk5jB8/Hg8PDwICAnj00Uepqamx5FtpUFPfX2FhIffffz9du3bF3d2dTp068cADD1BUVFRvO5Wf4ezZswkPD8fNzY34+Hg2b958zu0XL15Mt27dcHNzo1evXixfvrze1xvzPWlNTXl/c+fOZdiwYfj6+uLr60tiYuJftr/11lv/8lmNHTvW0m/jnJryHj/55JO/5Hdzc6u3jT1/hg39PDEYDIwfP75uG1v6DH/99VcmTJhASEgIBoOBpUuXnvcxa9eupW/fvri6uhIZGcknn3zyl22a+n3dZE26cJCDe+aZZ7TXX39dmz59uubt7d2ox7z88suat7e3tnTpUm3Hjh3a5ZdfrnXu3FmrqKio22bs2LFabGys9ttvv2nr1q3TIiMj613nyVqamqOmpkY7evRovds///lPrW3btlpJSUnddoD28ccf19vuj+/fmprzZz18+HDtjjvuqJe/qKio7us1NTVaz549tcTERG379u3a8uXLNT8/P23GjBmWfjt/0dT3t3PnTu2qq67Sli1bpmVmZmqrVq3SoqKitKuvvrredqo+w4ULF2ouLi7avHnztN27d2t33HGH5uPjo+Xn5ze4/YYNGzSTyaT93//9n7Znzx7tqaee0pydnbWdO3fWbdOY70lraer7u/HGG7XZs2dr27dv19LS0rRbb71V8/b21g4dOlS3zZQpU7SxY8fW+6wKCwut9Zb+oqnv8eOPP9a8vLzq5c/Ly6u3jT1/hidOnKj33nbt2qWZTCbt448/rtvGlj7D5cuXa08++aT29ddfa3D+6wkeOHBA8/Dw0KZPn67t2bNHe+eddzSTyaStWLGibpum/pk1h5SXBnz88ceNKi9ms1kLCgrSXn311br7Tp06pbm6umqff/65pmmatmfPHg3QtmzZUrfNDz/8oBkMBu3w4cMtnv1sWipHXFycdtttt9W7rzF/4a2hue9x+PDh2oMPPnjWry9fvlwzGo31fsC+9957mpeXl1ZZWdki2RujpT7DL774QnNxcdGqq6vr7lP1GQ4cOFC799576/67trZWCwkJ0WbOnNng9tddd502fvz4evfFx8drf/vb3zRNa9z3pDU19f39WU1Njebp6anNnz+/7r4pU6ZoV1xxRUtHbbamvsfz/Xx1tM/wjTfe0Dw9PbX
S0tK6+2ztMzyjMT8H/vGPf2g9evSod9+kSZO0MWPG1P33hf6ZNYaMjS5AVlYWeXl5JCYm1t3n7e1NfHw8SUlJACQlJeHj40P//v3rtklMTMRoNLJp0yarZW2JHMnJyaSkpDBt2rS/fO3ee+/Fz8+PgQMHMm/evGZdl+pCXch7/Oyzz/Dz86Nnz57MmDGj3gVDk5KS6NWrF4GBgXX3jRkzhuLiYnbv3t3yb+QsWurvUlFREV5eXjg51b+0mbU/w6qqKpKTk+t9/xiNRhITE+u+f/4sKSmp3vagfxZntm/M96S1NOf9/Vl5eTnV1dW0a9eu3v1r164lICCArl27cvfdd3PixIkWzd5YzX2PpaWlhIWFERoayhVXXFHv+8jRPsOPPvqI66+/njZt2tS731Y+w6Y63/dgS/yZNYbFLszYGuTl5QHU+6V25r/PfC0vL4+AgIB6X3dycqJdu3Z121hDS+T46KOP6N69O4MHD653//PPP8/FF1+Mh4cHK1eu5J577qG0tJQHHnigxfI3RnPf44033khYWBghISGkpqby2GOPkZGRwddff133vA19xme+Zi0t8RkWFBTwwgsvcOedd9a7X8VnWFBQQG1tbYN/tunp6Q0+5myfxR+/387cd7ZtrKU57+/PHnvsMUJCQur9Ihg7dixXXXUVnTt3Zv/+/TzxxBOMGzeOpKQkTCZTi76H82nOe+zatSvz5s2jd+/eFBUVMWvWLAYPHszu3bvp2LGjQ32GmzdvZteuXXz00Uf17relz7CpzvY9WFxcTEVFBSdPnrzgv/eN4fDl5fHHH+eVV1455zZpaWl069bNSolaVmPf34WqqKhgwYIFPP3003/52h/v69OnD2VlZbz66qst9ovP0u/xj7/Ie/XqRXBwMKNGjWL//v106dKl2c/bWNb6DIuLixk/fjwxMTE899xz9b5m6c9QNN3LL7/MwoULWbt2bb0Frddff33d/+/Vqxe9e/emS5curF27llGjRqmI2iQJCQkkJCTU/ffgwYPp3r0777//Pi+88ILCZC3vo48+olevXgwcOLDe/fb+GdoChy8vf//737n11lvPuU1ERESznjsoKAiA/Px8goOD6+7Pz88nLi6ubptjx47Ve1xNTQ2FhYV1j78QjX1/F5rjyy+/pLy8nMmTJ5932/j4eF544QUqKytb5NoX1nqPZ8THxwOQmZlJly5dCAoK+stK+fz8fAC7+QxLSkoYO3Ysnp6eLFmyBGdn53Nu39KfYUP8/PwwmUx1f5Zn5Ofnn/X9BAUFnXP7xnxPWktz3t8Zs2bN4uWXX+bnn3+md+/e59w2IiICPz8/MjMzrf6L70Le4xnOzs706dOHzMxMwHE+w7KyMhYuXMjzzz9/3tdR+Rk21dm+B728vHB3d8dkMl3w34lGabHVMw6kqQt2Z82aVXdfUVFRgwt2t27dWrfNjz/+qGzBbnNzDB8+/C9HqJzNiy++qPn6+jY7a3O11J/1+vXrNUDbsWOHpmn/W7D7x5Xy77//vubl5aWdPn265d7AeTT3/RUVFWmDBg3Shg8frpWVlTXqtaz1GQ4cOFC777776v67trZW69ChwzkX7F522WX17ktISPjLgt1zfU9aU1Pfn6Zp2iuvvKJ5eXlpSUlJjXqN3NxczWAwaN98880F522O5rzHP6qpqdG6du2qPfzww5qmOcZnqGn67xFXV1etoKDgvK+h+jM8g0Yu2O3Zs2e9+2644Ya/LNi9kL8TjcraYs/kAA4ePKht37697nDg7du3a9u3b693WHDXrl21r7/+uu6/X375Zc3Hx0f75ptvtNTUVO2KK65o8FDpPn36aJs2bdLWr1+vRUVFKTtU+lw5Dh06pHXt2lXbtGlTvcft27dPMxgM2g8//PCX51y2bJk2d+5cbefOndq+ffu0d999V/Pw8NCeeeYZi7+fhjT1PWZmZmrPP/+8tnXrVi0rK0v75ptvtIiICO2iiy6qe8yZQ6VHjx6tpaS
kaCtWrND8/f2VHSrdlPdXVFSkxcfHa7169dIyMzPrHZpZU1OjaZraz3DhwoWaq6ur9sknn2h79uzR7rzzTs3Hx6fuyK5bbrlFe/zxx+u237Bhg+bk5KTNmjVLS0tL05599tkGD5U+3/ektTT1/b388suai4uL9uWXX9b7rM78DCopKdEeeeQRLSkpScvKytJ+/vlnrW/fvlpUVJRVi/SFvMd//vOf2o8//qjt379fS05O1q6//nrNzc1N2717d9029vwZnjF06FBt0qRJf7nf1j7DkpKSut91gPb6669r27dv1w4ePKhpmqY9/vjj2i233FK3/ZlDpR999FEtLS1Nmz17doOHSp/rz6wlSHn5gylTpmjAX25r1qyp24bfz4dxhtls1p5++mktMDBQc3V11UaNGqVlZGTUe94TJ05oN9xwg9a2bVvNy8tLmzp1ar1CZC3ny5GVlfWX96tpmjZjxgwtNDRUq62t/ctz/vDDD1pcXJzWtm1brU2bNlpsbKw2Z86cBre1hqa+x5ycHO2iiy7S2rVrp7m6umqRkZHao48+Wu88L5qmadnZ2dq4ceM0d3d3zc/PT/v73/9e71Bja2nq+1uzZk2Df6cBLSsrS9M09Z/hO++8o3Xq1ElzcXHRBg4cqP322291Xxs+fLg2ZcqUett/8cUXWnR0tObi4qL16NFD+/777+t9vTHfk9bUlPcXFhbW4Gf17LPPapqmaeXl5dro0aM1f39/zdnZWQsLC9PuuOOOFv2l0BxNeY8PPfRQ3baBgYHapZdeqm3btq3e89nzZ6hpmpaenq4B2sqVK//yXLb2GZ7tZ8SZ9zRlyhRt+PDhf3lMXFyc5uLiokVERNT7nXjGuf7MWoJB0xQc0yqEEEII0UxynhchhBBC2BUpL0IIIYSwK1JehBBCCGFXpLwIIYQQwq5IeRFCCCGEXZHyIoQQQgi7IuVFCCGEEHZFyosQQggh7IqUFyGEEELYFSkvQgghhLArUl6EEEIIYVekvAghhBDCrvw/kENWvzcBaGkAAAAASUVORK5CYII="
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "from matplotlib import pyplot as plt\n",
+    "plt.plot(t, normal_res, color='blue')\n",
+    "plt.plot(t, cache_res, color='red')"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2024-06-03T03:53:58.803332700Z",
+     "start_time": "2024-06-03T03:53:57.867105200Z"
+    }
+   },
+   "id": "97ace2f9183fef16"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "4edc26eea7760479"
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/tensorneat/tmp.ipynb b/tensorneat/tmp.ipynb
new file mode 100644
index 0000000..60f38da
--- /dev/null
+++ b/tensorneat/tmp.ipynb
@@ -0,0 +1,221 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "initial_id",
+   "metadata": {
+    "collapsed": true,
+    "ExecuteTime": {
+     "end_time": "2024-06-06T11:55:39.434327400Z",
+     "start_time": "2024-06-06T11:55:39.361327400Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "Array([[[2, 4],\n        [1, 3]],\n\n       [[4, 3],\n        [2, 1]],\n\n       [[3, 1],\n        [4, 2]],\n\n       [[1, 2],\n        [3, 4]],\n\n       [[2, 4],\n        [1, 3]],\n\n       [[4, 3],\n        [2, 1]],\n\n       [[3, 1],\n        [4, 2]],\n\n       [[1, 2],\n        [3, 4]]], dtype=int32)"
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import jax, jax.numpy as jnp\n",
+    "a = jnp.array([\n",
+    "    [1, 2],\n",
+    "    [3, 4]\n",
+    "])\n",
+    "def rot_boards(board):\n",
+    "    def rot(a, _):\n",
+    "        a = jnp.rot90(a)\n",
+    "        return a, a  # carry, y\n",
+    "    \n",
+    "    _, boards = jax.lax.scan(rot, board, jnp.arange(4, dtype=jnp.int32))\n",
+    "    return boards\n",
+    "a1 = rot_boards(a)\n",
+    "a2 = rot_boards(a)\n",
+    "\n",
+    "a = jnp.concatenate([a1, a2], axis=0)\n",
+    "a"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "Array([[2, 4, 1, 3],\n       [4, 3, 2, 1],\n       [3, 1, 4, 2],\n       [1, 2, 3, 4],\n       [2, 4, 1, 3],\n       [4, 3, 2, 1],\n       [3, 1, 4, 2],\n       [1, 2, 3, 4]], dtype=int32)"
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "a = a.reshape(8, -1)\n",
+    "a"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2024-06-06T11:55:31.121054800Z",
+     "start_time": "2024-06-06T11:55:31.075517200Z"
+    }
+   },
+   "id": "639cdecea840351d"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "outputs": [],
+   "source": [
+    "action = [\"up\", \"right\", \"down\", \"left\"]\n",
+    "lr_flip_action = [\"up\", \"left\", \"down\", \"right\"]\n",
+    "def action_rot90(li):\n",
+    "    first = li[0]\n",
+    "    return li[1:] + [first]\n",
+    "\n",
+    "a = a\n",
+    "rl_flip_a = jnp.fliplr(a)"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2024-06-06T11:22:36.417287600Z",
+     "start_time": "2024-06-06T11:22:36.414285500Z"
+    }
+   },
+   "id": "92b75cd0e870a28c"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[1 2]\n",
+      " [3 4]] ['up', 'right', 'down', 'left']\n",
+      "[[2 1]\n",
+      " [4 3]] ['up', 'left', 'down', 'right']\n",
+      "[[2 4]\n",
+      " [1 3]] ['right', 'down', 'left', 'up']\n",
+      "[[1 3]\n",
+      " [2 4]] ['left', 'down', 'right', 'up']\n",
+      "[[4 3]\n",
+      " [2 1]] ['down', 'left', 'up', 'right']\n",
+      "[[3 4]\n",
+      " [1 2]] ['down', 'right', 'up', 'left']\n",
+      "[[3 1]\n",
+      " [4 2]] ['left', 'up', 'right', 'down']\n",
+      "[[4 2]\n",
+      " [3 1]] ['right', 'up', 'left', 'down']\n"
+     ]
+    }
+   ],
+   "source": [
+    "for i in range(4):\n",
+    "    print(a, action)\n",
+    "    print(rl_flip_a, lr_flip_action)\n",
+    "    a = jnp.rot90(a)\n",
+    "    rl_flip_a = jnp.rot90(rl_flip_a)\n",
+    "    action = action_rot90(action)\n",
+    "    lr_flip_action = action_rot90(lr_flip_action)"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2024-06-06T11:22:36.919614600Z",
+     "start_time": "2024-06-06T11:22:36.860704600Z"
+    }
+   },
+   "id": "55e802e0dbcc9c7f"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "Array([[4, 3],\n       [2, 1]], dtype=int32)"
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "jnp.rot90(a, k=2)"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2024-06-06T11:12:48.186719Z",
+     "start_time": "2024-06-06T11:12:48.151161900Z"
+    }
+   },
+   "id": "16f8de3cadaa257a"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "Array([[2, 1],\n       [4, 3]], dtype=int32)"
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# flip left-right\n",
+    "jnp.fliplr(a)"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2024-06-06T11:14:28.668195300Z",
+     "start_time": "2024-06-06T11:14:28.631570500Z"
+    }
+   },
+   "id": "1fffa4e597ab5732"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "ca53c916dcff12ae"
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}