Files
tensorneat-mend/tensorneat/examples/jumanji/2048_test.ipynb
wls2002 10ec1c2df9 add jumanji env;
add repeat times for rl_env
2024-06-05 14:24:17 +08:00

1286 lines
109 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2024-06-05T05:07:22.736605400Z",
"start_time": "2024-06-05T05:06:39.100164300Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"initializing\n",
"initializing finished\n",
"start compile\n",
"compile finished, cost time: 18.307454s\n",
"Generation: 1.0, Cost time: 4551.03ms\n",
" \tnode counts: max: 21, min: 21, mean: 21.00\n",
" \tconn counts: max: 20, min: 20, mean: 20.00\n",
" \tspecies: 1, [10000]\n",
" \tfitness: valid cnt: 10000, max: 10124.0000, min: 44.0000, mean: 1758.1263, std: 1212.6823\n",
"Generation: 2.0, Cost time: 4636.33ms\n",
" \tnode counts: max: 22, min: 21, mean: 21.03\n",
" \tconn counts: max: 22, min: 20, mean: 20.05\n",
" \tspecies: 1, [10000]\n",
" \tfitness: valid cnt: 10000, max: 11000.0000, min: 48.0000, mean: 1870.1300, std: 1263.3086\n",
"Generation: 3.0, Cost time: 6271.12ms\n",
" \tnode counts: max: 23, min: 21, mean: 21.03\n",
" \tconn counts: max: 22, min: 20, mean: 20.05\n",
" \tspecies: 1, [10000]\n",
" \tfitness: valid cnt: 10000, max: 14624.0000, min: 28.0000, mean: 1943.9924, std: 1293.7146\n",
"\n",
"Fitness limit reached!\n"
]
}
],
"source": [
"import jax.numpy as jnp\n",
"\n",
"from pipeline import Pipeline\n",
"# NOTE(review): the star import hides where NEAT, DefaultSpecies, DefaultGenome,\n",
"# DefaultNodeGene and DefaultMutation come from; switch to explicit names once\n",
"# it is confirmed that no other cell relies on additional exports.\n",
"from algorithm.neat import *\n",
"\n",
"from problem.rl_env.jumanji.jumanji_2048 import Jumanji_2048\n",
"from utils import Act, Agg\n",
"\n",
"# ---- Experiment settings, gathered in one place so they are easy to tune ----\n",
"NUM_INPUTS = 16  # presumably one input per cell of the 4x4 board - TODO confirm\n",
"NUM_OUTPUTS = 4  # presumably one score per move (the action mask has 4 entries)\n",
"MAX_NODES = 100\n",
"MAX_CONNS = 1000\n",
"NODE_ADD = 0.03\n",
"CONN_ADD = 0.03\n",
"POP_SIZE = 10000\n",
"SPECIES_SIZE = 100\n",
"SURVIVAL_THRESHOLD = 0.01\n",
"MAX_STEP = 1000\n",
"GENERATION_LIMIT = 10000\n",
"FITNESS_TARGET = 13000\n",
"\n",
"# The pipeline is built at cell level instead of under an\n",
"# `if __name__ == ...` guard: the following cells read `pipeline`, `state`\n",
"# and `best` unconditionally, so these names must always be defined.\n",
"pipeline = Pipeline(\n",
"    algorithm=NEAT(\n",
"        species=DefaultSpecies(\n",
"            genome=DefaultGenome(\n",
"                num_inputs=NUM_INPUTS,\n",
"                num_outputs=NUM_OUTPUTS,\n",
"                max_nodes=MAX_NODES,\n",
"                max_conns=MAX_CONNS,\n",
"                node_gene=DefaultNodeGene(\n",
"                    activation_default=Act.sigmoid,\n",
"                    activation_options=(Act.sigmoid, Act.relu, Act.tanh, Act.identity, Act.inv),\n",
"                    aggregation_default=Agg.sum,\n",
"                    aggregation_options=(Agg.sum, Agg.mean, Agg.max, Agg.product),\n",
"                ),\n",
"                mutation=DefaultMutation(\n",
"                    node_add=NODE_ADD,\n",
"                    conn_add=CONN_ADD,\n",
"                ),\n",
"            ),\n",
"            pop_size=POP_SIZE,\n",
"            species_size=SPECIES_SIZE,\n",
"            survival_threshold=SURVIVAL_THRESHOLD,\n",
"        ),\n",
"    ),\n",
"    problem=Jumanji_2048(\n",
"        max_step=MAX_STEP,\n",
"    ),\n",
"    generation_limit=GENERATION_LIMIT,\n",
"    fitness_target=FITNESS_TARGET,\n",
")\n",
"\n",
"# Initialize the pipeline state.\n",
"state = pipeline.setup()\n",
"\n",
"# Run until the pipeline terminates on its own. `best` is unpacked into\n",
"# genome.transform(...) by a later cell.\n",
"state, best = pipeline.auto_run(state)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [],
"source": [
"# Keep a handle on the genome object owned by the pipeline's algorithm\n",
"# (presumably the DefaultGenome configured in the setup cell); the cells\n",
"# below call its transform() and forward() methods.\n",
"genome = pipeline.algorithm.genome"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-05T05:08:14.332101Z",
"start_time": "2024-06-05T05:08:14.324101300Z"
}
},
"id": "a0915ecf8179f347"
},
{
"cell_type": "code",
"execution_count": 4,
"outputs": [],
"source": [
"# Convert the best individual returned by pipeline.auto_run into the\n",
"# representation that genome.forward() takes as its second argument.\n",
"transformed = genome.transform(state, *best)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-05T05:08:49.132030500Z",
"start_time": "2024-06-05T05:08:48.495809200Z"
}
},
"id": "cd1fa65e8a9d6e13"
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [],
"source": [
"def policy(board):\n",
"    \"\"\"Return the evolved network's action scores for a 2048 board.\n",
"\n",
"    The board is flattened before the forward pass, so either the 4x4 grid\n",
"    taken from the environment observation or an already-flat vector can be\n",
"    passed. The genome is configured with num_inputs=16; that forward()\n",
"    expects one flat vector of that length is an assumption - TODO confirm.\n",
"\n",
"    Relies on the notebook-level names genome, state and transformed, which\n",
"    are created by earlier cells.\n",
"\n",
"    Args:\n",
"        board: array-like holding the board cells.\n",
"\n",
"    Returns:\n",
"        Whatever genome.forward returns for the flattened board (used as\n",
"        per-action scores).\n",
"    \"\"\"\n",
"    flat_board = jnp.asarray(board).reshape(-1)\n",
"    return genome.forward(state, transformed, flat_board)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-05T05:09:32.355055100Z",
"start_time": "2024-06-05T05:09:32.350057Z"
}
},
"id": "61bc1895af304651"
},
{
"cell_type": "code",
"execution_count": 11,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 0],\n",
" [0, 0, 0, 0],\n",
" [0, 0, 0, 0],\n",
" [0, 0, 0, 0]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(2, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 2],\n",
" [0, 1, 0, 0],\n",
" [0, 0, 0, 0],\n",
" [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(4, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 2],\n",
" [0, 0, 0, 0],\n",
" [0, 0, 0, 0],\n",
" [0, 0, 0, 0]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(4, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 2, 2],\n",
" [0, 0, 0, 0],\n",
" [0, 0, 0, 0],\n",
" [0, 0, 0, 0]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(4, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 3],\n",
" [0, 0, 0, 0],\n",
" [2, 0, 0, 0],\n",
" [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 1, 3],\n",
" [0, 0, 0, 0],\n",
" [0, 0, 0, 1],\n",
" [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 1, 3],\n",
" [0, 0, 0, 1],\n",
" [0, 0, 0, 0],\n",
" [0, 1, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 1, 3],\n",
" [0, 0, 0, 1],\n",
" [0, 0, 0, 0],\n",
" [0, 0, 2, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 1, 3],\n",
" [0, 0, 2, 1],\n",
" [0, 1, 0, 0],\n",
" [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 1, 3],\n",
" [1, 0, 2, 1],\n",
" [0, 0, 0, 0],\n",
" [0, 0, 0, 0]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 3, 1, 3],\n",
" [0, 1, 2, 1],\n",
" [0, 1, 0, 0],\n",
" [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 3, 1, 3],\n",
" [0, 2, 2, 1],\n",
" [1, 0, 0, 0],\n",
" [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 3],\n",
" [0, 2, 2, 1],\n",
" [0, 0, 0, 0],\n",
" [0, 0, 1, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 3],\n",
" [0, 2, 2, 1],\n",
" [0, 0, 1, 0],\n",
" [0, 0, 2, 0]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 3],\n",
" [0, 0, 3, 1],\n",
" [0, 0, 1, 1],\n",
" [0, 0, 0, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 3],\n",
" [0, 0, 3, 2],\n",
" [0, 0, 1, 2],\n",
" [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 3],\n",
" [0, 0, 3, 3],\n",
" [0, 1, 1, 0],\n",
" [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 4],\n",
" [0, 1, 3, 1],\n",
" [0, 0, 1, 0],\n",
" [0, 0, 0, 0]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 4],\n",
" [0, 1, 3, 1],\n",
" [0, 0, 0, 1],\n",
" [0, 1, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 4],\n",
" [0, 2, 3, 2],\n",
" [0, 0, 0, 0],\n",
" [0, 0, 0, 1]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 4],\n",
" [0, 2, 3, 2],\n",
" [0, 0, 0, 1],\n",
" [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 4],\n",
" [1, 2, 3, 2],\n",
" [0, 0, 0, 1],\n",
" [0, 0, 0, 1]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 4],\n",
" [1, 2, 3, 2],\n",
" [0, 0, 0, 2],\n",
" [0, 0, 0, 1]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 4],\n",
" [1, 2, 3, 3],\n",
" [0, 0, 0, 1],\n",
" [0, 1, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 4],\n",
" [1, 2, 3, 3],\n",
" [0, 1, 0, 1],\n",
" [0, 1, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 4],\n",
" [1, 2, 3, 3],\n",
" [0, 2, 1, 1],\n",
" [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 4],\n",
" [1, 3, 3, 3],\n",
" [0, 0, 1, 1],\n",
" [0, 0, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 4],\n",
" [1, 0, 3, 3],\n",
" [0, 1, 1, 2],\n",
" [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 4],\n",
" [1, 1, 3, 3],\n",
" [0, 0, 1, 2],\n",
" [0, 0, 1, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 4],\n",
" [1, 1, 3, 3],\n",
" [0, 1, 2, 2],\n",
" [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 4],\n",
" [1, 2, 3, 3],\n",
" [0, 1, 2, 2],\n",
" [0, 0, 0, 0]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(24., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 4],\n",
" [1, 1, 2, 4],\n",
" [0, 0, 1, 3],\n",
" [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 5],\n",
" [1, 1, 2, 3],\n",
" [0, 1, 1, 0],\n",
" [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 5],\n",
" [1, 2, 2, 3],\n",
" [0, 0, 1, 0],\n",
" [0, 1, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 5],\n",
" [1, 2, 2, 3],\n",
" [0, 1, 1, 0],\n",
" [0, 0, 1, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 5],\n",
" [1, 2, 2, 3],\n",
" [0, 1, 2, 1],\n",
" [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 5],\n",
" [1, 2, 3, 3],\n",
" [0, 1, 1, 1],\n",
" [0, 0, 0, 0]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 5],\n",
" [1, 1, 2, 4],\n",
" [0, 0, 1, 2],\n",
" [0, 0, 0, 0]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 5],\n",
" [0, 2, 2, 4],\n",
" [0, 0, 1, 2],\n",
" [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 5],\n",
" [1, 2, 2, 4],\n",
" [0, 0, 1, 2],\n",
" [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 5],\n",
" [2, 2, 2, 4],\n",
" [1, 0, 1, 2],\n",
" [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 5],\n",
" [1, 2, 2, 4],\n",
" [0, 0, 1, 2],\n",
" [0, 1, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 5],\n",
" [1, 2, 2, 4],\n",
" [0, 1, 1, 2],\n",
" [0, 0, 1, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 5],\n",
" [1, 2, 2, 4],\n",
" [0, 1, 2, 2],\n",
" [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 5],\n",
" [2, 2, 3, 4],\n",
" [1, 1, 0, 2],\n",
" [0, 0, 0, 0]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 5],\n",
" [0, 3, 3, 4],\n",
" [0, 0, 2, 2],\n",
" [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 5],\n",
" [1, 3, 3, 4],\n",
" [0, 0, 2, 2],\n",
" [0, 0, 0, 1]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(24., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 5],\n",
" [0, 1, 4, 4],\n",
" [0, 1, 0, 3],\n",
" [0, 0, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 5],\n",
" [0, 2, 4, 4],\n",
" [0, 1, 0, 3],\n",
" [0, 0, 0, 1]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 5],\n",
" [0, 0, 2, 5],\n",
" [1, 0, 1, 3],\n",
" [0, 0, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(64., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
" [1, 0, 2, 3],\n",
" [0, 1, 1, 1],\n",
" [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
" [1, 1, 2, 3],\n",
" [0, 0, 1, 1],\n",
" [2, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
" [1, 1, 2, 3],\n",
" [2, 0, 1, 1],\n",
" [0, 2, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
" [1, 1, 2, 3],\n",
" [2, 2, 1, 1],\n",
" [0, 0, 1, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
" [1, 1, 2, 3],\n",
" [2, 2, 2, 1],\n",
" [0, 0, 2, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
" [1, 1, 3, 3],\n",
" [2, 2, 2, 1],\n",
" [0, 0, 0, 2]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(28., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
" [0, 1, 2, 4],\n",
" [0, 2, 3, 1],\n",
" [0, 0, 0, 2]], dtype=int32), action_mask=Array([False, False, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
" [1, 2, 4, 0],\n",
" [2, 3, 1, 1],\n",
" [2, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
" [1, 2, 4, 1],\n",
" [3, 3, 1, 0],\n",
" [0, 0, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
" [1, 2, 4, 2],\n",
" [3, 3, 1, 0],\n",
" [0, 0, 1, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
" [1, 2, 4, 2],\n",
" [3, 3, 2, 0],\n",
" [0, 0, 1, 0]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
" [1, 2, 4, 2],\n",
" [0, 0, 4, 2],\n",
" [0, 1, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(40., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
" [1, 2, 5, 3],\n",
" [0, 1, 1, 1],\n",
" [0, 0, 0, 0]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
" [1, 2, 5, 3],\n",
" [0, 0, 1, 2],\n",
" [0, 0, 0, 1]], dtype=int32), action_mask=Array([False, False, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
" [1, 2, 5, 3],\n",
" [1, 2, 1, 0],\n",
" [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
" [2, 3, 5, 3],\n",
" [1, 1, 1, 0],\n",
" [0, 0, 0, 0]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
" [2, 3, 5, 3],\n",
" [0, 0, 1, 2],\n",
" [0, 1, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
" [2, 3, 5, 3],\n",
" [0, 1, 1, 2],\n",
" [0, 1, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 6],\n",
" [2, 3, 5, 3],\n",
" [1, 2, 1, 2],\n",
" [0, 0, 0, 0]], dtype=int32), action_mask=Array([False, False, True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 0, 0],\n",
" [3, 4, 1, 6],\n",
" [2, 3, 5, 3],\n",
" [1, 2, 1, 2]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 1, 6],\n",
" [2, 4, 5, 3],\n",
" [1, 3, 1, 2],\n",
" [0, 2, 0, 1]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 1, 6],\n",
" [2, 4, 5, 3],\n",
" [1, 3, 1, 2],\n",
" [1, 0, 2, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 1, 6],\n",
" [2, 4, 5, 3],\n",
" [2, 3, 1, 2],\n",
" [0, 1, 2, 1]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 1, 6],\n",
" [3, 4, 5, 3],\n",
" [1, 3, 1, 2],\n",
" [0, 1, 2, 1]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
" [1, 4, 5, 3],\n",
" [0, 3, 1, 2],\n",
" [2, 1, 2, 1]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
" [1, 4, 5, 3],\n",
" [2, 3, 1, 2],\n",
" [1, 1, 2, 1]], dtype=int32), action_mask=Array([False, True, False, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
" [1, 4, 5, 3],\n",
" [2, 3, 1, 2],\n",
" [2, 2, 1, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
" [1, 4, 5, 3],\n",
" [3, 3, 2, 2],\n",
" [1, 2, 0, 1]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(24., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
" [1, 4, 5, 3],\n",
" [0, 1, 4, 3],\n",
" [0, 1, 2, 1]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
" [1, 4, 5, 4],\n",
" [1, 2, 4, 1],\n",
" [0, 0, 2, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
" [2, 4, 5, 4],\n",
" [0, 2, 4, 1],\n",
" [0, 0, 2, 1]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
" [2, 4, 5, 4],\n",
" [0, 2, 4, 2],\n",
" [0, 0, 2, 1]], dtype=int32), action_mask=Array([False, False, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
" [2, 4, 5, 4],\n",
" [2, 4, 2, 0],\n",
" [2, 1, 1, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(40., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
" [3, 5, 5, 4],\n",
" [2, 1, 2, 0],\n",
" [0, 1, 1, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
" [3, 5, 5, 4],\n",
" [2, 2, 2, 0],\n",
" [0, 0, 1, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
" [3, 5, 5, 4],\n",
" [2, 2, 2, 1],\n",
" [0, 2, 1, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
" [3, 5, 5, 4],\n",
" [2, 3, 2, 1],\n",
" [0, 1, 1, 0]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(68., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
" [0, 3, 6, 4],\n",
" [2, 3, 2, 1],\n",
" [0, 1, 0, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
" [2, 4, 6, 4],\n",
" [1, 1, 2, 1],\n",
" [0, 0, 0, 2]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
" [2, 4, 6, 4],\n",
" [0, 2, 2, 1],\n",
" [0, 0, 1, 2]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
" [2, 4, 6, 4],\n",
" [1, 0, 3, 1],\n",
" [0, 0, 1, 2]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
" [2, 4, 6, 4],\n",
" [0, 1, 3, 1],\n",
" [0, 1, 1, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
" [2, 4, 6, 4],\n",
" [0, 2, 3, 1],\n",
" [0, 2, 1, 2]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
" [2, 4, 6, 4],\n",
" [0, 3, 3, 1],\n",
" [1, 0, 1, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
" [2, 4, 6, 4],\n",
" [1, 3, 3, 1],\n",
" [0, 1, 1, 2]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
" [2, 4, 6, 4],\n",
" [2, 1, 4, 1],\n",
" [0, 0, 2, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
" [3, 4, 6, 4],\n",
" [1, 1, 4, 1],\n",
" [0, 0, 2, 2]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
" [3, 4, 6, 4],\n",
" [0, 2, 4, 1],\n",
" [0, 0, 1, 3]], dtype=int32), action_mask=Array([False, False, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
" [3, 4, 6, 4],\n",
" [2, 4, 1, 0],\n",
" [1, 3, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
" [3, 5, 6, 4],\n",
" [2, 3, 1, 1],\n",
" [1, 0, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
" [3, 5, 6, 4],\n",
" [2, 3, 1, 2],\n",
" [1, 0, 0, 1]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
" [3, 5, 6, 4],\n",
" [2, 3, 1, 2],\n",
" [0, 0, 2, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
" [3, 5, 6, 4],\n",
" [2, 3, 1, 3],\n",
" [0, 0, 2, 1]], dtype=int32), action_mask=Array([False, False, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
" [3, 5, 6, 4],\n",
" [2, 3, 1, 3],\n",
" [2, 1, 1, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
" [3, 5, 6, 4],\n",
" [3, 3, 2, 3],\n",
" [0, 1, 2, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(24., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 1, 6],\n",
" [4, 5, 6, 4],\n",
" [1, 3, 3, 3],\n",
" [0, 1, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 2, 1, 6],\n",
" [1, 5, 6, 4],\n",
" [0, 3, 3, 3],\n",
" [2, 1, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 2, 1, 6],\n",
" [1, 5, 6, 4],\n",
" [2, 3, 3, 3],\n",
" [0, 1, 0, 1]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 2, 1, 6],\n",
" [1, 5, 6, 4],\n",
" [0, 2, 3, 4],\n",
" [1, 0, 0, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 2, 1, 6],\n",
" [2, 5, 6, 5],\n",
" [1, 2, 3, 2],\n",
" [0, 0, 0, 0]], dtype=int32), action_mask=Array([False, False, True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
" [5, 2, 1, 6],\n",
" [2, 5, 6, 5],\n",
" [1, 2, 3, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 2, 2, 6],\n",
" [2, 5, 6, 5],\n",
" [1, 2, 3, 2],\n",
" [0, 1, 0, 0]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 5, 3, 6],\n",
" [2, 5, 6, 5],\n",
" [1, 2, 3, 2],\n",
" [0, 1, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(64., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 6, 3, 6],\n",
" [1, 2, 6, 5],\n",
" [0, 1, 3, 2],\n",
" [0, 1, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 6, 3, 6],\n",
" [1, 2, 6, 5],\n",
" [0, 2, 3, 2],\n",
" [0, 0, 1, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 6, 3, 6],\n",
" [1, 3, 6, 5],\n",
" [0, 0, 3, 2],\n",
" [1, 0, 1, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 6, 3, 6],\n",
" [2, 3, 6, 5],\n",
" [1, 0, 3, 2],\n",
" [0, 0, 1, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 6, 3, 6],\n",
" [1, 3, 6, 5],\n",
" [0, 0, 3, 2],\n",
" [1, 0, 1, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 6, 3, 6],\n",
" [2, 3, 6, 5],\n",
" [0, 1, 3, 2],\n",
" [0, 0, 1, 1]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 6, 3, 6],\n",
" [2, 3, 6, 5],\n",
" [0, 1, 3, 2],\n",
" [0, 0, 1, 2]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 6, 3, 6],\n",
" [2, 3, 6, 5],\n",
" [1, 1, 3, 3],\n",
" [0, 0, 1, 0]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 6, 3, 6],\n",
" [2, 3, 6, 5],\n",
" [0, 0, 2, 4],\n",
" [0, 1, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 6, 3, 6],\n",
" [2, 3, 6, 5],\n",
" [1, 1, 2, 4],\n",
" [0, 0, 0, 1]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 6, 3, 6],\n",
" [2, 3, 6, 5],\n",
" [0, 2, 2, 4],\n",
" [1, 0, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 6, 3, 6],\n",
" [2, 3, 6, 5],\n",
" [1, 2, 2, 4],\n",
" [0, 1, 0, 1]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 6, 3, 6],\n",
" [2, 3, 6, 5],\n",
" [2, 1, 3, 4],\n",
" [0, 0, 0, 2]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 6, 3, 6],\n",
" [3, 3, 6, 5],\n",
" [0, 1, 3, 4],\n",
" [0, 0, 1, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
" [0, 3, 6, 5],\n",
" [0, 1, 3, 4],\n",
" [0, 1, 1, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
" [0, 3, 6, 5],\n",
" [0, 2, 3, 4],\n",
" [0, 2, 1, 2]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
" [0, 3, 6, 5],\n",
" [0, 3, 3, 4],\n",
" [1, 0, 1, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
" [1, 4, 6, 5],\n",
" [0, 2, 3, 4],\n",
" [0, 0, 1, 2]], dtype=int32), action_mask=Array([False, False, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
" [1, 4, 6, 5],\n",
" [2, 3, 4, 0],\n",
" [1, 2, 2, 0]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
" [1, 4, 6, 5],\n",
" [0, 2, 3, 4],\n",
" [1, 0, 1, 3]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
" [2, 4, 6, 5],\n",
" [1, 2, 3, 4],\n",
" [0, 0, 1, 3]], dtype=int32), action_mask=Array([False, False, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
" [2, 4, 6, 5],\n",
" [1, 2, 3, 4],\n",
" [1, 3, 1, 0]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
" [2, 4, 6, 5],\n",
" [2, 2, 3, 4],\n",
" [1, 3, 1, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
" [3, 4, 6, 5],\n",
" [1, 2, 3, 4],\n",
" [1, 3, 1, 0]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
" [3, 4, 6, 5],\n",
" [2, 2, 3, 4],\n",
" [0, 3, 1, 1]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
" [3, 4, 6, 5],\n",
" [0, 3, 3, 4],\n",
" [1, 0, 3, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
" [3, 4, 6, 5],\n",
" [1, 3, 4, 4],\n",
" [0, 0, 1, 2]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
" [3, 4, 6, 5],\n",
" [0, 1, 3, 5],\n",
" [0, 1, 1, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(68., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 6],\n",
" [3, 4, 6, 6],\n",
" [0, 2, 3, 2],\n",
" [2, 0, 1, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(128., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 6, 2],\n",
" [2, 2, 3, 0],\n",
" [0, 1, 1, 0]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 6, 2],\n",
" [0, 2, 3, 3],\n",
" [0, 0, 0, 2]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 6, 2],\n",
" [0, 0, 2, 4],\n",
" [0, 1, 0, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 6, 2],\n",
" [0, 1, 2, 4],\n",
" [0, 1, 0, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 6, 2],\n",
" [0, 2, 2, 4],\n",
" [0, 0, 1, 2]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 6, 2],\n",
" [0, 0, 3, 4],\n",
" [0, 1, 1, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 6, 2],\n",
" [0, 1, 3, 4],\n",
" [1, 0, 1, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 6, 2],\n",
" [1, 1, 3, 4],\n",
" [1, 0, 1, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 6, 2],\n",
" [2, 1, 3, 4],\n",
" [1, 0, 1, 2]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 6, 2],\n",
" [2, 1, 3, 4],\n",
" [0, 1, 2, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 6, 2],\n",
" [2, 2, 3, 4],\n",
" [0, 1, 2, 2]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 6, 2],\n",
" [1, 3, 3, 4],\n",
" [0, 0, 1, 3]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 6, 2],\n",
" [0, 1, 4, 4],\n",
" [0, 1, 1, 3]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 6, 2],\n",
" [0, 2, 4, 4],\n",
" [1, 0, 1, 3]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 6, 2],\n",
" [1, 2, 4, 4],\n",
" [0, 1, 1, 3]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 6, 2],\n",
" [2, 1, 2, 5],\n",
" [0, 0, 2, 3]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 6, 2],\n",
" [2, 1, 3, 5],\n",
" [1, 0, 0, 3]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 6, 2],\n",
" [2, 1, 3, 5],\n",
" [0, 1, 1, 3]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 6, 2],\n",
" [2, 2, 3, 5],\n",
" [0, 1, 1, 3]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 6, 2],\n",
" [1, 3, 3, 5],\n",
" [0, 0, 2, 3]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 6, 2],\n",
" [0, 1, 4, 5],\n",
" [0, 2, 2, 3]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 6, 2],\n",
" [0, 1, 4, 5],\n",
" [1, 0, 3, 3]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 6, 2],\n",
" [1, 1, 4, 5],\n",
" [0, 1, 3, 3]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 6, 2],\n",
" [1, 2, 4, 5],\n",
" [0, 1, 3, 3]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 6, 2],\n",
" [1, 2, 4, 5],\n",
" [0, 1, 1, 4]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 6, 2],\n",
" [1, 2, 4, 5],\n",
" [0, 2, 2, 4]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 6, 2],\n",
" [1, 3, 4, 5],\n",
" [1, 0, 2, 4]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 6, 2],\n",
" [2, 3, 4, 5],\n",
" [0, 1, 2, 4]], dtype=int32), action_mask=Array([False, False, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 6, 2],\n",
" [2, 3, 4, 5],\n",
" [1, 2, 4, 1]], dtype=int32), action_mask=Array([ True, False, True, False], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 6, 2],\n",
" [2, 3, 5, 5],\n",
" [1, 2, 1, 1]], dtype=int32), action_mask=Array([False, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(68., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 6, 2],\n",
" [2, 3, 6, 1],\n",
" [1, 2, 2, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(128., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 7, 2],\n",
" [2, 3, 2, 1],\n",
" [1, 2, 2, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 7, 2],\n",
" [2, 3, 3, 1],\n",
" [1, 2, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 7, 2],\n",
" [2, 3, 3, 2],\n",
" [1, 2, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 7, 3],\n",
" [2, 3, 3, 1],\n",
" [1, 2, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 7, 3],\n",
" [2, 3, 3, 2],\n",
" [1, 2, 0, 1]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 7, 3],\n",
" [0, 2, 4, 2],\n",
" [1, 1, 2, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 7, 3],\n",
" [1, 2, 4, 2],\n",
" [1, 1, 2, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 7, 3],\n",
" [2, 2, 4, 2],\n",
" [2, 1, 2, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [3, 4, 7, 3],\n",
" [3, 2, 4, 2],\n",
" [1, 1, 2, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 6, 3, 7],\n",
" [4, 4, 7, 3],\n",
" [1, 2, 4, 2],\n",
" [1, 1, 2, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [2, 4, 7, 3],\n",
" [0, 2, 4, 2],\n",
" [1, 1, 2, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [2, 4, 7, 3],\n",
" [1, 2, 4, 2],\n",
" [1, 1, 2, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [2, 4, 7, 3],\n",
" [2, 2, 4, 2],\n",
" [1, 1, 2, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [3, 4, 7, 3],\n",
" [1, 2, 4, 2],\n",
" [1, 1, 2, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [3, 4, 7, 3],\n",
" [2, 2, 4, 2],\n",
" [1, 1, 2, 1]], dtype=int32), action_mask=Array([False, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [3, 4, 7, 3],\n",
" [3, 4, 2, 0],\n",
" [2, 2, 1, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(48., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 3],\n",
" [2, 2, 2, 1],\n",
" [1, 0, 1, 0]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 3],\n",
" [1, 2, 3, 1],\n",
" [0, 0, 0, 2]], dtype=int32), action_mask=Array([False, False, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 3],\n",
" [1, 2, 3, 1],\n",
" [2, 0, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 3],\n",
" [1, 2, 3, 2],\n",
" [2, 0, 0, 1]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 3],\n",
" [1, 2, 3, 2],\n",
" [1, 0, 2, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 3],\n",
" [2, 2, 3, 2],\n",
" [1, 0, 2, 1]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 3],\n",
" [0, 3, 3, 2],\n",
" [1, 1, 2, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 3],\n",
" [1, 3, 3, 2],\n",
" [1, 1, 2, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 3],\n",
" [2, 3, 3, 2],\n",
" [1, 1, 2, 1]], dtype=int32), action_mask=Array([False, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 3],\n",
" [2, 4, 2, 1],\n",
" [2, 2, 1, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 3],\n",
" [3, 4, 2, 1],\n",
" [0, 2, 1, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 3],\n",
" [3, 4, 2, 2],\n",
" [0, 2, 1, 1]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 3],\n",
" [1, 3, 4, 3],\n",
" [0, 0, 2, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 4],\n",
" [1, 3, 4, 2],\n",
" [0, 0, 2, 1]], dtype=int32), action_mask=Array([False, False, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 4],\n",
" [1, 3, 4, 2],\n",
" [2, 1, 0, 1]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 4],\n",
" [1, 3, 4, 2],\n",
" [1, 0, 2, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 4],\n",
" [2, 3, 4, 3],\n",
" [1, 0, 2, 0]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 4],\n",
" [2, 3, 4, 3],\n",
" [1, 0, 1, 2]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 4],\n",
" [2, 3, 4, 3],\n",
" [1, 0, 2, 2]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 4],\n",
" [2, 3, 4, 3],\n",
" [0, 1, 1, 3]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 4],\n",
" [2, 3, 4, 4],\n",
" [0, 1, 1, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 5],\n",
" [2, 3, 4, 2],\n",
" [0, 1, 1, 1]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 5],\n",
" [2, 3, 4, 2],\n",
" [1, 0, 1, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 5],\n",
" [2, 3, 4, 3],\n",
" [1, 0, 1, 1]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 5],\n",
" [2, 3, 4, 3],\n",
" [1, 0, 1, 2]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 5],\n",
" [2, 3, 4, 3],\n",
" [0, 1, 2, 2]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 5],\n",
" [2, 3, 4, 3],\n",
" [1, 0, 1, 3]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 5],\n",
" [2, 3, 4, 4],\n",
" [1, 0, 1, 1]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 5],\n",
" [1, 2, 3, 5],\n",
" [0, 0, 1, 2]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(64., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 6],\n",
" [1, 2, 3, 2],\n",
" [0, 1, 1, 0]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 6],\n",
" [1, 2, 3, 2],\n",
" [1, 0, 0, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 6],\n",
" [2, 2, 3, 3],\n",
" [0, 0, 0, 1]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(24., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 6],\n",
" [0, 0, 3, 4],\n",
" [0, 1, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 6],\n",
" [0, 1, 3, 4],\n",
" [0, 1, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 6],\n",
" [0, 2, 3, 4],\n",
" [0, 1, 0, 1]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 6],\n",
" [0, 2, 3, 4],\n",
" [0, 0, 1, 2]], dtype=int32), action_mask=Array([False, False, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 6],\n",
" [2, 3, 4, 1],\n",
" [1, 2, 0, 0]], dtype=int32), action_mask=Array([False, True, True, False], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 6],\n",
" [2, 3, 4, 1],\n",
" [0, 1, 1, 2]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 6],\n",
" [2, 3, 4, 1],\n",
" [0, 1, 2, 2]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 6],\n",
" [2, 3, 4, 1],\n",
" [0, 1, 1, 3]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 6],\n",
" [2, 3, 4, 1],\n",
" [1, 0, 2, 3]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 6],\n",
" [2, 3, 4, 1],\n",
" [1, 1, 2, 3]], dtype=int32), action_mask=Array([False, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 6],\n",
" [2, 3, 4, 1],\n",
" [2, 2, 3, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 6],\n",
" [3, 3, 4, 2],\n",
" [1, 2, 3, 0]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 6],\n",
" [0, 4, 4, 2],\n",
" [1, 1, 2, 3]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 6],\n",
" [1, 4, 4, 2],\n",
" [1, 1, 2, 3]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 6],\n",
" [2, 4, 4, 2],\n",
" [1, 1, 2, 3]], dtype=int32), action_mask=Array([False, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 5, 7, 6],\n",
" [2, 5, 2, 0],\n",
" [2, 2, 3, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(72., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 6, 3, 7],\n",
" [4, 6, 7, 6],\n",
" [3, 2, 2, 1],\n",
" [1, 0, 3, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(128., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
" [4, 2, 7, 6],\n",
" [3, 0, 2, 1],\n",
" [1, 0, 3, 2]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
" [4, 2, 7, 6],\n",
" [2, 3, 2, 1],\n",
" [0, 1, 3, 2]], dtype=int32), action_mask=Array([False, False, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
" [4, 2, 7, 6],\n",
" [2, 3, 2, 1],\n",
" [1, 3, 2, 1]], dtype=int32), action_mask=Array([ True, False, True, False], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(28., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
" [4, 2, 7, 6],\n",
" [2, 4, 3, 2],\n",
" [1, 1, 0, 0]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
" [4, 2, 7, 6],\n",
" [2, 4, 3, 2],\n",
" [0, 0, 2, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
" [4, 2, 7, 6],\n",
" [2, 4, 3, 3],\n",
" [1, 0, 2, 0]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
" [4, 2, 7, 6],\n",
" [2, 2, 4, 4],\n",
" [0, 0, 1, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
" [4, 3, 7, 6],\n",
" [2, 0, 4, 4],\n",
" [1, 0, 1, 2]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
" [4, 3, 7, 6],\n",
" [0, 0, 2, 5],\n",
" [0, 2, 2, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
" [4, 3, 7, 6],\n",
" [0, 2, 3, 5],\n",
" [0, 2, 0, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
" [4, 3, 7, 6],\n",
" [0, 3, 3, 5],\n",
" [0, 1, 0, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
" [4, 4, 7, 6],\n",
" [1, 1, 3, 5],\n",
" [0, 0, 0, 2]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
" [0, 5, 7, 6],\n",
" [0, 2, 3, 5],\n",
" [0, 1, 0, 2]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
" [0, 5, 7, 6],\n",
" [1, 2, 3, 5],\n",
" [0, 0, 1, 2]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
" [1, 5, 7, 6],\n",
" [0, 2, 3, 5],\n",
" [1, 0, 1, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
" [2, 5, 7, 6],\n",
" [0, 2, 3, 5],\n",
" [1, 0, 1, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
" [2, 5, 7, 6],\n",
" [1, 2, 3, 5],\n",
" [1, 0, 1, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
" [2, 5, 7, 6],\n",
" [2, 2, 3, 5],\n",
" [0, 1, 1, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
" [3, 5, 7, 6],\n",
" [0, 2, 3, 5],\n",
" [1, 1, 1, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
" [3, 5, 7, 6],\n",
" [1, 2, 3, 5],\n",
" [1, 1, 1, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
" [3, 5, 7, 6],\n",
" [2, 2, 3, 5],\n",
" [1, 1, 1, 2]], dtype=int32), action_mask=Array([False, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
" [3, 5, 7, 6],\n",
" [3, 3, 5, 1],\n",
" [2, 1, 2, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
" [4, 5, 7, 6],\n",
" [2, 3, 5, 1],\n",
" [1, 1, 2, 0]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
" [4, 5, 7, 6],\n",
" [2, 3, 5, 1],\n",
" [1, 0, 2, 2]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
" [4, 5, 7, 6],\n",
" [2, 3, 5, 1],\n",
" [0, 1, 1, 3]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
" [4, 5, 7, 6],\n",
" [2, 3, 5, 1],\n",
" [2, 0, 2, 3]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
" [4, 5, 7, 6],\n",
" [3, 3, 5, 1],\n",
" [1, 0, 2, 3]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
" [4, 5, 7, 6],\n",
" [0, 4, 5, 1],\n",
" [1, 1, 2, 3]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"TimeStep(step_type=Array(2, dtype=int8), reward=Array(0., dtype=float32), discount=Array(0., dtype=float32), observation=Observation(board=Array([[5, 7, 3, 7],\n",
" [4, 5, 7, 6],\n",
" [1, 4, 5, 1],\n",
" [2, 1, 2, 3]], dtype=int32), action_mask=Array([False, False, False, False], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
"3004.0\n"
]
}
],
"source": [
"import jax, jumanji\n",
"\n",
"env = jumanji.make(\"Game2048-v1\")\n",
"key = jax.random.PRNGKey(48)\n",
"jit_reset = jax.jit(env.reset)\n",
"jit_step = jax.jit(env.step)\n",
"state, timestep = jax.jit(env.reset)(key)\n",
"jit_policy = jax.jit(policy)\n",
"total_reward = 0\n",
"while True:\n",
" board, action_mask = timestep[\"observation\"]\n",
" action = jit_policy(timestep[\"observation\"][0].reshape(-1))\n",
" score_with_mask = jnp.where(action_mask, action, -jnp.inf)\n",
" action = jnp.argmax(score_with_mask)\n",
" state, timestep = jit_step(state, action)\n",
" done = jnp.all(~timestep[\"observation\"][1])\n",
" print(timestep)\n",
" total_reward += timestep[\"reward\"]\n",
" if done:\n",
" break\n",
"print(total_reward)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-05T05:15:43.041491500Z",
"start_time": "2024-06-05T05:15:37.325953600Z"
}
},
"id": "f166e09c5be1a8fb"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
},
"id": "187326d08ac1eeb4"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}