1875 lines
159 KiB
Plaintext
1875 lines
159 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"id": "initial_id",
|
|
"metadata": {
|
|
"collapsed": true,
|
|
"ExecuteTime": {
|
|
"end_time": "2024-06-05T07:40:13.841629100Z",
|
|
"start_time": "2024-06-05T07:40:13.076164500Z"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"initializing\n",
|
|
"initializing finished\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import jax.numpy as jnp\n",
|
|
"\n",
|
|
"from pipeline import Pipeline\n",
|
|
"from algorithm.neat import *\n",
|
|
"from algorithm.neat.gene.node.default_without_response import NodeGeneWithoutResponse\n",
|
|
"\n",
|
|
"from problem.rl_env.jumanji.jumanji_2048 import Jumanji_2048\n",
|
|
"from tensorneat.utils import Act, Agg\n",
|
|
"\n",
|
"def _build_2048_pipeline():\n",
"    \"\"\"Assemble the NEAT pipeline used for the Jumanji 2048 problem.\n",
"\n",
"    Components are created innermost-first (genes, genome, species) and then\n",
"    handed to `NEAT` and `Pipeline`, so the hyper-parameters of each layer can\n",
"    be read on their own instead of inside one deeply nested call.\n",
"    \"\"\"\n",
"    node_gene = NodeGeneWithoutResponse(\n",
"        activation_default=Act.sigmoid,\n",
"        activation_options=(Act.sigmoid, Act.relu, Act.tanh, Act.identity),\n",
"        aggregation_default=Agg.sum,\n",
"        aggregation_options=(Agg.sum,),\n",
"        activation_replace_rate=0.02,\n",
"        aggregation_replace_rate=0.02,\n",
"        bias_mutate_rate=0.03,\n",
"        bias_init_std=0.5,\n",
"        bias_mutate_power=0.2,\n",
"        bias_replace_rate=0.01,\n",
"    )\n",
"    conn_gene = DefaultConnGene(\n",
"        weight_mutate_rate=0.015,\n",
"        weight_replace_rate=0.003,\n",
"        weight_mutate_power=0.5,\n",
"    )\n",
"    mutation = DefaultMutation(node_add=0.1, conn_add=0.2, conn_delete=0.2)\n",
"\n",
"    # NOTE(review): 16 inputs / 4 outputs presumably map to the 16 board cells\n",
"    # and the 4 moves of 2048 -- confirm against Jumanji_2048.\n",
"    genome = DefaultGenome(\n",
"        num_inputs=16,\n",
"        num_outputs=4,\n",
"        max_nodes=100,\n",
"        max_conns=1000,\n",
"        node_gene=node_gene,\n",
"        conn_gene=conn_gene,\n",
"        mutation=mutation,\n",
"    )\n",
"    species = DefaultSpecies(\n",
"        genome=genome,\n",
"        pop_size=1000,\n",
"        species_size=5,\n",
"        survival_threshold=0.1,\n",
"        max_stagnation=7,\n",
"        genome_elitism=3,\n",
"        compatibility_threshold=1.2,\n",
"    )\n",
"\n",
"    return Pipeline(\n",
"        algorithm=NEAT(species=species),\n",
"        problem=Jumanji_2048(max_step=10000, repeat_times=5),\n",
"        generation_limit=100,\n",
"        fitness_target=13000,\n",
"        save_path=\"2048.pkl\",\n",
"    )\n",
"\n",
"\n",
"pipeline = _build_2048_pipeline()\n",
"state = pipeline.setup()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"outputs": [],
|
|
"source": [
|
|
"import numpy as np\n",
|
|
"\n",
|
"# np.load on a .npz returns a lazy NpzFile that keeps the file handle open\n",
"# until it is closed. Read every stored array inside a `with` block so the\n",
"# handle is released, and keep the arrays in a plain dict so `data[...]`\n",
"# lookups keep working after the archive is closed.\n",
"with np.load('2048.npz') as archive:\n",
"    data = {name: archive[name] for name in archive.files}\n",
"\n",
"# Saved node and connection arrays of the genome to replay.\n",
"nodes, conns = data['nodes'], data['conns']"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"ExecuteTime": {
|
|
"end_time": "2024-06-05T07:40:13.932015100Z",
|
|
"start_time": "2024-06-05T07:40:13.876631500Z"
|
|
}
|
|
},
|
|
"id": "a0915ecf8179f347"
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"outputs": [],
|
|
"source": [
|
"# The genome definition is owned by the species inside the pipeline's\n",
"# algorithm; fetch it once so the cells below can call it directly.\n",
"genome = pipeline.algorithm.species.genome\n",
"\n",
"# Convert the loaded node/connection arrays once up front; the result is what\n",
"# gets passed to genome.forward() on every policy call.\n",
"transformed = genome.transform(state, nodes, conns)"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"ExecuteTime": {
|
|
"end_time": "2024-06-05T07:40:14.585804800Z",
|
|
"start_time": "2024-06-05T07:40:14.568805Z"
|
|
}
|
|
},
|
|
"id": "cd1fa65e8a9d6e13"
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"outputs": [],
|
|
"source": [
|
"def policy(board):\n",
"    \"\"\"Run the loaded genome on `board` and return what `genome.forward` yields.\n",
"\n",
"    Relies on the notebook-level `genome`, `state` and `transformed` objects,\n",
"    so the cells that define them must have been executed first.\n",
"\n",
"    NOTE(review): the genome is configured with 16 inputs while the environment\n",
"    prints 4x4 boards -- presumably callers flatten the board before passing it\n",
"    in; confirm at the call site.\n",
"    \"\"\"\n",
"    return genome.forward(state, transformed, board)"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"ExecuteTime": {
|
|
"end_time": "2024-06-05T07:40:15.124383600Z",
|
|
"start_time": "2024-06-05T07:40:15.118384200Z"
|
|
}
|
|
},
|
|
"id": "61bc1895af304651"
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [0, 0, 0, 0],\n",
|
|
" [0, 0, 1, 0],\n",
|
|
" [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(2, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
|
|
" [0, 0, 0, 0],\n",
|
|
" [0, 0, 0, 0],\n",
|
|
" [1, 0, 1, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(2, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [0, 0, 0, 0],\n",
|
|
" [0, 0, 0, 1],\n",
|
|
" [1, 1, 1, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(2, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [1, 0, 0, 0],\n",
|
|
" [0, 0, 0, 0],\n",
|
|
" [1, 1, 1, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(2, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [0, 1, 0, 0],\n",
|
|
" [0, 0, 0, 0],\n",
|
|
" [2, 1, 1, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(4, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
|
|
" [0, 0, 0, 0],\n",
|
|
" [0, 0, 0, 0],\n",
|
|
" [2, 2, 1, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(4, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [0, 2, 0, 0],\n",
|
|
" [0, 1, 0, 0],\n",
|
|
" [2, 2, 1, 1]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(4, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [0, 0, 1, 2],\n",
|
|
" [0, 0, 0, 1],\n",
|
|
" [0, 0, 3, 2]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
|
|
" [0, 0, 0, 2],\n",
|
|
" [0, 0, 1, 1],\n",
|
|
" [0, 0, 3, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [0, 0, 0, 2],\n",
|
|
" [0, 1, 2, 1],\n",
|
|
" [0, 0, 3, 2]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [0, 0, 0, 2],\n",
|
|
" [0, 0, 2, 1],\n",
|
|
" [0, 1, 3, 2]], dtype=int32), action_mask=Array([ True, False, False, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 0],\n",
|
|
" [2, 0, 0, 0],\n",
|
|
" [2, 1, 0, 0],\n",
|
|
" [1, 3, 2, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [1, 0, 0, 1],\n",
|
|
" [3, 2, 0, 0],\n",
|
|
" [1, 3, 2, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
|
|
" [1, 0, 0, 0],\n",
|
|
" [3, 2, 0, 0],\n",
|
|
" [1, 3, 2, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [1, 1, 0, 1],\n",
|
|
" [3, 2, 0, 0],\n",
|
|
" [1, 3, 2, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
|
|
" [1, 1, 0, 0],\n",
|
|
" [3, 2, 0, 0],\n",
|
|
" [1, 3, 2, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [1, 1, 0, 0],\n",
|
|
" [3, 2, 1, 0],\n",
|
|
" [1, 3, 2, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
|
|
" [1, 1, 0, 0],\n",
|
|
" [3, 2, 1, 1],\n",
|
|
" [1, 3, 2, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [1, 2, 0, 0],\n",
|
|
" [3, 2, 1, 1],\n",
|
|
" [1, 3, 2, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [1, 0, 0, 1],\n",
|
|
" [3, 3, 1, 2],\n",
|
|
" [1, 3, 2, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(24., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [1, 0, 1, 0],\n",
|
|
" [3, 0, 1, 1],\n",
|
|
" [1, 4, 2, 3]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [1, 0, 1, 0],\n",
|
|
" [3, 0, 2, 1],\n",
|
|
" [1, 4, 2, 3]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [1, 0, 0, 1],\n",
|
|
" [3, 0, 1, 1],\n",
|
|
" [1, 4, 3, 3]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 2],\n",
|
|
" [1, 0, 0, 0],\n",
|
|
" [3, 0, 1, 2],\n",
|
|
" [1, 4, 3, 3]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [1, 0, 1, 0],\n",
|
|
" [3, 0, 1, 3],\n",
|
|
" [1, 4, 3, 3]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [1, 0, 0, 1],\n",
|
|
" [3, 0, 2, 0],\n",
|
|
" [1, 4, 3, 4]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [1, 0, 0, 1],\n",
|
|
" [3, 0, 2, 1],\n",
|
|
" [1, 4, 3, 4]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
|
|
" [1, 0, 0, 0],\n",
|
|
" [3, 0, 2, 2],\n",
|
|
" [1, 4, 3, 4]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [1, 0, 1, 2],\n",
|
|
" [3, 0, 2, 2],\n",
|
|
" [1, 4, 3, 4]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [1, 1, 1, 0],\n",
|
|
" [3, 0, 2, 3],\n",
|
|
" [1, 4, 3, 4]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
|
|
" [1, 0, 1, 0],\n",
|
|
" [3, 1, 2, 3],\n",
|
|
" [1, 4, 3, 4]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [1, 0, 2, 0],\n",
|
|
" [3, 1, 2, 3],\n",
|
|
" [1, 4, 3, 4]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [1, 1, 0, 1],\n",
|
|
" [3, 1, 3, 3],\n",
|
|
" [1, 4, 3, 4]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [1, 0, 1, 1],\n",
|
|
" [3, 2, 0, 3],\n",
|
|
" [1, 4, 4, 4]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
|
|
" [1, 0, 0, 1],\n",
|
|
" [3, 2, 1, 3],\n",
|
|
" [1, 4, 4, 4]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [2, 0, 0, 1],\n",
|
|
" [3, 2, 1, 3],\n",
|
|
" [1, 4, 4, 4]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [2, 1, 0, 2],\n",
|
|
" [3, 2, 1, 3],\n",
|
|
" [1, 4, 4, 4]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
|
|
" [0, 2, 1, 2],\n",
|
|
" [3, 2, 1, 3],\n",
|
|
" [0, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [0, 1, 0, 2],\n",
|
|
" [1, 3, 2, 3],\n",
|
|
" [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [1, 0, 1, 2],\n",
|
|
" [1, 3, 2, 3],\n",
|
|
" [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
|
|
" [0, 0, 1, 2],\n",
|
|
" [2, 3, 2, 3],\n",
|
|
" [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [1, 1, 1, 2],\n",
|
|
" [2, 3, 2, 3],\n",
|
|
" [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [1, 1, 2, 2],\n",
|
|
" [2, 3, 2, 3],\n",
|
|
" [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [1, 1, 0, 2],\n",
|
|
" [2, 3, 3, 3],\n",
|
|
" [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 0, 1],\n",
|
|
" [0, 0, 2, 2],\n",
|
|
" [0, 2, 3, 4],\n",
|
|
" [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n",
|
|
" [0, 0, 2, 2],\n",
|
|
" [0, 3, 3, 4],\n",
|
|
" [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [1, 0, 2, 2],\n",
|
|
" [1, 3, 3, 4],\n",
|
|
" [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
|
|
" [0, 0, 2, 2],\n",
|
|
" [2, 3, 3, 4],\n",
|
|
" [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n",
|
|
" [0, 1, 2, 2],\n",
|
|
" [2, 3, 3, 4],\n",
|
|
" [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
|
|
" [1, 1, 2, 2],\n",
|
|
" [2, 3, 3, 4],\n",
|
|
" [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n",
|
|
" [1, 2, 2, 2],\n",
|
|
" [2, 3, 3, 4],\n",
|
|
" [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n",
|
|
" [2, 2, 2, 2],\n",
|
|
" [2, 3, 3, 4],\n",
|
|
" [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 1],\n",
|
|
" [1, 2, 2, 2],\n",
|
|
" [3, 3, 3, 4],\n",
|
|
" [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 1],\n",
|
|
" [0, 2, 2, 2],\n",
|
|
" [1, 3, 3, 4],\n",
|
|
" [4, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(28., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 0],\n",
|
|
" [3, 2, 0, 2],\n",
|
|
" [1, 4, 4, 0],\n",
|
|
" [4, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 0],\n",
|
|
" [3, 2, 1, 0],\n",
|
|
" [1, 4, 0, 2],\n",
|
|
" [4, 1, 5, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 1],\n",
|
|
" [3, 2, 0, 0],\n",
|
|
" [1, 4, 1, 2],\n",
|
|
" [4, 1, 5, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 0],\n",
|
|
" [3, 2, 1, 1],\n",
|
|
" [1, 4, 1, 2],\n",
|
|
" [4, 1, 5, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 1, 0],\n",
|
|
" [3, 2, 0, 1],\n",
|
|
" [1, 4, 2, 2],\n",
|
|
" [4, 1, 5, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 2, 0],\n",
|
|
" [3, 2, 1, 1],\n",
|
|
" [1, 4, 2, 2],\n",
|
|
" [4, 1, 5, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(76., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 2],\n",
|
|
" [1, 3, 2, 2],\n",
|
|
" [0, 1, 4, 3],\n",
|
|
" [0, 4, 1, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 1, 0],\n",
|
|
" [0, 3, 2, 3],\n",
|
|
" [0, 1, 4, 3],\n",
|
|
" [1, 4, 1, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 1, 0],\n",
|
|
" [0, 3, 2, 0],\n",
|
|
" [0, 1, 4, 4],\n",
|
|
" [2, 4, 1, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 0],\n",
|
|
" [0, 3, 2, 2],\n",
|
|
" [0, 1, 4, 4],\n",
|
|
" [3, 4, 1, 6]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(40., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 2, 1],\n",
|
|
" [1, 0, 3, 3],\n",
|
|
" [0, 0, 1, 5],\n",
|
|
" [3, 4, 1, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [0, 1, 2, 3],\n",
|
|
" [1, 0, 3, 5],\n",
|
|
" [3, 4, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [0, 2, 2, 3],\n",
|
|
" [1, 1, 3, 5],\n",
|
|
" [3, 4, 2, 6]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [0, 0, 3, 3],\n",
|
|
" [1, 2, 3, 5],\n",
|
|
" [3, 4, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 1],\n",
|
|
" [0, 0, 0, 3],\n",
|
|
" [1, 2, 4, 5],\n",
|
|
" [3, 4, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 2, 1],\n",
|
|
" [0, 0, 1, 3],\n",
|
|
" [1, 2, 4, 5],\n",
|
|
" [3, 4, 2, 6]], dtype=int32), action_mask=Array([ True, False, False, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 0],\n",
|
|
" [1, 3, 0, 1],\n",
|
|
" [1, 2, 4, 5],\n",
|
|
" [3, 4, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 0],\n",
|
|
" [2, 3, 0, 1],\n",
|
|
" [2, 2, 4, 5],\n",
|
|
" [3, 4, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 0],\n",
|
|
" [1, 3, 0, 1],\n",
|
|
" [3, 2, 4, 5],\n",
|
|
" [3, 4, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
|
|
" [1, 3, 1, 1],\n",
|
|
" [1, 2, 4, 5],\n",
|
|
" [4, 4, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
|
|
" [1, 3, 1, 1],\n",
|
|
" [2, 2, 4, 5],\n",
|
|
" [4, 4, 2, 6]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(44., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [1, 1, 3, 2],\n",
|
|
" [0, 3, 4, 5],\n",
|
|
" [0, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
|
|
" [0, 1, 3, 2],\n",
|
|
" [0, 3, 4, 5],\n",
|
|
" [1, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [1, 2, 3, 2],\n",
|
|
" [0, 3, 4, 5],\n",
|
|
" [1, 5, 2, 6]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 1],\n",
|
|
" [0, 2, 3, 2],\n",
|
|
" [0, 3, 4, 5],\n",
|
|
" [2, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 2],\n",
|
|
" [2, 2, 3, 2],\n",
|
|
" [0, 3, 4, 5],\n",
|
|
" [2, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [0, 2, 3, 3],\n",
|
|
" [1, 3, 4, 5],\n",
|
|
" [3, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [1, 0, 2, 4],\n",
|
|
" [1, 3, 4, 5],\n",
|
|
" [3, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [0, 0, 2, 4],\n",
|
|
" [2, 3, 4, 5],\n",
|
|
" [3, 5, 2, 6]], dtype=int32), action_mask=Array([ True, False, False, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 0],\n",
|
|
" [2, 4, 0, 0],\n",
|
|
" [2, 3, 4, 5],\n",
|
|
" [3, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
|
|
" [1, 4, 1, 0],\n",
|
|
" [3, 3, 4, 5],\n",
|
|
" [3, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
|
|
" [1, 4, 1, 0],\n",
|
|
" [1, 3, 4, 5],\n",
|
|
" [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
|
|
" [0, 4, 1, 0],\n",
|
|
" [2, 3, 4, 5],\n",
|
|
" [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 0],\n",
|
|
" [0, 4, 1, 1],\n",
|
|
" [2, 3, 4, 5],\n",
|
|
" [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 2, 0],\n",
|
|
" [1, 4, 1, 1],\n",
|
|
" [2, 3, 4, 5],\n",
|
|
" [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 0, 0],\n",
|
|
" [1, 4, 2, 1],\n",
|
|
" [2, 3, 4, 5],\n",
|
|
" [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 0, 1],\n",
|
|
" [2, 4, 2, 1],\n",
|
|
" [2, 3, 4, 5],\n",
|
|
" [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 0, 0],\n",
|
|
" [0, 4, 2, 2],\n",
|
|
" [3, 3, 4, 5],\n",
|
|
" [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 2, 0],\n",
|
|
" [1, 4, 2, 2],\n",
|
|
" [3, 3, 4, 5],\n",
|
|
" [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 0],\n",
|
|
" [1, 4, 3, 2],\n",
|
|
" [3, 3, 4, 5],\n",
|
|
" [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 0],\n",
|
|
" [1, 4, 3, 2],\n",
|
|
" [4, 4, 5, 2],\n",
|
|
" [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(72., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
|
|
" [2, 1, 3, 0],\n",
|
|
" [1, 5, 5, 3],\n",
|
|
" [5, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(64., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 1],\n",
|
|
" [2, 0, 3, 0],\n",
|
|
" [1, 1, 5, 3],\n",
|
|
" [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 0],\n",
|
|
" [2, 0, 3, 1],\n",
|
|
" [1, 1, 5, 3],\n",
|
|
" [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 1],\n",
|
|
" [2, 2, 3, 1],\n",
|
|
" [1, 1, 5, 3],\n",
|
|
" [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 0],\n",
|
|
" [2, 2, 3, 2],\n",
|
|
" [1, 1, 5, 3],\n",
|
|
" [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 2],\n",
|
|
" [1, 3, 3, 2],\n",
|
|
" [0, 2, 5, 3],\n",
|
|
" [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
|
|
" [0, 3, 3, 3],\n",
|
|
" [1, 2, 5, 3],\n",
|
|
" [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 0],\n",
|
|
" [0, 3, 3, 0],\n",
|
|
" [1, 2, 5, 4],\n",
|
|
" [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 2],\n",
|
|
" [0, 0, 1, 4],\n",
|
|
" [1, 2, 5, 4],\n",
|
|
" [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
|
|
" [0, 0, 1, 2],\n",
|
|
" [1, 2, 5, 5],\n",
|
|
" [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
|
|
" [0, 1, 1, 2],\n",
|
|
" [1, 2, 5, 5],\n",
|
|
" [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [2, 1, 1, 2],\n",
|
|
" [2, 2, 5, 5],\n",
|
|
" [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [1, 1, 1, 2],\n",
|
|
" [3, 2, 5, 5],\n",
|
|
" [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(68., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [0, 1, 2, 2],\n",
|
|
" [1, 3, 2, 6],\n",
|
|
" [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(136., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [0, 1, 0, 0],\n",
|
|
" [1, 3, 2, 2],\n",
|
|
" [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [0, 1, 0, 1],\n",
|
|
" [1, 3, 2, 2],\n",
|
|
" [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [1, 1, 0, 2],\n",
|
|
" [1, 3, 2, 2],\n",
|
|
" [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [0, 1, 0, 1],\n",
|
|
" [2, 3, 2, 3],\n",
|
|
" [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [1, 0, 0, 2],\n",
|
|
" [2, 3, 2, 3],\n",
|
|
" [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
|
|
" [0, 0, 1, 2],\n",
|
|
" [2, 3, 2, 3],\n",
|
|
" [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
|
|
" [1, 0, 1, 2],\n",
|
|
" [2, 3, 2, 3],\n",
|
|
" [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
|
|
" [1, 1, 1, 2],\n",
|
|
" [2, 3, 2, 3],\n",
|
|
" [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [1, 2, 1, 2],\n",
|
|
" [2, 3, 2, 3],\n",
|
|
" [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True, False, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 0],\n",
|
|
" [1, 2, 1, 2],\n",
|
|
" [2, 3, 2, 3],\n",
|
|
" [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 0],\n",
|
|
" [2, 2, 1, 2],\n",
|
|
" [2, 3, 2, 3],\n",
|
|
" [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
|
|
" [0, 2, 2, 2],\n",
|
|
" [3, 3, 2, 3],\n",
|
|
" [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 1],\n",
|
|
" [0, 2, 0, 2],\n",
|
|
" [3, 3, 3, 3],\n",
|
|
" [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 1],\n",
|
|
" [1, 2, 0, 2],\n",
|
|
" [3, 3, 0, 3],\n",
|
|
" [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
|
|
" [1, 2, 2, 2],\n",
|
|
" [3, 3, 1, 3],\n",
|
|
" [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(28., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 2],\n",
|
|
" [0, 1, 2, 3],\n",
|
|
" [0, 4, 1, 3],\n",
|
|
" [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 2, 0],\n",
|
|
" [0, 2, 2, 2],\n",
|
|
" [0, 4, 1, 4],\n",
|
|
" [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [0, 2, 3, 2],\n",
|
|
" [1, 4, 1, 4],\n",
|
|
" [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, False, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
|
|
" [2, 3, 2, 0],\n",
|
|
" [1, 4, 1, 4],\n",
|
|
" [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
|
|
" [0, 2, 3, 2],\n",
|
|
" [1, 4, 1, 4],\n",
|
|
" [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 2],\n",
|
|
" [1, 2, 3, 2],\n",
|
|
" [1, 4, 1, 4],\n",
|
|
" [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [1, 2, 3, 3],\n",
|
|
" [2, 4, 1, 4],\n",
|
|
" [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
|
|
" [0, 1, 2, 4],\n",
|
|
" [2, 4, 1, 4],\n",
|
|
" [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [1, 1, 2, 1],\n",
|
|
" [2, 4, 1, 5],\n",
|
|
" [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
|
|
" [0, 2, 2, 1],\n",
|
|
" [2, 4, 1, 5],\n",
|
|
" [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [0, 2, 3, 1],\n",
|
|
" [2, 4, 1, 5],\n",
|
|
" [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [0, 2, 3, 2],\n",
|
|
" [2, 4, 1, 5],\n",
|
|
" [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, False, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 1, 0],\n",
|
|
" [2, 3, 2, 0],\n",
|
|
" [2, 4, 1, 5],\n",
|
|
" [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 1, 0],\n",
|
|
" [1, 3, 2, 0],\n",
|
|
" [3, 4, 1, 5],\n",
|
|
" [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 1, 0],\n",
|
|
" [2, 3, 2, 0],\n",
|
|
" [3, 4, 1, 5],\n",
|
|
" [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 2],\n",
|
|
" [0, 2, 3, 2],\n",
|
|
" [3, 4, 1, 5],\n",
|
|
" [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [1, 2, 3, 3],\n",
|
|
" [3, 4, 1, 5],\n",
|
|
" [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 0],\n",
|
|
" [1, 2, 4, 0],\n",
|
|
" [3, 4, 1, 5],\n",
|
|
" [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 0],\n",
|
|
" [2, 2, 4, 0],\n",
|
|
" [3, 4, 1, 5],\n",
|
|
" [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 0, 0],\n",
|
|
" [3, 4, 0, 1],\n",
|
|
" [3, 4, 1, 5],\n",
|
|
" [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(48., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [2, 0, 1, 1],\n",
|
|
" [4, 5, 1, 5],\n",
|
|
" [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
|
|
" [2, 0, 0, 1],\n",
|
|
" [4, 5, 2, 5],\n",
|
|
" [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
|
|
" [2, 0, 1, 1],\n",
|
|
" [4, 5, 2, 5],\n",
|
|
" [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [1, 0, 2, 2],\n",
|
|
" [4, 5, 2, 5],\n",
|
|
" [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 0, 1],\n",
|
|
" [1, 0, 0, 2],\n",
|
|
" [4, 5, 3, 5],\n",
|
|
" [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n",
|
|
" [1, 2, 0, 2],\n",
|
|
" [4, 5, 3, 5],\n",
|
|
" [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 0, 1],\n",
|
|
" [2, 2, 0, 2],\n",
|
|
" [4, 5, 3, 5],\n",
|
|
" [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 1],\n",
|
|
" [3, 2, 0, 2],\n",
|
|
" [4, 5, 3, 5],\n",
|
|
" [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
|
|
" [3, 2, 1, 2],\n",
|
|
" [4, 5, 3, 5],\n",
|
|
" [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 2],\n",
|
|
" [3, 2, 1, 2],\n",
|
|
" [4, 5, 3, 5],\n",
|
|
" [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [3, 2, 2, 3],\n",
|
|
" [4, 5, 3, 5],\n",
|
|
" [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 0],\n",
|
|
" [3, 3, 3, 0],\n",
|
|
" [4, 5, 3, 5],\n",
|
|
" [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 0],\n",
|
|
" [3, 3, 1, 0],\n",
|
|
" [4, 5, 4, 5],\n",
|
|
" [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 1, 0],\n",
|
|
" [3, 3, 0, 0],\n",
|
|
" [4, 5, 1, 5],\n",
|
|
" [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 1, 0],\n",
|
|
" [3, 3, 0, 0],\n",
|
|
" [4, 5, 2, 5],\n",
|
|
" [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 1],\n",
|
|
" [3, 3, 1, 0],\n",
|
|
" [4, 5, 2, 5],\n",
|
|
" [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 1, 0],\n",
|
|
" [3, 3, 1, 1],\n",
|
|
" [4, 5, 2, 5],\n",
|
|
" [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 1],\n",
|
|
" [3, 3, 2, 1],\n",
|
|
" [4, 5, 2, 5],\n",
|
|
" [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 1, 0],\n",
|
|
" [3, 3, 0, 2],\n",
|
|
" [4, 5, 3, 5],\n",
|
|
" [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 1],\n",
|
|
" [3, 3, 1, 2],\n",
|
|
" [4, 5, 3, 5],\n",
|
|
" [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 2],\n",
|
|
" [1, 4, 1, 2],\n",
|
|
" [4, 5, 3, 5],\n",
|
|
" [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
|
|
" [1, 4, 2, 3],\n",
|
|
" [4, 5, 3, 5],\n",
|
|
" [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 0, 1],\n",
|
|
" [1, 4, 2, 3],\n",
|
|
" [4, 5, 3, 5],\n",
|
|
" [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 2, 0],\n",
|
|
" [1, 4, 2, 3],\n",
|
|
" [4, 5, 3, 5],\n",
|
|
" [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 1, 0],\n",
|
|
" [1, 4, 3, 3],\n",
|
|
" [4, 5, 3, 5],\n",
|
|
" [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 1],\n",
|
|
" [1, 4, 1, 3],\n",
|
|
" [4, 5, 4, 5],\n",
|
|
" [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 2, 2],\n",
|
|
" [1, 4, 1, 3],\n",
|
|
" [4, 5, 4, 5],\n",
|
|
" [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 0],\n",
|
|
" [1, 4, 1, 3],\n",
|
|
" [4, 5, 4, 5],\n",
|
|
" [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 3, 1, 0],\n",
|
|
" [2, 4, 2, 3],\n",
|
|
" [4, 5, 4, 5],\n",
|
|
" [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 3, 1],\n",
|
|
" [2, 4, 2, 3],\n",
|
|
" [4, 5, 4, 5],\n",
|
|
" [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 1],\n",
|
|
" [2, 4, 2, 3],\n",
|
|
" [4, 5, 4, 5],\n",
|
|
" [5, 6, 5, 7]], dtype=int32), action_mask=Array([False, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 3, 2],\n",
|
|
" [2, 4, 2, 3],\n",
|
|
" [4, 5, 4, 5],\n",
|
|
" [5, 6, 5, 7]], dtype=int32), action_mask=Array([False, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 2, 1],\n",
|
|
" [2, 4, 2, 3],\n",
|
|
" [4, 5, 4, 5],\n",
|
|
" [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, False, True, False], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 0, 1],\n",
|
|
" [3, 4, 3, 3],\n",
|
|
" [4, 5, 4, 5],\n",
|
|
" [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 3, 1],\n",
|
|
" [0, 3, 4, 4],\n",
|
|
" [4, 5, 4, 5],\n",
|
|
" [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 1],\n",
|
|
" [1, 3, 3, 4],\n",
|
|
" [4, 5, 5, 5],\n",
|
|
" [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(64., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 2, 1],\n",
|
|
" [1, 3, 1, 4],\n",
|
|
" [4, 5, 3, 5],\n",
|
|
" [5, 6, 6, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(128., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 1, 0],\n",
|
|
" [1, 3, 1, 4],\n",
|
|
" [4, 5, 3, 5],\n",
|
|
" [5, 7, 7, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 0, 0],\n",
|
|
" [2, 3, 2, 4],\n",
|
|
" [4, 5, 3, 5],\n",
|
|
" [5, 7, 7, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 0],\n",
|
|
" [3, 3, 2, 4],\n",
|
|
" [4, 5, 3, 5],\n",
|
|
" [5, 7, 7, 1]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(272., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 2, 0],\n",
|
|
" [4, 2, 4, 0],\n",
|
|
" [4, 5, 3, 5],\n",
|
|
" [5, 8, 1, 0]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 2, 0],\n",
|
|
" [2, 2, 4, 0],\n",
|
|
" [5, 5, 3, 1],\n",
|
|
" [5, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(64., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 2, 1],\n",
|
|
" [0, 2, 4, 0],\n",
|
|
" [2, 5, 3, 1],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 2, 1],\n",
|
|
" [0, 2, 4, 0],\n",
|
|
" [2, 5, 3, 2],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 2, 0],\n",
|
|
" [1, 2, 4, 1],\n",
|
|
" [2, 5, 3, 2],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 0, 1],\n",
|
|
" [1, 2, 4, 1],\n",
|
|
" [2, 5, 3, 2],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
|
|
" [2, 3, 4, 2],\n",
|
|
" [2, 5, 3, 2],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 1],\n",
|
|
" [0, 3, 4, 0],\n",
|
|
" [3, 5, 3, 3],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 0],\n",
|
|
" [0, 3, 4, 1],\n",
|
|
" [3, 5, 3, 3],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 0, 0],\n",
|
|
" [3, 4, 1, 0],\n",
|
|
" [3, 5, 4, 1],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [2, 4, 1, 0],\n",
|
|
" [4, 5, 4, 1],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [2, 4, 1, 0],\n",
|
|
" [4, 5, 4, 2],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
|
|
" [2, 4, 1, 1],\n",
|
|
" [4, 5, 4, 2],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
|
|
" [0, 2, 4, 2],\n",
|
|
" [4, 5, 4, 2],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(40., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
|
|
" [1, 2, 0, 1],\n",
|
|
" [4, 5, 5, 3],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(64., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [0, 1, 2, 1],\n",
|
|
" [1, 4, 6, 3],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 2, 0],\n",
|
|
" [0, 1, 2, 2],\n",
|
|
" [1, 4, 6, 3],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [1, 1, 3, 2],\n",
|
|
" [1, 4, 6, 3],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
|
|
" [0, 1, 3, 2],\n",
|
|
" [2, 4, 6, 3],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [1, 1, 3, 2],\n",
|
|
" [2, 4, 6, 3],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
|
|
" [2, 3, 2, 1],\n",
|
|
" [2, 4, 6, 3],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 2, 0],\n",
|
|
" [1, 3, 2, 1],\n",
|
|
" [3, 4, 6, 3],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
|
|
" [1, 3, 3, 1],\n",
|
|
" [3, 4, 6, 3],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n",
|
|
" [1, 4, 1, 0],\n",
|
|
" [3, 4, 6, 3],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
|
|
" [2, 0, 1, 1],\n",
|
|
" [3, 5, 6, 3],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [2, 0, 2, 1],\n",
|
|
" [3, 5, 6, 3],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [2, 0, 2, 2],\n",
|
|
" [3, 5, 6, 3],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 1, 0],\n",
|
|
" [3, 2, 0, 0],\n",
|
|
" [3, 5, 6, 3],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 2],\n",
|
|
" [1, 2, 1, 0],\n",
|
|
" [4, 5, 6, 3],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [1, 2, 1, 2],\n",
|
|
" [4, 5, 6, 3],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n",
|
|
" [1, 2, 1, 2],\n",
|
|
" [4, 5, 6, 3],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
|
|
" [2, 2, 1, 2],\n",
|
|
" [4, 5, 6, 3],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 1, 0],\n",
|
|
" [3, 1, 2, 0],\n",
|
|
" [4, 5, 6, 3],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 1],\n",
|
|
" [3, 1, 2, 0],\n",
|
|
" [4, 5, 6, 3],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 0],\n",
|
|
" [3, 2, 2, 1],\n",
|
|
" [4, 5, 6, 3],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 1],\n",
|
|
" [3, 3, 1, 0],\n",
|
|
" [4, 5, 6, 3],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 1, 0],\n",
|
|
" [3, 3, 1, 1],\n",
|
|
" [4, 5, 6, 3],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 1],\n",
|
|
" [3, 3, 2, 1],\n",
|
|
" [4, 5, 6, 3],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 1],\n",
|
|
" [3, 3, 2, 2],\n",
|
|
" [4, 5, 6, 3],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(28., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 1, 0],\n",
|
|
" [4, 3, 0, 0],\n",
|
|
" [4, 5, 6, 3],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 0],\n",
|
|
" [2, 3, 1, 0],\n",
|
|
" [5, 5, 6, 3],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 0],\n",
|
|
" [2, 3, 2, 0],\n",
|
|
" [5, 5, 6, 3],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(64., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 0],\n",
|
|
" [2, 3, 2, 0],\n",
|
|
" [6, 6, 3, 1],\n",
|
|
" [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(136., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
|
|
" [0, 3, 2, 1],\n",
|
|
" [3, 6, 3, 1],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 0],\n",
|
|
" [0, 3, 2, 0],\n",
|
|
" [3, 6, 3, 2],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 0, 1],\n",
|
|
" [3, 2, 0, 0],\n",
|
|
" [3, 6, 3, 2],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [2, 2, 0, 1],\n",
|
|
" [4, 6, 3, 2],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
|
|
" [2, 2, 0, 2],\n",
|
|
" [4, 6, 3, 2],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
|
|
" [2, 2, 1, 0],\n",
|
|
" [4, 6, 3, 3],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(24., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [1, 0, 3, 1],\n",
|
|
" [0, 4, 6, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [0, 1, 3, 2],\n",
|
|
" [1, 4, 6, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
|
|
" [1, 3, 2, 0],\n",
|
|
" [1, 4, 6, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
|
|
" [1, 3, 2, 0],\n",
|
|
" [2, 4, 6, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [2, 3, 2, 1],\n",
|
|
" [2, 4, 6, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [0, 3, 2, 1],\n",
|
|
" [3, 4, 6, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
|
|
" [0, 3, 2, 2],\n",
|
|
" [3, 4, 6, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 0, 0],\n",
|
|
" [1, 3, 2, 2],\n",
|
|
" [3, 4, 6, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 2],\n",
|
|
" [0, 1, 3, 3],\n",
|
|
" [3, 4, 6, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 0, 0],\n",
|
|
" [1, 4, 1, 0],\n",
|
|
" [3, 4, 6, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
|
|
" [2, 2, 1, 0],\n",
|
|
" [3, 5, 6, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n",
|
|
" [0, 0, 3, 1],\n",
|
|
" [3, 5, 6, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
|
|
" [1, 0, 3, 2],\n",
|
|
" [3, 5, 6, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 0, 0],\n",
|
|
" [1, 3, 2, 0],\n",
|
|
" [3, 5, 6, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 0, 1],\n",
|
|
" [2, 3, 2, 0],\n",
|
|
" [3, 5, 6, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 0, 0],\n",
|
|
" [2, 3, 2, 1],\n",
|
|
" [3, 5, 6, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 1, 2],\n",
|
|
" [2, 3, 2, 1],\n",
|
|
" [3, 5, 6, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 0, 1],\n",
|
|
" [2, 3, 2, 1],\n",
|
|
" [3, 5, 6, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 0, 1],\n",
|
|
" [3, 3, 2, 2],\n",
|
|
" [3, 5, 6, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 1],\n",
|
|
" [0, 3, 2, 2],\n",
|
|
" [4, 5, 6, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 0, 0],\n",
|
|
" [3, 3, 1, 0],\n",
|
|
" [4, 5, 6, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(24., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 3],\n",
|
|
" [0, 0, 4, 1],\n",
|
|
" [4, 5, 6, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 0, 0],\n",
|
|
" [4, 1, 1, 0],\n",
|
|
" [4, 5, 6, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 3, 1, 0],\n",
|
|
" [1, 1, 1, 0],\n",
|
|
" [5, 5, 6, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 3, 0, 1],\n",
|
|
" [1, 1, 2, 0],\n",
|
|
" [5, 5, 6, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 3, 0, 1],\n",
|
|
" [1, 1, 2, 1],\n",
|
|
" [5, 5, 6, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 0, 0],\n",
|
|
" [1, 1, 2, 2],\n",
|
|
" [5, 5, 6, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 3, 0, 1],\n",
|
|
" [2, 1, 2, 2],\n",
|
|
" [5, 5, 6, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(72., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 3, 1],\n",
|
|
" [0, 2, 1, 3],\n",
|
|
" [0, 6, 6, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 3, 1],\n",
|
|
" [2, 2, 1, 3],\n",
|
|
" [1, 6, 6, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(136., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 1, 0, 1],\n",
|
|
" [3, 1, 3, 0],\n",
|
|
" [1, 7, 4, 0],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
|
|
" [4, 2, 3, 0],\n",
|
|
" [1, 7, 4, 1],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n",
|
|
" [0, 4, 2, 3],\n",
|
|
" [1, 7, 4, 1],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n",
|
|
" [0, 4, 2, 3],\n",
|
|
" [2, 7, 4, 1],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
|
|
" [1, 4, 2, 3],\n",
|
|
" [2, 7, 4, 1],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 2],\n",
|
|
" [1, 4, 2, 3],\n",
|
|
" [2, 7, 4, 1],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 0, 2],\n",
|
|
" [1, 4, 2, 3],\n",
|
|
" [2, 7, 4, 1],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 0, 2],\n",
|
|
" [2, 4, 2, 3],\n",
|
|
" [2, 7, 4, 1],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 2],\n",
|
|
" [1, 4, 2, 3],\n",
|
|
" [3, 7, 4, 1],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 2, 1],\n",
|
|
" [1, 4, 2, 3],\n",
|
|
" [3, 7, 4, 1],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 1, 1],\n",
|
|
" [1, 4, 3, 3],\n",
|
|
" [3, 7, 4, 1],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([False, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 1, 1],\n",
|
|
" [1, 4, 4, 0],\n",
|
|
" [3, 7, 4, 1],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 2, 0],\n",
|
|
" [1, 4, 1, 0],\n",
|
|
" [3, 7, 5, 2],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 1, 0],\n",
|
|
" [1, 4, 1, 0],\n",
|
|
" [3, 7, 5, 2],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 0, 0],\n",
|
|
" [1, 4, 2, 1],\n",
|
|
" [3, 7, 5, 2],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 3, 2],\n",
|
|
" [1, 4, 2, 1],\n",
|
|
" [3, 7, 5, 2],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 3, 2],\n",
|
|
" [2, 4, 2, 1],\n",
|
|
" [3, 7, 5, 2],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 2, 1],\n",
|
|
" [2, 4, 2, 1],\n",
|
|
" [3, 7, 5, 2],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 0],\n",
|
|
" [2, 4, 3, 2],\n",
|
|
" [3, 7, 5, 2],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 1],\n",
|
|
" [2, 4, 3, 0],\n",
|
|
" [3, 7, 5, 3],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 1],\n",
|
|
" [2, 4, 3, 1],\n",
|
|
" [3, 7, 5, 3],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 1],\n",
|
|
" [2, 4, 3, 2],\n",
|
|
" [3, 7, 5, 3],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([False, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 3, 2],\n",
|
|
" [2, 4, 3, 2],\n",
|
|
" [3, 7, 5, 3],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(24., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 1],\n",
|
|
" [2, 4, 4, 3],\n",
|
|
" [3, 7, 5, 3],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 1],\n",
|
|
" [2, 4, 4, 1],\n",
|
|
" [3, 7, 5, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 1],\n",
|
|
" [2, 4, 4, 2],\n",
|
|
" [3, 7, 5, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 1],\n",
|
|
" [2, 5, 2, 0],\n",
|
|
" [3, 7, 5, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
|
|
" [3, 5, 2, 1],\n",
|
|
" [3, 7, 5, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 0],\n",
|
|
" [0, 5, 2, 2],\n",
|
|
" [4, 7, 5, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 0, 0],\n",
|
|
" [5, 3, 0, 1],\n",
|
|
" [4, 7, 5, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 2],\n",
|
|
" [1, 5, 3, 1],\n",
|
|
" [4, 7, 5, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 1, 0],\n",
|
|
" [1, 5, 3, 1],\n",
|
|
" [4, 7, 5, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 1, 0],\n",
|
|
" [1, 5, 3, 1],\n",
|
|
" [4, 7, 5, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 1, 0],\n",
|
|
" [1, 5, 3, 1],\n",
|
|
" [4, 7, 5, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 1, 0, 1],\n",
|
|
" [1, 5, 3, 1],\n",
|
|
" [4, 7, 5, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 1, 1, 0],\n",
|
|
" [1, 5, 3, 2],\n",
|
|
" [4, 7, 5, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 1, 0],\n",
|
|
" [1, 5, 3, 2],\n",
|
|
" [4, 7, 5, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 2, 1],\n",
|
|
" [1, 5, 3, 2],\n",
|
|
" [4, 7, 5, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(2, dtype=int8), reward=Array(4., dtype=float32), discount=Array(0., dtype=float32), observation=Observation(board=Array([[1, 3, 2, 1],\n",
|
|
" [2, 5, 3, 2],\n",
|
|
" [4, 7, 5, 4],\n",
|
|
" [7, 8, 1, 5]], dtype=int32), action_mask=Array([False, False, False, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n",
|
|
"3716.0\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import jax, jumanji\n",
|
|
"\n",
|
|
"# Play one Game2048 episode with `policy` (defined outside this cell),\n",
|
|
"# never selecting a move that the environment's action mask forbids.\n",
|
|
"env = jumanji.make(\"Game2048-v1\")\n",
|
|
"key = jax.random.PRNGKey(0)\n",
|
|
"jit_reset = jax.jit(env.reset)\n",
|
|
"jit_step = jax.jit(env.step)\n",
|
|
"# Reuse the jitted reset created above instead of wrapping env.reset a second time.\n",
|
|
"state, timestep = jit_reset(key)\n",
|
|
"jit_policy = jax.jit(policy)\n",
|
|
"total_reward = 0\n",
|
|
"while True:\n",
|
|
"    board, action_mask = timestep[\"observation\"]\n",
|
|
"    # Feed the unpacked board, flattened to 1-D, to the policy.\n",
|
|
"    action = jit_policy(board.reshape(-1))\n",
|
|
"    # Masked-out moves score -inf so argmax can only pick an allowed action.\n",
|
|
"    score_with_mask = jnp.where(action_mask, action, -jnp.inf)\n",
|
|
"    action = jnp.argmax(score_with_mask)\n",
|
|
"    state, timestep = jit_step(state, action)\n",
|
|
"    # The loop stops once the new action mask allows no move at all.\n",
|
|
"    done = jnp.all(~timestep[\"observation\"][1])\n",
|
|
"    print(timestep)\n",
|
|
"    total_reward += timestep[\"reward\"]\n",
|
|
"    if done:\n",
|
|
"        break\n",
|
|
"print(total_reward)"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"ExecuteTime": {
|
|
"end_time": "2024-06-05T07:41:33.703431900Z",
|
|
"start_time": "2024-06-05T07:41:26.102578200Z"
|
|
}
|
|
},
|
|
"id": "f166e09c5be1a8fb"
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"outputs": [],
|
|
"source": [
|
|
"import jax.random\n",
|
|
"from problem.rl_env.jumanji.jumanji_2048 import Jumanji_2048\n",
|
|
"\n",
|
|
"\n",
|
|
"def random_policy(state, params, obs):\n",
|
|
"    \"\"\"Baseline policy: return four normally distributed action scores.\n",
|
|
"\n",
|
|
"    The PRNG key is built from ``obs.sum()``, so the scores depend only on\n",
|
|
"    that sum; ``state`` and ``params`` are accepted but not used.\n",
|
|
"    \"\"\"\n",
|
|
"    return jax.random.normal(jax.random.key(obs.sum()), (4,))\n",
|
|
"\n",
|
|
"# Build the Jumanji 2048 problem and a jitted evaluator for the random baseline.\n",
|
|
"problem = Jumanji_2048(\n",
|
|
"    max_step=10000,\n",
|
|
"    repeat_times=10,\n",
|
|
"    guarantee_invalid_action=True,\n",
|
|
")\n",
|
|
"state = problem.setup()\n",
|
|
"\n",
|
|
"\n",
|
|
"def _evaluate_random(state, randkey):\n",
|
|
"    \"\"\"Call ``problem.evaluate`` with ``random_policy`` and ``None`` as params.\"\"\"\n",
|
|
"    return problem.evaluate(state, randkey, random_policy, None)\n",
|
|
"\n",
|
|
"\n",
|
|
"jit_evaluate = jax.jit(_evaluate_random)"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"ExecuteTime": {
|
|
"end_time": "2024-06-05T08:06:59.491563700Z",
|
|
"start_time": "2024-06-05T08:06:59.465404900Z"
|
|
}
|
|
},
|
|
"id": "187326d08ac1eeb4"
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 24,
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"1193.2001\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Evaluate the random baseline once and show the value returned by jit_evaluate.\n",
|
|
"# NOTE(review): `randkey` is not assigned in this cell; presumably it comes from\n",
|
|
"# an earlier cell. Confirm the notebook still runs after Restart & Run All.\n",
|
|
"reward = jit_evaluate(state, randkey)\n",
|
|
"print(reward)"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"ExecuteTime": {
|
|
"end_time": "2024-06-05T08:07:21.630420300Z",
|
|
"start_time": "2024-06-05T08:07:21.107419400Z"
|
|
}
|
|
},
|
|
"id": "4b3506db87568d81"
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 34,
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [0, 0, 0, 0],\n",
|
|
" [0, 0, 0, 0],\n",
|
|
" [0, 1, 0, 1]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(2, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n",
|
|
" [1, 0, 0, 0],\n",
|
|
" [0, 0, 0, 0],\n",
|
|
" [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(2, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [0, 0, 0, 0],\n",
|
|
" [0, 0, 0, 0],\n",
|
|
" [1, 1, 1, 1]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(2, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [0, 1, 0, 0],\n",
|
|
" [0, 0, 0, 0],\n",
|
|
" [2, 2, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(4, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [1, 0, 0, 0],\n",
|
|
" [0, 0, 0, 0],\n",
|
|
" [3, 0, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [1, 0, 0, 0],\n",
|
|
" [0, 0, 0, 0],\n",
|
|
" [3, 1, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
|
|
" [1, 0, 0, 0],\n",
|
|
" [0, 0, 0, 0],\n",
|
|
" [3, 2, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [0, 0, 0, 0],\n",
|
|
" [2, 0, 1, 0],\n",
|
|
" [3, 2, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [0, 0, 0, 0],\n",
|
|
" [2, 1, 0, 0],\n",
|
|
" [3, 2, 0, 1]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n",
|
|
" [0, 0, 0, 0],\n",
|
|
" [2, 1, 0, 0],\n",
|
|
" [3, 2, 1, 0]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 1, 0],\n",
|
|
" [2, 2, 0, 0],\n",
|
|
" [3, 0, 0, 1],\n",
|
|
" [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 1, 1],\n",
|
|
" [2, 2, 0, 0],\n",
|
|
" [3, 0, 0, 0],\n",
|
|
" [0, 1, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
|
|
" [1, 1, 0, 0],\n",
|
|
" [2, 2, 0, 0],\n",
|
|
" [3, 1, 1, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 1, 1],\n",
|
|
" [2, 2, 0, 0],\n",
|
|
" [3, 1, 0, 2],\n",
|
|
" [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [1, 0, 0, 0],\n",
|
|
" [2, 3, 1, 1],\n",
|
|
" [3, 1, 1, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [1, 0, 0, 0],\n",
|
|
" [2, 3, 0, 1],\n",
|
|
" [3, 1, 2, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [0, 0, 2, 1],\n",
|
|
" [0, 2, 3, 1],\n",
|
|
" [0, 3, 1, 3]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [0, 0, 2, 1],\n",
|
|
" [0, 2, 3, 2],\n",
|
|
" [1, 3, 1, 3]], dtype=int32), action_mask=Array([ True, False, False, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 2, 1],\n",
|
|
" [0, 3, 3, 2],\n",
|
|
" [1, 0, 1, 3],\n",
|
|
" [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [0, 0, 2, 1],\n",
|
|
" [1, 2, 3, 2],\n",
|
|
" [2, 3, 1, 3]], dtype=int32), action_mask=Array([ True, False, False, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [2, 1, 1, 0],\n",
|
|
" [1, 2, 3, 2],\n",
|
|
" [2, 3, 1, 3]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [0, 0, 2, 2],\n",
|
|
" [1, 2, 3, 2],\n",
|
|
" [2, 3, 1, 3]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 2, 1],\n",
|
|
" [2, 3, 3, 3],\n",
|
|
" [1, 0, 1, 3],\n",
|
|
" [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 2, 1],\n",
|
|
" [2, 3, 3, 4],\n",
|
|
" [1, 0, 1, 0],\n",
|
|
" [0, 0, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(28., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 3, 1],\n",
|
|
" [0, 2, 4, 4],\n",
|
|
" [0, 0, 0, 2],\n",
|
|
" [1, 0, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [1, 0, 0, 4],\n",
|
|
" [0, 1, 3, 2],\n",
|
|
" [1, 2, 4, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 1, 0],\n",
|
|
" [1, 4, 0, 0],\n",
|
|
" [1, 3, 2, 0],\n",
|
|
" [1, 2, 4, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 2],\n",
|
|
" [0, 0, 1, 4],\n",
|
|
" [0, 1, 3, 2],\n",
|
|
" [1, 2, 4, 1]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 2, 2],\n",
|
|
" [0, 2, 3, 4],\n",
|
|
" [0, 0, 4, 2],\n",
|
|
" [1, 0, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 2],\n",
|
|
" [1, 0, 2, 4],\n",
|
|
" [0, 1, 3, 2],\n",
|
|
" [2, 2, 4, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 2, 2],\n",
|
|
" [2, 2, 3, 4],\n",
|
|
" [0, 1, 4, 2],\n",
|
|
" [0, 0, 0, 1]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 2, 3],\n",
|
|
" [0, 3, 3, 4],\n",
|
|
" [1, 1, 4, 2],\n",
|
|
" [0, 0, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 2, 3],\n",
|
|
" [0, 1, 3, 4],\n",
|
|
" [0, 0, 4, 2],\n",
|
|
" [1, 0, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 2, 3],\n",
|
|
" [1, 1, 3, 4],\n",
|
|
" [0, 0, 4, 2],\n",
|
|
" [0, 0, 0, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 2, 3],\n",
|
|
" [0, 1, 3, 4],\n",
|
|
" [0, 1, 4, 3],\n",
|
|
" [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 2, 3],\n",
|
|
" [0, 2, 3, 4],\n",
|
|
" [0, 0, 4, 3],\n",
|
|
" [0, 0, 1, 0]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 2, 3],\n",
|
|
" [0, 2, 3, 4],\n",
|
|
" [2, 0, 4, 3],\n",
|
|
" [0, 0, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 3, 2, 3],\n",
|
|
" [0, 2, 3, 4],\n",
|
|
" [0, 0, 4, 3],\n",
|
|
" [0, 1, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 3, 2, 3],\n",
|
|
" [0, 2, 3, 4],\n",
|
|
" [0, 1, 4, 3],\n",
|
|
" [1, 0, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n",
|
|
" [2, 3, 4, 0],\n",
|
|
" [1, 4, 3, 0],\n",
|
|
" [2, 0, 0, 0]], dtype=int32), action_mask=Array([False, True, True, False], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n",
|
|
" [0, 2, 3, 4],\n",
|
|
" [0, 1, 4, 3],\n",
|
|
" [0, 0, 1, 2]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n",
|
|
" [2, 3, 4, 0],\n",
|
|
" [1, 4, 3, 0],\n",
|
|
" [1, 2, 1, 0]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n",
|
|
" [2, 3, 4, 0],\n",
|
|
" [2, 4, 3, 1],\n",
|
|
" [0, 2, 1, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n",
|
|
" [2, 3, 4, 0],\n",
|
|
" [2, 4, 3, 1],\n",
|
|
" [2, 1, 1, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n",
|
|
" [2, 3, 4, 0],\n",
|
|
" [2, 4, 3, 1],\n",
|
|
" [2, 2, 0, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n",
|
|
" [0, 2, 3, 4],\n",
|
|
" [2, 4, 3, 1],\n",
|
|
" [0, 1, 2, 3]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n",
|
|
" [2, 3, 4, 0],\n",
|
|
" [2, 4, 3, 1],\n",
|
|
" [1, 2, 3, 1]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n",
|
|
" [1, 2, 3, 4],\n",
|
|
" [2, 4, 3, 1],\n",
|
|
" [1, 2, 3, 1]], dtype=int32), action_mask=Array([ True, False, True, False], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(44., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 0, 0, 1],\n",
|
|
" [1, 3, 0, 1],\n",
|
|
" [2, 4, 4, 4],\n",
|
|
" [1, 2, 4, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 1, 0, 0],\n",
|
|
" [1, 3, 1, 0],\n",
|
|
" [2, 5, 4, 1],\n",
|
|
" [1, 2, 4, 2]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 1, 1, 1],\n",
|
|
" [1, 3, 5, 2],\n",
|
|
" [2, 5, 0, 1],\n",
|
|
" [1, 2, 0, 0]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 4, 1, 2],\n",
|
|
" [1, 3, 5, 2],\n",
|
|
" [1, 2, 5, 1],\n",
|
|
" [0, 0, 1, 2]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 1, 2, 0],\n",
|
|
" [1, 3, 5, 2],\n",
|
|
" [1, 2, 5, 1],\n",
|
|
" [1, 2, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(80., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 1, 2, 2],\n",
|
|
" [2, 3, 6, 2],\n",
|
|
" [1, 3, 0, 0],\n",
|
|
" [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 4, 1, 3],\n",
|
|
" [2, 3, 6, 2],\n",
|
|
" [0, 0, 1, 3],\n",
|
|
" [0, 0, 1, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 3],\n",
|
|
" [2, 3, 6, 2],\n",
|
|
" [0, 0, 1, 3],\n",
|
|
" [0, 0, 0, 2]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 3],\n",
|
|
" [0, 3, 6, 2],\n",
|
|
" [0, 0, 1, 3],\n",
|
|
" [1, 0, 0, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 3],\n",
|
|
" [1, 3, 6, 2],\n",
|
|
" [0, 0, 1, 3],\n",
|
|
" [0, 0, 1, 2]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 3],\n",
|
|
" [1, 3, 6, 2],\n",
|
|
" [1, 3, 1, 0],\n",
|
|
" [1, 2, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 3],\n",
|
|
" [2, 4, 6, 2],\n",
|
|
" [1, 2, 1, 0],\n",
|
|
" [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 5, 1, 3],\n",
|
|
" [2, 2, 6, 2],\n",
|
|
" [2, 0, 1, 0],\n",
|
|
" [0, 0, 1, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [0, 1, 1, 0],\n",
|
|
" [3, 5, 6, 3],\n",
|
|
" [3, 2, 2, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [1, 1, 1, 0],\n",
|
|
" [0, 5, 6, 3],\n",
|
|
" [4, 2, 2, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n",
|
|
" [0, 0, 1, 2],\n",
|
|
" [1, 5, 6, 3],\n",
|
|
" [0, 4, 2, 3]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n",
|
|
" [1, 2, 0, 0],\n",
|
|
" [1, 5, 6, 3],\n",
|
|
" [4, 2, 3, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n",
|
|
" [0, 2, 1, 0],\n",
|
|
" [2, 5, 6, 0],\n",
|
|
" [4, 2, 3, 3]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 1, 1],\n",
|
|
" [4, 5, 6, 3],\n",
|
|
" [0, 2, 3, 0],\n",
|
|
" [0, 0, 1, 0]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 0, 0],\n",
|
|
" [4, 5, 6, 3],\n",
|
|
" [2, 3, 1, 0],\n",
|
|
" [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 6, 3],\n",
|
|
" [4, 5, 1, 0],\n",
|
|
" [2, 3, 1, 0],\n",
|
|
" [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 6, 3],\n",
|
|
" [4, 5, 2, 0],\n",
|
|
" [2, 3, 0, 0],\n",
|
|
" [1, 0, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 6, 3],\n",
|
|
" [4, 5, 2, 0],\n",
|
|
" [2, 3, 1, 0],\n",
|
|
" [2, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 6, 3],\n",
|
|
" [1, 4, 5, 2],\n",
|
|
" [0, 2, 3, 1],\n",
|
|
" [0, 0, 0, 2]], dtype=int32), action_mask=Array([False, False, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 6, 3],\n",
|
|
" [1, 4, 5, 2],\n",
|
|
" [2, 3, 1, 1],\n",
|
|
" [2, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 6, 3],\n",
|
|
" [1, 4, 5, 2],\n",
|
|
" [1, 2, 3, 2],\n",
|
|
" [0, 0, 0, 2]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 6, 3],\n",
|
|
" [1, 4, 5, 2],\n",
|
|
" [1, 2, 3, 2],\n",
|
|
" [2, 0, 1, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 6, 0],\n",
|
|
" [3, 2, 5, 1],\n",
|
|
" [2, 4, 3, 3],\n",
|
|
" [2, 2, 1, 3]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(24., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 6, 0],\n",
|
|
" [0, 2, 5, 1],\n",
|
|
" [3, 4, 3, 1],\n",
|
|
" [3, 2, 1, 4]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 6],\n",
|
|
" [1, 2, 5, 1],\n",
|
|
" [3, 4, 3, 1],\n",
|
|
" [3, 2, 1, 4]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[6, 0, 0, 1],\n",
|
|
" [1, 2, 5, 1],\n",
|
|
" [3, 4, 3, 1],\n",
|
|
" [3, 2, 1, 4]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n",
|
|
" [6, 2, 5, 1],\n",
|
|
" [1, 4, 3, 2],\n",
|
|
" [4, 2, 1, 4]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 2, 0],\n",
|
|
" [6, 2, 5, 1],\n",
|
|
" [1, 4, 3, 2],\n",
|
|
" [4, 2, 1, 4]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 2, 1],\n",
|
|
" [6, 4, 5, 2],\n",
|
|
" [1, 2, 3, 4],\n",
|
|
" [4, 1, 1, 0]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 3, 1],\n",
|
|
" [6, 4, 5, 2],\n",
|
|
" [1, 2, 3, 4],\n",
|
|
" [0, 0, 4, 2]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 3, 1],\n",
|
|
" [6, 4, 5, 2],\n",
|
|
" [1, 2, 3, 4],\n",
|
|
" [0, 1, 4, 2]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 1],\n",
|
|
" [6, 4, 5, 2],\n",
|
|
" [1, 2, 3, 4],\n",
|
|
" [1, 4, 2, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 0],\n",
|
|
" [2, 4, 5, 1],\n",
|
|
" [6, 2, 3, 2],\n",
|
|
" [2, 4, 2, 4]], dtype=int32), action_mask=Array([ True, True, False, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 3, 1],\n",
|
|
" [2, 4, 5, 1],\n",
|
|
" [6, 2, 3, 2],\n",
|
|
" [2, 4, 2, 4]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 3, 1],\n",
|
|
" [2, 4, 5, 1],\n",
|
|
" [6, 2, 3, 2],\n",
|
|
" [2, 4, 2, 4]], dtype=int32), action_mask=Array([ True, False, True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 3, 1],\n",
|
|
" [2, 4, 5, 2],\n",
|
|
" [6, 2, 3, 2],\n",
|
|
" [2, 4, 2, 4]], dtype=int32), action_mask=Array([ True, False, True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"TimeStep(step_type=Array(2, dtype=int8), reward=Array(8., dtype=float32), discount=Array(0., dtype=float32), observation=Observation(board=Array([[1, 2, 3, 1],\n",
|
|
" [2, 4, 5, 3],\n",
|
|
" [6, 2, 3, 4],\n",
|
|
" [2, 4, 2, 1]], dtype=int32), action_mask=Array([False, False, False, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n",
|
|
"636.0\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"randkey = jax.random.PRNGKey(14)\n",
|
|
"jit_policy = jax.jit(random_policy)\n",
|
|
"total_reward = 0\n",
|
|
"state, timestep = jax.jit(env.reset)(randkey )\n",
|
|
"while True:\n",
|
|
" board, action_mask = timestep[\"observation\"]\n",
|
|
" action = jit_policy(None, None, timestep[\"observation\"][0].reshape(-1))\n",
|
|
" score_with_mask = jnp.where(action_mask, action, -jnp.inf)\n",
|
|
" action = jnp.argmax(score_with_mask)\n",
|
|
" state, timestep = jit_step(state, action)\n",
|
|
" done = jnp.all(~timestep[\"observation\"][1])\n",
|
|
" print(timestep)\n",
|
|
" total_reward += timestep[\"reward\"]\n",
|
|
" if done:\n",
|
|
" break\n",
|
|
"print(total_reward)"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"ExecuteTime": {
|
|
"end_time": "2024-06-05T08:09:58.242414600Z",
|
|
"start_time": "2024-06-05T08:09:56.452642800Z"
|
|
}
|
|
},
|
|
"id": "8bb888fb742b6b06"
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [],
|
|
"metadata": {
|
|
"collapsed": false
|
|
},
|
|
"id": "3d1b5c8c646d4f07"
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 2
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython2",
|
|
"version": "2.7.6"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|