{ "cells": [ { "cell_type": "code", "execution_count": 6, "id": "initial_id", "metadata": { "collapsed": true, "ExecuteTime": { "end_time": "2024-06-05T07:40:13.841629100Z", "start_time": "2024-06-05T07:40:13.076164500Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "initializing\n", "initializing finished\n" ] } ], "source": [ "import jax.numpy as jnp\n", "\n", "from pipeline import Pipeline\n", "from algorithm.neat import *\n", "from algorithm.neat.gene.node.default_without_response import NodeGeneWithoutResponse\n", "\n", "from problem.rl_env.jumanji.jumanji_2048 import Jumanji_2048\n", "from tensorneat.utils import Act, Agg\n", "\n", "pipeline = Pipeline(\n", " algorithm=NEAT(\n", " species=DefaultSpecies(\n", " genome=DefaultGenome(\n", " num_inputs=16,\n", " num_outputs=4,\n", " max_nodes=100,\n", " max_conns=1000,\n", " node_gene=NodeGeneWithoutResponse(\n", " activation_default=Act.sigmoid,\n", " activation_options=(\n", " Act.sigmoid,\n", " Act.relu,\n", " Act.tanh,\n", " Act.identity,\n", " ),\n", " aggregation_default=Agg.sum,\n", " aggregation_options=(Agg.sum,),\n", " activation_replace_rate=0.02,\n", " aggregation_replace_rate=0.02,\n", " bias_mutate_rate=0.03,\n", " bias_init_std=0.5,\n", " bias_mutate_power=0.2,\n", " bias_replace_rate=0.01,\n", " ),\n", " conn_gene=DefaultConnGene(\n", " weight_mutate_rate=0.015,\n", " weight_replace_rate=0.003,\n", " weight_mutate_power=0.5,\n", " ),\n", " mutation=DefaultMutation(\n", " node_add=0.1, conn_add=0.2, conn_delete=0.2\n", " ),\n", " ),\n", " pop_size=1000,\n", " species_size=5,\n", " survival_threshold=0.1,\n", " max_stagnation=7,\n", " genome_elitism=3,\n", " compatibility_threshold=1.2,\n", " ),\n", " ),\n", " problem=Jumanji_2048(max_step=10000, repeat_times=5),\n", " generation_limit=100,\n", " fitness_target=13000,\n", " save_path=\"2048.pkl\",\n", ")\n", "state = pipeline.setup()" ] }, { "cell_type": "code", "execution_count": 7, "outputs": [], "source": [ "import numpy as np\n", "\n", "data = np.load('2048.npz')\n", "nodes, conns = data['nodes'], data['conns']" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-05T07:40:13.932015100Z", "start_time": "2024-06-05T07:40:13.876631500Z" } }, "id": "a0915ecf8179f347" }, { "cell_type": "code", "execution_count": 8, "outputs": [], "source": [ "genome = pipeline.algorithm.species.genome\n", "transformed = genome.transform(state, nodes, conns)" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-05T07:40:14.585804800Z", "start_time": "2024-06-05T07:40:14.568805Z" } }, "id": "cd1fa65e8a9d6e13" }, { "cell_type": "code", "execution_count": 9, "outputs": [], "source": [ "def policy(board):\n", " action_scores = genome.forward(state, transformed, board)\n", " return action_scores" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-05T07:40:15.124383600Z", "start_time": "2024-06-05T07:40:15.118384200Z" } }, "id": "61bc1895af304651" }, { "cell_type": "code", "execution_count": 14, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [0, 0, 0, 0],\n", " [0, 0, 1, 0],\n", " [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(2, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n", " [0, 0, 0, 0],\n", " [0, 0, 0, 0],\n", " [1, 0, 1, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(2, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [0, 0, 0, 0],\n", " [0, 0, 0, 1],\n", " [1, 1, 1, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(2, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [1, 0, 0, 0],\n", " [0, 0, 0, 0],\n", " [1, 1, 1, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(2, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [0, 1, 0, 0],\n", " [0, 0, 0, 0],\n", " [2, 1, 1, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(4, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n", " [0, 0, 0, 0],\n", " [0, 0, 0, 0],\n", " [2, 2, 1, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(4, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [0, 2, 0, 0],\n", " [0, 1, 0, 0],\n", " [2, 2, 1, 1]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(4, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [0, 0, 1, 2],\n", " [0, 0, 0, 1],\n", " [0, 0, 3, 2]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n", " [0, 0, 0, 2],\n", " [0, 0, 1, 1],\n", " [0, 0, 3, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [0, 0, 0, 2],\n", " [0, 1, 2, 1],\n", " [0, 0, 3, 2]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [0, 0, 0, 2],\n", " [0, 0, 2, 1],\n", " [0, 1, 3, 2]], dtype=int32), action_mask=Array([ True, False, False, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 0],\n", " [2, 0, 0, 0],\n", " [2, 1, 0, 0],\n", " [1, 3, 2, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [1, 0, 0, 1],\n", " [3, 2, 0, 0],\n", " [1, 3, 2, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n", " [1, 0, 0, 0],\n", " [3, 2, 0, 0],\n", " [1, 3, 2, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [1, 1, 0, 1],\n", " [3, 2, 0, 0],\n", " [1, 3, 2, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n", " [1, 1, 0, 0],\n", " [3, 2, 0, 0],\n", " [1, 3, 2, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [1, 1, 0, 0],\n", " [3, 2, 1, 0],\n", " [1, 3, 2, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n", " [1, 1, 0, 0],\n", " [3, 2, 1, 1],\n", " [1, 3, 2, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [1, 2, 0, 0],\n", " [3, 2, 1, 1],\n", " [1, 3, 2, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [1, 0, 0, 1],\n", " [3, 3, 1, 2],\n", " [1, 3, 2, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(24., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [1, 0, 1, 0],\n", " [3, 0, 1, 1],\n", " [1, 4, 2, 3]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [1, 0, 1, 0],\n", " [3, 0, 2, 1],\n", " [1, 4, 2, 3]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [1, 0, 0, 1],\n", " [3, 0, 1, 1],\n", " [1, 4, 3, 3]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 2],\n", " [1, 0, 0, 0],\n", " [3, 0, 1, 2],\n", " [1, 4, 3, 3]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [1, 0, 1, 0],\n", " [3, 0, 1, 3],\n", " [1, 4, 3, 3]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [1, 0, 0, 1],\n", " [3, 0, 2, 0],\n", " [1, 4, 3, 4]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [1, 0, 0, 1],\n", " [3, 0, 2, 1],\n", " [1, 4, 3, 4]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n", " [1, 0, 0, 0],\n", " [3, 0, 2, 2],\n", " [1, 4, 3, 4]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [1, 0, 1, 2],\n", " [3, 0, 2, 2],\n", " [1, 4, 3, 4]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [1, 1, 1, 0],\n", " [3, 0, 2, 3],\n", " [1, 4, 3, 4]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n", " [1, 0, 1, 0],\n", " [3, 1, 2, 3],\n", " [1, 4, 3, 4]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [1, 0, 2, 0],\n", " [3, 1, 2, 3],\n", " [1, 4, 3, 4]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [1, 1, 0, 1],\n", " [3, 1, 3, 3],\n", " [1, 4, 3, 4]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [1, 0, 1, 1],\n", " [3, 2, 0, 3],\n", " [1, 4, 4, 4]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n", " [1, 0, 0, 1],\n", " [3, 2, 1, 3],\n", " [1, 4, 4, 4]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [2, 0, 0, 1],\n", " [3, 2, 1, 3],\n", " [1, 4, 4, 4]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [2, 1, 0, 2],\n", " [3, 2, 1, 3],\n", " [1, 4, 4, 4]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n", " [0, 2, 1, 2],\n", " [3, 2, 1, 3],\n", " [0, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [0, 1, 0, 2],\n", " [1, 3, 2, 3],\n", " [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [1, 0, 1, 2],\n", " [1, 3, 2, 3],\n", " [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n", " [0, 0, 1, 2],\n", " [2, 3, 2, 3],\n", " [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [1, 1, 1, 2],\n", " [2, 3, 2, 3],\n", " [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [1, 1, 2, 2],\n", " [2, 3, 2, 3],\n", " [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [1, 1, 0, 2],\n", " [2, 3, 3, 3],\n", " [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 0, 1],\n", " [0, 0, 2, 2],\n", " [0, 2, 3, 4],\n", " [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n", " [0, 0, 2, 2],\n", " [0, 3, 3, 4],\n", " [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [1, 0, 2, 2],\n", " [1, 3, 3, 4],\n", " [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n", " [0, 0, 2, 2],\n", " [2, 3, 3, 4],\n", " [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n", " [0, 1, 2, 2],\n", " [2, 3, 3, 4],\n", " [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n", " [1, 1, 2, 2],\n", " [2, 3, 3, 4],\n", " [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n", " [1, 2, 2, 2],\n", " [2, 3, 3, 4],\n", " [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n", " [2, 2, 2, 2],\n", " [2, 3, 3, 4],\n", " [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 1],\n", " [1, 2, 2, 2],\n", " [3, 3, 3, 4],\n", " [3, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 1],\n", " [0, 2, 2, 2],\n", " [1, 3, 3, 4],\n", " [4, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(28., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 0],\n", " [3, 2, 0, 2],\n", " [1, 4, 4, 0],\n", " [4, 1, 4, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 0],\n", " [3, 2, 1, 0],\n", " [1, 4, 0, 2],\n", " [4, 1, 5, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 1],\n", " [3, 2, 0, 0],\n", " [1, 4, 1, 2],\n", " [4, 1, 5, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 0],\n", " [3, 2, 1, 1],\n", " [1, 4, 1, 2],\n", " [4, 1, 5, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 1, 0],\n", " [3, 2, 0, 1],\n", " [1, 4, 2, 2],\n", " [4, 1, 5, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 2, 0],\n", " [3, 2, 1, 1],\n", " [1, 4, 2, 2],\n", " [4, 1, 5, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(76., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 2],\n", " [1, 3, 2, 2],\n", " [0, 1, 4, 3],\n", " [0, 4, 1, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 1, 0],\n", " [0, 3, 2, 3],\n", " [0, 1, 4, 3],\n", " [1, 4, 1, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 1, 0],\n", " [0, 3, 2, 0],\n", " [0, 1, 4, 4],\n", " [2, 4, 1, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 0],\n", " [0, 3, 2, 2],\n", " [0, 1, 4, 4],\n", " [3, 4, 1, 6]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(40., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 2, 1],\n", " [1, 0, 3, 3],\n", " [0, 0, 1, 5],\n", " [3, 4, 1, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [0, 1, 2, 3],\n", " [1, 0, 3, 5],\n", " [3, 4, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [0, 2, 2, 3],\n", " [1, 1, 3, 5],\n", " [3, 4, 2, 6]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [0, 0, 3, 3],\n", " [1, 2, 3, 5],\n", " [3, 4, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 1],\n", " [0, 0, 0, 3],\n", " [1, 2, 4, 5],\n", " [3, 4, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 2, 1],\n", " [0, 0, 1, 3],\n", " [1, 2, 4, 5],\n", " [3, 4, 2, 6]], dtype=int32), action_mask=Array([ True, False, False, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 0],\n", " [1, 3, 0, 1],\n", " [1, 2, 4, 5],\n", " [3, 4, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 0],\n", " [2, 3, 0, 1],\n", " [2, 2, 4, 5],\n", " [3, 4, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 0],\n", " [1, 3, 0, 1],\n", " [3, 2, 4, 5],\n", " [3, 4, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n", " [1, 3, 1, 1],\n", " [1, 2, 4, 5],\n", " [4, 4, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n", " [1, 3, 1, 1],\n", " [2, 2, 4, 5],\n", " [4, 4, 2, 6]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(44., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [1, 1, 3, 2],\n", " [0, 3, 4, 5],\n", " [0, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n", " [0, 1, 3, 2],\n", " [0, 3, 4, 5],\n", " [1, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [1, 2, 3, 2],\n", " [0, 3, 4, 5],\n", " [1, 5, 2, 6]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 1],\n", " [0, 2, 3, 2],\n", " [0, 3, 4, 5],\n", " [2, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 2],\n", " [2, 2, 3, 2],\n", " [0, 3, 4, 5],\n", " [2, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [0, 2, 3, 3],\n", " [1, 3, 4, 5],\n", " [3, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [1, 0, 2, 4],\n", " [1, 3, 4, 5],\n", " [3, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [0, 0, 2, 4],\n", " [2, 3, 4, 5],\n", " [3, 5, 2, 6]], dtype=int32), action_mask=Array([ True, False, False, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 0],\n", " [2, 4, 0, 0],\n", " [2, 3, 4, 5],\n", " [3, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n", " [1, 4, 1, 0],\n", " [3, 3, 4, 5],\n", " [3, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n", " [1, 4, 1, 0],\n", " [1, 3, 4, 5],\n", " [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n", " [0, 4, 1, 0],\n", " [2, 3, 4, 5],\n", " [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 0],\n", " [0, 4, 1, 1],\n", " [2, 3, 4, 5],\n", " [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 2, 0],\n", " [1, 4, 1, 1],\n", " [2, 3, 4, 5],\n", " [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 0, 0],\n", " [1, 4, 2, 1],\n", " [2, 3, 4, 5],\n", " [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 0, 1],\n", " [2, 4, 2, 1],\n", " [2, 3, 4, 5],\n", " [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 0, 0],\n", " [0, 4, 2, 2],\n", " [3, 3, 4, 5],\n", " [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 2, 0],\n", " [1, 4, 2, 2],\n", " [3, 3, 4, 5],\n", " [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 0],\n", " [1, 4, 3, 2],\n", " [3, 3, 4, 5],\n", " [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 0],\n", " [1, 4, 3, 2],\n", " [4, 4, 5, 2],\n", " [4, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(72., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n", " [2, 1, 3, 0],\n", " [1, 5, 5, 3],\n", " [5, 5, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(64., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 1],\n", " [2, 0, 3, 0],\n", " [1, 1, 5, 3],\n", " [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 0],\n", " [2, 0, 3, 1],\n", " [1, 1, 5, 3],\n", " [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 1],\n", " [2, 2, 3, 1],\n", " [1, 1, 5, 3],\n", " [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 0],\n", " [2, 2, 3, 2],\n", " [1, 1, 5, 3],\n", " [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 2],\n", " [1, 3, 3, 2],\n", " [0, 2, 5, 3],\n", " [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n", " [0, 3, 3, 3],\n", " [1, 2, 5, 3],\n", " [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 0],\n", " [0, 3, 3, 0],\n", " [1, 2, 5, 4],\n", " [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 2],\n", " [0, 0, 1, 4],\n", " [1, 2, 5, 4],\n", " [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n", " [0, 0, 1, 2],\n", " [1, 2, 5, 5],\n", " [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n", " [0, 1, 1, 2],\n", " [1, 2, 5, 5],\n", " [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [2, 1, 1, 2],\n", " [2, 2, 5, 5],\n", " [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [1, 1, 1, 2],\n", " [3, 2, 5, 5],\n", " [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(68., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [0, 1, 2, 2],\n", " [1, 3, 2, 6],\n", " [5, 6, 2, 6]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(136., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [0, 1, 0, 0],\n", " [1, 3, 2, 2],\n", " [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [0, 1, 0, 1],\n", " [1, 3, 2, 2],\n", " [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [1, 1, 0, 2],\n", " [1, 3, 2, 2],\n", " [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [0, 1, 0, 1],\n", " [2, 3, 2, 3],\n", " [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [1, 0, 0, 2],\n", " [2, 3, 2, 3],\n", " [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n", " [0, 0, 1, 2],\n", " [2, 3, 2, 3],\n", " [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n", " [1, 0, 1, 2],\n", " [2, 3, 2, 3],\n", " [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n", " [1, 1, 1, 2],\n", " [2, 3, 2, 3],\n", " [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [1, 2, 1, 2],\n", " [2, 3, 2, 3],\n", " [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True, False, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 0],\n", " [1, 2, 1, 2],\n", " [2, 3, 2, 3],\n", " [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 0],\n", " [2, 2, 1, 2],\n", " [2, 3, 2, 3],\n", " [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n", " [0, 2, 2, 2],\n", " [3, 3, 2, 3],\n", " [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 1],\n", " [0, 2, 0, 2],\n", " [3, 3, 3, 3],\n", " [5, 6, 3, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 1],\n", " [1, 2, 0, 2],\n", " [3, 3, 0, 3],\n", " [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n", " [1, 2, 2, 2],\n", " [3, 3, 1, 3],\n", " [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(28., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 2],\n", " [0, 1, 2, 3],\n", " [0, 4, 1, 3],\n", " [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 2, 0],\n", " [0, 2, 2, 2],\n", " [0, 4, 1, 4],\n", " [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [0, 2, 3, 2],\n", " [1, 4, 1, 4],\n", " [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, False, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n", " [2, 3, 2, 0],\n", " [1, 4, 1, 4],\n", " [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n", " [0, 2, 3, 2],\n", " [1, 4, 1, 4],\n", " [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 2],\n", " [1, 2, 3, 2],\n", " [1, 4, 1, 4],\n", " [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [1, 2, 3, 3],\n", " [2, 4, 1, 4],\n", " [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n", " [0, 1, 2, 4],\n", " [2, 4, 1, 4],\n", " [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [1, 1, 2, 1],\n", " [2, 4, 1, 5],\n", " [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n", " [0, 2, 2, 1],\n", " [2, 4, 1, 5],\n", " [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [0, 2, 3, 1],\n", " [2, 4, 1, 5],\n", " [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [0, 2, 3, 2],\n", " [2, 4, 1, 5],\n", " [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, False, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 1, 0],\n", " [2, 3, 2, 0],\n", " [2, 4, 1, 5],\n", " [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 1, 0],\n", " [1, 3, 2, 0],\n", " [3, 4, 1, 5],\n", " [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 1, 0],\n", " [2, 3, 2, 0],\n", " [3, 4, 1, 5],\n", " [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 2],\n", " [0, 2, 3, 2],\n", " [3, 4, 1, 5],\n", " [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [1, 2, 3, 3],\n", " [3, 4, 1, 5],\n", " [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 0],\n", " [1, 2, 4, 0],\n", " [3, 4, 1, 5],\n", " [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 0],\n", " [2, 2, 4, 0],\n", " [3, 4, 1, 5],\n", " [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 0, 0],\n", " [3, 4, 0, 1],\n", " [3, 4, 1, 5],\n", " [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(48., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [2, 0, 1, 1],\n", " [4, 5, 1, 5],\n", " [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n", " [2, 0, 0, 1],\n", " [4, 5, 2, 5],\n", " [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n", " [2, 0, 1, 1],\n", " [4, 5, 2, 5],\n", " [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [1, 0, 2, 2],\n", " [4, 5, 2, 5],\n", " [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 0, 1],\n", " [1, 0, 0, 2],\n", " [4, 5, 3, 5],\n", " [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n", " [1, 2, 0, 2],\n", " [4, 5, 3, 5],\n", " [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 0, 1],\n", " [2, 2, 0, 2],\n", " [4, 5, 3, 5],\n", " [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 1],\n", " [3, 2, 0, 2],\n", " [4, 5, 3, 5],\n", " [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n", " [3, 2, 1, 2],\n", " [4, 5, 3, 5],\n", " [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 2],\n", " [3, 2, 1, 2],\n", " [4, 5, 3, 5],\n", " [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [3, 2, 2, 3],\n", " [4, 5, 3, 5],\n", " [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 0],\n", " [3, 3, 3, 0],\n", " [4, 5, 3, 5],\n", " [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 0],\n", " [3, 3, 1, 0],\n", " [4, 5, 4, 5],\n", " [5, 6, 4, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 1, 0],\n", " [3, 3, 0, 0],\n", " [4, 5, 1, 5],\n", " [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 1, 0],\n", " [3, 3, 0, 0],\n", " [4, 5, 2, 5],\n", " [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 1],\n", " [3, 3, 1, 0],\n", " [4, 5, 2, 5],\n", " [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 1, 0],\n", " [3, 3, 1, 1],\n", " [4, 5, 2, 5],\n", " [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 1],\n", " [3, 3, 2, 1],\n", " [4, 5, 2, 5],\n", " [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 1, 0],\n", " [3, 3, 0, 2],\n", " [4, 5, 3, 5],\n", " [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 1],\n", " [3, 3, 1, 2],\n", " [4, 5, 3, 5],\n", " [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 2],\n", " [1, 4, 1, 2],\n", " [4, 5, 3, 5],\n", " [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n", " [1, 4, 2, 3],\n", " [4, 5, 3, 5],\n", " [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 0, 1],\n", " [1, 4, 2, 3],\n", " [4, 5, 3, 5],\n", " [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 2, 0],\n", " [1, 4, 2, 3],\n", " [4, 5, 3, 5],\n", " [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 1, 0],\n", " [1, 4, 3, 3],\n", " [4, 5, 3, 5],\n", " [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 1],\n", " [1, 4, 1, 3],\n", " [4, 5, 4, 5],\n", " [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 2, 2],\n", " [1, 4, 1, 3],\n", " [4, 5, 4, 5],\n", " [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 0],\n", " [1, 4, 1, 3],\n", " [4, 5, 4, 5],\n", " [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 3, 1, 0],\n", " [2, 4, 2, 3],\n", " [4, 5, 4, 5],\n", " [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 3, 1],\n", " [2, 4, 2, 3],\n", " [4, 5, 4, 5],\n", " [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 1],\n", " [2, 4, 2, 3],\n", " [4, 5, 4, 5],\n", " [5, 6, 5, 7]], dtype=int32), action_mask=Array([False, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 3, 2],\n", " [2, 4, 2, 3],\n", " [4, 5, 4, 5],\n", " [5, 6, 5, 7]], dtype=int32), action_mask=Array([False, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 2, 1],\n", " [2, 4, 2, 3],\n", " [4, 5, 4, 5],\n", " [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, False, True, False], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 0, 1],\n", " [3, 4, 3, 3],\n", " [4, 5, 4, 5],\n", " [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 3, 1],\n", " [0, 3, 4, 4],\n", " [4, 5, 4, 5],\n", " [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 1],\n", " [1, 3, 3, 4],\n", " [4, 5, 5, 5],\n", " [5, 6, 5, 7]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(64., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 2, 1],\n", " [1, 3, 1, 4],\n", " [4, 5, 3, 5],\n", " [5, 6, 6, 7]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(128., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 1, 0],\n", " [1, 3, 1, 4],\n", " [4, 5, 3, 5],\n", " [5, 7, 7, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 0, 0],\n", " [2, 3, 2, 4],\n", " [4, 5, 3, 5],\n", " [5, 7, 7, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 0],\n", " [3, 3, 2, 4],\n", " [4, 5, 3, 5],\n", " [5, 7, 7, 1]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(128, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(272., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 2, 0],\n", " [4, 2, 4, 0],\n", " [4, 5, 3, 5],\n", " [5, 8, 1, 0]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 2, 0],\n", " [2, 2, 4, 0],\n", " [5, 5, 3, 1],\n", " [5, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(64., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 2, 1],\n", " [0, 2, 4, 0],\n", " [2, 5, 3, 1],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 2, 1],\n", " [0, 2, 4, 0],\n", " [2, 5, 3, 2],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 2, 0],\n", " [1, 2, 4, 1],\n", " [2, 5, 3, 2],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 0, 1],\n", " [1, 2, 4, 1],\n", " [2, 5, 3, 2],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n", " [2, 3, 4, 2],\n", " [2, 5, 3, 2],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 1],\n", " [0, 3, 4, 0],\n", " [3, 5, 3, 3],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 0],\n", " [0, 3, 4, 1],\n", " [3, 5, 3, 3],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 0, 0],\n", " [3, 4, 1, 0],\n", " [3, 5, 4, 1],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [2, 4, 1, 0],\n", " [4, 5, 4, 1],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [2, 4, 1, 0],\n", " [4, 5, 4, 2],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n", " [2, 4, 1, 1],\n", " [4, 5, 4, 2],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n", " [0, 2, 4, 2],\n", " [4, 5, 4, 2],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(40., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n", " [1, 2, 0, 1],\n", " [4, 5, 5, 3],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(64., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [0, 1, 2, 1],\n", " [1, 4, 6, 3],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 2, 0],\n", " [0, 1, 2, 2],\n", " [1, 4, 6, 3],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [1, 1, 3, 2],\n", " [1, 4, 6, 3],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n", " [0, 1, 3, 2],\n", " [2, 4, 6, 3],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [1, 1, 3, 2],\n", " [2, 4, 6, 3],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n", " [2, 3, 2, 1],\n", " [2, 4, 6, 3],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 2, 0],\n", " [1, 3, 2, 1],\n", " [3, 4, 6, 3],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n", " [1, 3, 3, 1],\n", " [3, 4, 6, 3],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n", " [1, 4, 1, 0],\n", " [3, 4, 6, 3],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n", " [2, 0, 1, 1],\n", " [3, 5, 6, 3],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [2, 0, 2, 1],\n", " [3, 5, 6, 3],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [2, 0, 2, 2],\n", " [3, 5, 6, 3],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 1, 0],\n", " [3, 2, 0, 0],\n", " [3, 5, 6, 3],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 2],\n", " [1, 2, 1, 0],\n", " [4, 5, 6, 3],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [1, 2, 1, 2],\n", " [4, 5, 6, 3],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n", " [1, 2, 1, 2],\n", " [4, 5, 6, 3],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n", " [2, 2, 1, 2],\n", " [4, 5, 6, 3],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 1, 0],\n", " [3, 1, 2, 0],\n", " [4, 5, 6, 3],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 1],\n", " [3, 1, 2, 0],\n", " [4, 5, 6, 3],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 0],\n", " [3, 2, 2, 1],\n", " [4, 5, 6, 3],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 1],\n", " [3, 3, 1, 0],\n", " [4, 5, 6, 3],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 1, 0],\n", " [3, 3, 1, 1],\n", " [4, 5, 6, 3],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 1],\n", " [3, 3, 2, 1],\n", " [4, 5, 6, 3],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 1],\n", " [3, 3, 2, 2],\n", " [4, 5, 6, 3],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(28., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 1, 0],\n", " [4, 3, 0, 0],\n", " [4, 5, 6, 3],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 0],\n", " [2, 3, 1, 0],\n", " [5, 5, 6, 3],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 0],\n", " [2, 3, 2, 0],\n", " [5, 5, 6, 3],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(64., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 0],\n", " [2, 3, 2, 0],\n", " [6, 6, 3, 1],\n", " [6, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(136., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n", " [0, 3, 2, 1],\n", " [3, 6, 3, 1],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 0],\n", " [0, 3, 2, 0],\n", " [3, 6, 3, 2],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 0, 1],\n", " [3, 2, 0, 0],\n", " [3, 6, 3, 2],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [2, 2, 0, 1],\n", " [4, 6, 3, 2],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n", " [2, 2, 0, 2],\n", " [4, 6, 3, 2],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n", " [2, 2, 1, 0],\n", " [4, 6, 3, 3],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(24., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [1, 0, 3, 1],\n", " [0, 4, 6, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [0, 1, 3, 2],\n", " [1, 4, 6, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n", " [1, 3, 2, 0],\n", " [1, 4, 6, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n", " [1, 3, 2, 0],\n", " [2, 4, 6, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [2, 3, 2, 1],\n", " [2, 4, 6, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [0, 3, 2, 1],\n", " [3, 4, 6, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n", " [0, 3, 2, 2],\n", " [3, 4, 6, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 0, 0],\n", " [1, 3, 2, 2],\n", " [3, 4, 6, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 2],\n", " [0, 1, 3, 3],\n", " [3, 4, 6, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 0, 0],\n", " [1, 4, 1, 0],\n", " [3, 4, 6, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n", " [2, 2, 1, 0],\n", " [3, 5, 6, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n", " [0, 0, 3, 1],\n", " [3, 5, 6, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n", " [1, 0, 3, 2],\n", " [3, 5, 6, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 0, 0],\n", " [1, 3, 2, 0],\n", " [3, 5, 6, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 0, 1],\n", " [2, 3, 2, 0],\n", " [3, 5, 6, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 0, 0],\n", " [2, 3, 2, 1],\n", " [3, 5, 6, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 1, 2],\n", " [2, 3, 2, 1],\n", " [3, 5, 6, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 0, 1],\n", " [2, 3, 2, 1],\n", " [3, 5, 6, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 0, 1],\n", " [3, 3, 2, 2],\n", " [3, 5, 6, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 1],\n", " [0, 3, 2, 2],\n", " [4, 5, 6, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 0, 0],\n", " [3, 3, 1, 0],\n", " [4, 5, 6, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(24., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 3],\n", " [0, 0, 4, 1],\n", " [4, 5, 6, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 0, 0],\n", " [4, 1, 1, 0],\n", " [4, 5, 6, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 3, 1, 0],\n", " [1, 1, 1, 0],\n", " [5, 5, 6, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 3, 0, 1],\n", " [1, 1, 2, 0],\n", " [5, 5, 6, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 3, 0, 1],\n", " [1, 1, 2, 1],\n", " [5, 5, 6, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 0, 0],\n", " [1, 1, 2, 2],\n", " [5, 5, 6, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 3, 0, 1],\n", " [2, 1, 2, 2],\n", " [5, 5, 6, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(72., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 3, 1],\n", " [0, 2, 1, 3],\n", " [0, 6, 6, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 3, 1],\n", " [2, 2, 1, 3],\n", " [1, 6, 6, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(136., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 1, 0, 1],\n", " [3, 1, 3, 0],\n", " [1, 7, 4, 0],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n", " [4, 2, 3, 0],\n", " [1, 7, 4, 1],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n", " [0, 4, 2, 3],\n", " [1, 7, 4, 1],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 1],\n", " [0, 4, 2, 3],\n", " [2, 7, 4, 1],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n", " [1, 4, 2, 3],\n", " [2, 7, 4, 1],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 2],\n", " [1, 4, 2, 3],\n", " [2, 7, 4, 1],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 0, 2],\n", " [1, 4, 2, 3],\n", " [2, 7, 4, 1],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 0, 2],\n", " [2, 4, 2, 3],\n", " [2, 7, 4, 1],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 1, 2],\n", " [1, 4, 2, 3],\n", " [3, 7, 4, 1],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 2, 1],\n", " [1, 4, 2, 3],\n", " [3, 7, 4, 1],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 1, 1],\n", " [1, 4, 3, 3],\n", " [3, 7, 4, 1],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([False, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 1, 1],\n", " [1, 4, 4, 0],\n", " [3, 7, 4, 1],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 2, 0],\n", " [1, 4, 1, 0],\n", " [3, 7, 5, 2],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 1, 0],\n", " [1, 4, 1, 0],\n", " [3, 7, 5, 2],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 0, 0],\n", " [1, 4, 2, 1],\n", " [3, 7, 5, 2],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 3, 2],\n", " [1, 4, 2, 1],\n", " [3, 7, 5, 2],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 3, 2],\n", " [2, 4, 2, 1],\n", " [3, 7, 5, 2],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 2, 1],\n", " [2, 4, 2, 1],\n", " [3, 7, 5, 2],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 0],\n", " [2, 4, 3, 2],\n", " [3, 7, 5, 2],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 1],\n", " [2, 4, 3, 0],\n", " [3, 7, 5, 3],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 1],\n", " [2, 4, 3, 1],\n", " [3, 7, 5, 3],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 1],\n", " [2, 4, 3, 2],\n", " [3, 7, 5, 3],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([False, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 3, 2],\n", " [2, 4, 3, 2],\n", " [3, 7, 5, 3],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(24., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 1],\n", " [2, 4, 4, 3],\n", " [3, 7, 5, 3],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 1],\n", " [2, 4, 4, 1],\n", " [3, 7, 5, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 0, 1],\n", " [2, 4, 4, 2],\n", " [3, 7, 5, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 0, 1],\n", " [2, 5, 2, 0],\n", " [3, 7, 5, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n", " [3, 5, 2, 1],\n", " [3, 7, 5, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 1, 0],\n", " [0, 5, 2, 2],\n", " [4, 7, 5, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 0, 0],\n", " [5, 3, 0, 1],\n", " [4, 7, 5, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 2],\n", " [1, 5, 3, 1],\n", " [4, 7, 5, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 0, 1, 0],\n", " [1, 5, 3, 1],\n", " [4, 7, 5, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 1, 1, 0],\n", " [1, 5, 3, 1],\n", " [4, 7, 5, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 1, 0],\n", " [1, 5, 3, 1],\n", " [4, 7, 5, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 1, 0, 1],\n", " [1, 5, 3, 1],\n", " [4, 7, 5, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 1, 1, 0],\n", " [1, 5, 3, 2],\n", " [4, 7, 5, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 1, 0],\n", " [1, 5, 3, 2],\n", " [4, 7, 5, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, True, False, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 2, 1],\n", " [1, 5, 3, 2],\n", " [4, 7, 5, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([ True, False, True, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "TimeStep(step_type=Array(2, dtype=int8), reward=Array(4., dtype=float32), discount=Array(0., dtype=float32), observation=Observation(board=Array([[1, 3, 2, 1],\n", " [2, 5, 3, 2],\n", " [4, 7, 5, 4],\n", " [7, 8, 1, 5]], dtype=int32), action_mask=Array([False, False, False, False], dtype=bool)), extras={'highest_tile': Array(256, dtype=int32)})\n", "3716.0\n" ] } ], "source": [ "import jax, jumanji\n", "\n", "env = jumanji.make(\"Game2048-v1\")\n", "key = jax.random.PRNGKey(0)\n", "jit_reset = jax.jit(env.reset)\n", "jit_step = jax.jit(env.step)\n", "state, timestep = jax.jit(env.reset)(key)\n", "jit_policy = jax.jit(policy)\n", "total_reward = 0\n", "while True:\n", " board, action_mask = timestep[\"observation\"]\n", " action = jit_policy(timestep[\"observation\"][0].reshape(-1))\n", " score_with_mask = jnp.where(action_mask, action, -jnp.inf)\n", " action = jnp.argmax(score_with_mask)\n", " state, timestep = jit_step(state, action)\n", " done = jnp.all(~timestep[\"observation\"][1])\n", " print(timestep)\n", " total_reward += timestep[\"reward\"]\n", " if done:\n", " break\n", "print(total_reward)" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-05T07:41:33.703431900Z", "start_time": "2024-06-05T07:41:26.102578200Z" } }, "id": "f166e09c5be1a8fb" }, { "cell_type": "code", "execution_count": 17, "outputs": [], "source": [ "import jax.random\n", "from problem.rl_env.jumanji.jumanji_2048 import Jumanji_2048\n", "\n", "\n", "def random_policy(state, params, obs):\n", " key = jax.random.key(obs.sum())\n", " actions = jax.random.normal(key, (4,))\n", " return actions\n", "\n", "problem = Jumanji_2048(max_step=10000, repeat_times=10, guarantee_invalid_action=True)\n", "state = problem.setup()\n", "jit_evaluate = jax.jit(lambda state, randkey: problem.evaluate(state, randkey, random_policy, None))" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-05T08:06:59.491563700Z", "start_time": "2024-06-05T08:06:59.465404900Z" } }, "id": "187326d08ac1eeb4" }, { "cell_type": "code", "execution_count": 24, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1193.2001\n" ] } ], "source": [ "\n", "reward = jit_evaluate(state, randkey)\n", "print(reward)" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-05T08:07:21.630420300Z", "start_time": "2024-06-05T08:07:21.107419400Z" } }, "id": "4b3506db87568d81" }, { "cell_type": "code", "execution_count": 34, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [0, 0, 0, 0],\n", " [0, 0, 0, 0],\n", " [0, 1, 0, 1]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(2, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 1],\n", " [1, 0, 0, 0],\n", " [0, 0, 0, 0],\n", " [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(2, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [0, 0, 0, 0],\n", " [0, 0, 0, 0],\n", " [1, 1, 1, 1]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(2, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [0, 1, 0, 0],\n", " [0, 0, 0, 0],\n", " [2, 2, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(4, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [1, 0, 0, 0],\n", " [0, 0, 0, 0],\n", " [3, 0, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [1, 0, 0, 0],\n", " [0, 0, 0, 0],\n", " [3, 1, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n", " [1, 0, 0, 0],\n", " [0, 0, 0, 0],\n", " [3, 2, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [0, 0, 0, 0],\n", " [2, 0, 1, 0],\n", " [3, 2, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [0, 0, 0, 0],\n", " [2, 1, 0, 0],\n", " [3, 2, 0, 1]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 0, 0],\n", " [0, 0, 0, 0],\n", " [2, 1, 0, 0],\n", " [3, 2, 1, 0]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 1, 0],\n", " [2, 2, 0, 0],\n", " [3, 0, 0, 1],\n", " [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 1, 1],\n", " [2, 2, 0, 0],\n", " [3, 0, 0, 0],\n", " [0, 1, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n", " [1, 1, 0, 0],\n", " [2, 2, 0, 0],\n", " [3, 1, 1, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 1, 1],\n", " [2, 2, 0, 0],\n", " [3, 1, 0, 2],\n", " [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [1, 0, 0, 0],\n", " [2, 3, 1, 1],\n", " [3, 1, 1, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [1, 0, 0, 0],\n", " [2, 3, 0, 1],\n", " [3, 1, 2, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [0, 0, 2, 1],\n", " [0, 2, 3, 1],\n", " [0, 3, 1, 3]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [0, 0, 2, 1],\n", " [0, 2, 3, 2],\n", " [1, 3, 1, 3]], dtype=int32), action_mask=Array([ True, False, False, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 2, 1],\n", " [0, 3, 3, 2],\n", " [1, 0, 1, 3],\n", " [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [0, 0, 2, 1],\n", " [1, 2, 3, 2],\n", " [2, 3, 1, 3]], dtype=int32), action_mask=Array([ True, False, False, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [2, 1, 1, 0],\n", " [1, 2, 3, 2],\n", " [2, 3, 1, 3]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [0, 0, 2, 2],\n", " [1, 2, 3, 2],\n", " [2, 3, 1, 3]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 2, 1],\n", " [2, 3, 3, 3],\n", " [1, 0, 1, 3],\n", " [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(8, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 2, 1],\n", " [2, 3, 3, 4],\n", " [1, 0, 1, 0],\n", " [0, 0, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(28., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 3, 1],\n", " [0, 2, 4, 4],\n", " [0, 0, 0, 2],\n", " [1, 0, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [1, 0, 0, 4],\n", " [0, 1, 3, 2],\n", " [1, 2, 4, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 1, 0],\n", " [1, 4, 0, 0],\n", " [1, 3, 2, 0],\n", " [1, 2, 4, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 2],\n", " [0, 0, 1, 4],\n", " [0, 1, 3, 2],\n", " [1, 2, 4, 1]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 2, 2],\n", " [0, 2, 3, 4],\n", " [0, 0, 4, 2],\n", " [1, 0, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 2],\n", " [1, 0, 2, 4],\n", " [0, 1, 3, 2],\n", " [2, 2, 4, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 2, 2],\n", " [2, 2, 3, 4],\n", " [0, 1, 4, 2],\n", " [0, 0, 0, 1]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 2, 3],\n", " [0, 3, 3, 4],\n", " [1, 1, 4, 2],\n", " [0, 0, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 2, 3],\n", " [0, 1, 3, 4],\n", " [0, 0, 4, 2],\n", " [1, 0, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 2, 3],\n", " [1, 1, 3, 4],\n", " [0, 0, 4, 2],\n", " [0, 0, 0, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 2, 3],\n", " [0, 1, 3, 4],\n", " [0, 1, 4, 3],\n", " [0, 0, 0, 0]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 2, 3],\n", " [0, 2, 3, 4],\n", " [0, 0, 4, 3],\n", " [0, 0, 1, 0]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 2, 3],\n", " [0, 2, 3, 4],\n", " [2, 0, 4, 3],\n", " [0, 0, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 3, 2, 3],\n", " [0, 2, 3, 4],\n", " [0, 0, 4, 3],\n", " [0, 1, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 3, 2, 3],\n", " [0, 2, 3, 4],\n", " [0, 1, 4, 3],\n", " [1, 0, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n", " [2, 3, 4, 0],\n", " [1, 4, 3, 0],\n", " [2, 0, 0, 0]], dtype=int32), action_mask=Array([False, True, True, False], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n", " [0, 2, 3, 4],\n", " [0, 1, 4, 3],\n", " [0, 0, 1, 2]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n", " [2, 3, 4, 0],\n", " [1, 4, 3, 0],\n", " [1, 2, 1, 0]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n", " [2, 3, 4, 0],\n", " [2, 4, 3, 1],\n", " [0, 2, 1, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n", " [2, 3, 4, 0],\n", " [2, 4, 3, 1],\n", " [2, 1, 1, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n", " [2, 3, 4, 0],\n", " [2, 4, 3, 1],\n", " [2, 2, 0, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n", " [0, 2, 3, 4],\n", " [2, 4, 3, 1],\n", " [0, 1, 2, 3]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n", " [2, 3, 4, 0],\n", " [2, 4, 3, 1],\n", " [1, 2, 3, 1]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 2, 3, 1],\n", " [1, 2, 3, 4],\n", " [2, 4, 3, 1],\n", " [1, 2, 3, 1]], dtype=int32), action_mask=Array([ True, False, True, False], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(44., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 0, 0, 1],\n", " [1, 3, 0, 1],\n", " [2, 4, 4, 4],\n", " [1, 2, 4, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(16, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 1, 0, 0],\n", " [1, 3, 1, 0],\n", " [2, 5, 4, 1],\n", " [1, 2, 4, 2]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(32., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 1, 1, 1],\n", " [1, 3, 5, 2],\n", " [2, 5, 0, 1],\n", " [1, 2, 0, 0]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 4, 1, 2],\n", " [1, 3, 5, 2],\n", " [1, 2, 5, 1],\n", " [0, 0, 1, 2]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 1, 2, 0],\n", " [1, 3, 5, 2],\n", " [1, 2, 5, 1],\n", " [1, 2, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(32, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(80., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[4, 1, 2, 2],\n", " [2, 3, 6, 2],\n", " [1, 3, 0, 0],\n", " [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 4, 1, 3],\n", " [2, 3, 6, 2],\n", " [0, 0, 1, 3],\n", " [0, 0, 1, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 4, 1, 3],\n", " [2, 3, 6, 2],\n", " [0, 0, 1, 3],\n", " [0, 0, 0, 2]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(8., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 3],\n", " [0, 3, 6, 2],\n", " [0, 0, 1, 3],\n", " [1, 0, 0, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 3],\n", " [1, 3, 6, 2],\n", " [0, 0, 1, 3],\n", " [0, 0, 1, 2]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 3],\n", " [1, 3, 6, 2],\n", " [1, 3, 1, 0],\n", " [1, 2, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 4, 1, 3],\n", " [2, 4, 6, 2],\n", " [1, 2, 1, 0],\n", " [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(36., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 5, 1, 3],\n", " [2, 2, 6, 2],\n", " [2, 0, 1, 0],\n", " [0, 0, 1, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [0, 1, 1, 0],\n", " [3, 5, 6, 3],\n", " [3, 2, 2, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(16., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [1, 1, 1, 0],\n", " [0, 5, 6, 3],\n", " [4, 2, 2, 2]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 0],\n", " [0, 0, 1, 2],\n", " [1, 5, 6, 3],\n", " [0, 4, 2, 3]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 1, 0],\n", " [1, 2, 0, 0],\n", " [1, 5, 6, 3],\n", " [4, 2, 3, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 1],\n", " [0, 2, 1, 0],\n", " [2, 5, 6, 0],\n", " [4, 2, 3, 3]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 2, 1, 1],\n", " [4, 5, 6, 3],\n", " [0, 2, 3, 0],\n", " [0, 0, 1, 0]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 0, 0],\n", " [4, 5, 6, 3],\n", " [2, 3, 1, 0],\n", " [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 6, 3],\n", " [4, 5, 1, 0],\n", " [2, 3, 1, 0],\n", " [1, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 6, 3],\n", " [4, 5, 2, 0],\n", " [2, 3, 0, 0],\n", " [1, 0, 0, 1]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 6, 3],\n", " [4, 5, 2, 0],\n", " [2, 3, 1, 0],\n", " [2, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 6, 3],\n", " [1, 4, 5, 2],\n", " [0, 2, 3, 1],\n", " [0, 0, 0, 2]], dtype=int32), action_mask=Array([False, False, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 6, 3],\n", " [1, 4, 5, 2],\n", " [2, 3, 1, 1],\n", " [2, 0, 0, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 6, 3],\n", " [1, 4, 5, 2],\n", " [1, 2, 3, 2],\n", " [0, 0, 0, 2]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[3, 2, 6, 3],\n", " [1, 4, 5, 2],\n", " [1, 2, 3, 2],\n", " [2, 0, 1, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 6, 0],\n", " [3, 2, 5, 1],\n", " [2, 4, 3, 3],\n", " [2, 2, 1, 3]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(24., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 6, 0],\n", " [0, 2, 5, 1],\n", " [3, 4, 3, 1],\n", " [3, 2, 1, 4]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 0, 0, 6],\n", " [1, 2, 5, 1],\n", " [3, 4, 3, 1],\n", " [3, 2, 1, 4]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[6, 0, 0, 1],\n", " [1, 2, 5, 1],\n", " [3, 4, 3, 1],\n", " [3, 2, 1, 4]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(20., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 1, 0, 0],\n", " [6, 2, 5, 1],\n", " [1, 4, 3, 2],\n", " [4, 2, 1, 4]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 0, 2, 0],\n", " [6, 2, 5, 1],\n", " [1, 4, 3, 2],\n", " [4, 2, 1, 4]], dtype=int32), action_mask=Array([ True, True, False, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 2, 1],\n", " [6, 4, 5, 2],\n", " [1, 2, 3, 4],\n", " [4, 1, 1, 0]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(12., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 3, 1],\n", " [6, 4, 5, 2],\n", " [1, 2, 3, 4],\n", " [0, 0, 4, 2]], dtype=int32), action_mask=Array([False, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[0, 2, 3, 1],\n", " [6, 4, 5, 2],\n", " [1, 2, 3, 4],\n", " [0, 1, 4, 2]], dtype=int32), action_mask=Array([ True, False, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[2, 3, 1, 1],\n", " [6, 4, 5, 2],\n", " [1, 2, 3, 4],\n", " [1, 4, 2, 0]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 3, 1, 0],\n", " [2, 4, 5, 1],\n", " [6, 2, 3, 2],\n", " [2, 4, 2, 4]], dtype=int32), action_mask=Array([ True, True, False, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(0., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 1, 3, 1],\n", " [2, 4, 5, 1],\n", " [6, 2, 3, 2],\n", " [2, 4, 2, 4]], dtype=int32), action_mask=Array([ True, True, True, True], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 3, 1],\n", " [2, 4, 5, 1],\n", " [6, 2, 3, 2],\n", " [2, 4, 2, 4]], dtype=int32), action_mask=Array([ True, False, True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(1, dtype=int8), reward=Array(4., dtype=float32), discount=Array(1., dtype=float32), observation=Observation(board=Array([[1, 2, 3, 1],\n", " [2, 4, 5, 2],\n", " [6, 2, 3, 2],\n", " [2, 4, 2, 4]], dtype=int32), action_mask=Array([ True, False, True, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "TimeStep(step_type=Array(2, dtype=int8), reward=Array(8., dtype=float32), discount=Array(0., dtype=float32), observation=Observation(board=Array([[1, 2, 3, 1],\n", " [2, 4, 5, 3],\n", " [6, 2, 3, 4],\n", " [2, 4, 2, 1]], dtype=int32), action_mask=Array([False, False, False, False], dtype=bool)), extras={'highest_tile': Array(64, dtype=int32)})\n", "636.0\n" ] } ], "source": [ "randkey = jax.random.PRNGKey(14)\n", "jit_policy = jax.jit(random_policy)\n", "total_reward = 0\n", "state, timestep = jax.jit(env.reset)(randkey )\n", "while True:\n", " board, action_mask = timestep[\"observation\"]\n", " action = jit_policy(None, None, timestep[\"observation\"][0].reshape(-1))\n", " score_with_mask = jnp.where(action_mask, action, -jnp.inf)\n", " action = jnp.argmax(score_with_mask)\n", " state, timestep = jit_step(state, action)\n", " done = jnp.all(~timestep[\"observation\"][1])\n", " print(timestep)\n", " total_reward += timestep[\"reward\"]\n", " if done:\n", " break\n", "print(total_reward)" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-05T08:09:58.242414600Z", "start_time": "2024-06-05T08:09:56.452642800Z" } }, "id": "8bb888fb742b6b06" }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [], "metadata": { "collapsed": false }, "id": "3d1b5c8c646d4f07" } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 5 }