4. Implementing the MazeEnv¶
The complete code for this part of the tutorial can be found here
# file structure
- cutting_2d
- main.py # modified
- env
- core_env.py # modified
- inventory.py
- maze_state.py
- maze_action.py
- renderer.py
- maze_env.py # new
- space_interfaces
- dict_action_conversion.py # new
- dict_observation_conversion.py # new
Page Overview
4.1. MazeEnv¶
The MazeEnv wraps the CoreEnv as a Gym-style environment in a reusable form, by utilizing the interfaces (mappings) from the MazeState to the observation and from the MazeAction to the action. After implementing the MazeEnv we will be ready to perform our first training run. To learn more about the usability and advantages of this concept you can follow up on Customizing Core and Maze Envs.
In the remainder of this part of the tutorial we will implement the Cutting2DEnvironment
(MazeEnv)
as well as a corresponding set of interfaces.
from maze.core.env.core_env import CoreEnv
from maze.core.env.maze_env import MazeEnv
from maze.core.env.action_conversion import ActionConversionInterface
from maze.core.env.observation_conversion import ObservationConversionInterface
from .core_env import Cutting2DCoreEnvironment
from ..space_interfaces.dict_observation_conversion import ObservationConversion
from ..space_interfaces.dict_action_conversion import ActionConversion
class Cutting2DEnvironment(MazeEnv[Cutting2DCoreEnvironment]):
    """Gym-style maze environment for the 2d cutting problem.

    Hands the core environment to the MazeEnv base class together with one
    action conversion and one observation conversion interface, each
    registered under key 0.

    :param core_env: The underlying core environment.
    :param action_conversion: An action conversion interface.
    :param observation_conversion: An observation conversion interface.
    """

    def __init__(self,
                 core_env: CoreEnv,
                 action_conversion: ActionConversionInterface,
                 observation_conversion: ObservationConversionInterface):
        # Each conversion dict holds exactly one entry, stored under key 0.
        action_conversions = {0: action_conversion}
        observation_conversions = {0: observation_conversion}

        super().__init__(core_env=core_env,
                         action_conversion_dict=action_conversions,
                         observation_conversion_dict=observation_conversions)
def maze_env_factory(max_pieces_in_inventory: int, raw_piece_size: (int, int),
                     static_demand: (int, int)) -> Cutting2DEnvironment:
    """Build a Cutting2DEnvironment from the plain environment parameters.

    Creates the core environment plus the action and observation conversion
    interfaces and wires them into a single maze environment.
    (for argument details see: Cutting2DCoreEnvironment)

    :param max_pieces_in_inventory: Passed to the core environment and to both conversion interfaces.
    :param raw_piece_size: Passed to the core environment and to the observation conversion.
    :param static_demand: Passed to the core environment.
    :return: The assembled maze environment.
    """
    # core environment first ...
    cutting_core_env = Cutting2DCoreEnvironment(
        max_pieces_in_inventory=max_pieces_in_inventory,
        raw_piece_size=raw_piece_size,
        static_demand=static_demand,
    )

    # ... then the two space interfaces
    action_interface = ActionConversion(max_pieces_in_inventory=max_pieces_in_inventory)
    observation_interface = ObservationConversion(
        raw_piece_size=raw_piece_size,
        max_pieces_in_inventory=max_pieces_in_inventory,
    )

    return Cutting2DEnvironment(cutting_core_env, action_interface, observation_interface)
The MazeEnv is instantiated with the underlying CoreEnv and the two interfaces for MazeStates and MazeActions.
For convenience we also add a maze_env_factory
to instantiate the MazeEnv from the original environment parameter
set. This will be useful in the next part of the tutorial where we will train an agent based on this environment.
4.2. ObservationConversionInterface¶
The ObservationConversionInterface
converts CoreEnv MazeState objects into machine readable Gym-style observations
and defines the respective Gym observation space.
In the present case the observation is defined as a dictionary with the following structure:
inventory: 2d array representing all pieces currently in inventory
inventory_size: count of pieces currently in inventory
ordered_piece: 2d vector representing the customer order (current demand)
import numpy as np
from typing import Dict
from gym import spaces
from maze.core.annotations import override
from maze.core.env.observation_conversion import ObservationConversionInterface
from ..env.maze_state import Cutting2DMazeState
class ObservationConversion(ObservationConversionInterface):
    """Cutting 2d environment state to dictionary observation.

    :param raw_piece_size: Size of a fresh raw (= full-size) piece
    :param max_pieces_in_inventory: Size of the inventory. If inventory gets full, the oldest pieces get discarded.
    """

    def __init__(self, raw_piece_size: (int, int), max_pieces_in_inventory: int):
        self.max_pieces_in_inventory = max_pieces_in_inventory
        self.raw_piece_size = raw_piece_size

    @override(ObservationConversionInterface)
    def maze_to_space(self, maze_state: Cutting2DMazeState) -> Dict[str, np.ndarray]:
        """Converts core environment state to a machine readable agent observation.

        The passed state is left untouched: padding is applied to a copy of the inventory.

        :param maze_state: The state to convert; ``inventory`` and ``current_demand`` are read.
        :return: Dictionary with the keys ``inventory`` (inventory padded with (0, 0) entries up to
                 ``max_pieces_in_inventory``), ``inventory_size`` (number of actual pieces) and
                 ``ordered_piece`` (the current demand), all as float32 arrays.
        """
        # Copy before padding. Extending the state's own list in place (`+=`) would add dummy
        # (0, 0) pieces to the state and make the piece count below equal the padded length.
        pieces = list(maze_state.inventory)
        n_pieces = len(pieces)

        # Stretch the inventory to full size (filling with zeros)
        padded_inventory = pieces + [(0, 0)] * (self.max_pieces_in_inventory - n_pieces)

        # Compile dict space observation
        return {'inventory': np.asarray(padded_inventory, dtype=np.float32),
                'inventory_size': np.asarray([n_pieces], dtype=np.float32),
                'ordered_piece': np.asarray(maze_state.current_demand, dtype=np.float32)}

    @override(ObservationConversionInterface)
    def space_to_maze(self, observation: Dict[str, np.ndarray]) -> Cutting2DMazeState:
        """Converts agent observation to core environment state (not required for this example).

        :raises NotImplementedError: Always.
        """
        raise NotImplementedError

    @override(ObservationConversionInterface)
    def space(self) -> spaces.Dict:
        """Return the Gym dict observation space based on the given params.

        :return: Gym space object
            - inventory: max_pieces_in_inventory x 2 (x/y-dimensions of pieces in inventory)
            - inventory_size: scalar number of pieces in inventory
            - ordered_piece: 2d vector holding x/y-dimension of customer ordered piece
        """
        # Upper bound of every inventory row: raw piece size + 1 in each dimension.
        inventory_high = np.vstack([[self.raw_piece_size[0] + 1, self.raw_piece_size[1] + 1]] *
                                   self.max_pieces_in_inventory).astype(np.float32)

        return spaces.Dict({
            'inventory': spaces.Box(low=np.zeros((self.max_pieces_in_inventory, 2), dtype=np.float32),
                                    high=inventory_high,
                                    dtype=np.float32),
            'inventory_size': spaces.Box(low=np.float32(0), high=self.max_pieces_in_inventory + 1,
                                         shape=(1,), dtype=np.float32),
            'ordered_piece': spaces.Box(low=np.float32(0), high=np.float32(max(self.raw_piece_size) + 1),
                                        shape=(2,), dtype=np.float32)
        })
4.3. ActionConversionInterface¶
The ActionConversionInterface
converts agent actions into CoreEnv MazeAction objects
and defines the respective Gym action space.
In the present case the action is defined as a dictionary with the following structure:
piece_idx: id of the inventory piece that should be used for cutting
cut_rotation: defines whether to rotate the piece for cutting or not
cut_order: defines the cutting order (xy vs. yx)
from typing import Dict
from gym import spaces
from maze.core.env.action_conversion import ActionConversionInterface
from ..env.maze_action import Cutting2DMazeAction
from ..env.maze_state import Cutting2DMazeState
class ActionConversion(ActionConversionInterface):
    """Maps between agent dictionary actions and Cutting2DMazeAction objects.

    :param max_pieces_in_inventory: Size of the inventory
    """

    def __init__(self, max_pieces_in_inventory: int):
        self.max_pieces_in_inventory = max_pieces_in_inventory

    def space_to_maze(self, action: Dict[str, int], maze_state: Cutting2DMazeState) -> Cutting2DMazeAction:
        """Build the MazeAction object that corresponds to an agent dictionary action.

        :param action: Dictionary with the keys "piece_idx", "cut_rotation" and "cut_order".
        :param maze_state: The current state (not used by this conversion).
        :return: The corresponding MazeAction.
        """
        selected_piece = action["piece_idx"]
        # the two flags are cast to booleans for the MazeAction
        do_rotate = bool(action["cut_rotation"])
        do_reverse = bool(action["cut_order"])
        return Cutting2DMazeAction(piece_id=selected_piece,
                                   rotate=do_rotate,
                                   reverse_cutting_order=do_reverse)

    def maze_to_space(self, maze_action: Cutting2DMazeAction) -> Dict[str, int]:
        """Build the agent dictionary action that corresponds to a MazeAction object.

        :param maze_action: The MazeAction to convert.
        :return: Dictionary with the keys "piece_idx", "cut_rotation" and "cut_order".
        """
        space_action = {}
        space_action["piece_idx"] = maze_action.piece_id
        # the two flags are cast to ints for the dictionary action
        space_action["cut_rotation"] = int(maze_action.rotate)
        space_action["cut_order"] = int(maze_action.reverse_cutting_order)
        return space_action

    def space(self) -> spaces.Dict:
        """Returns Gym dict action space."""
        piece_space = spaces.Discrete(self.max_pieces_in_inventory)  # Which piece should be cut
        rotation_space = spaces.Discrete(2)  # Rotate: (yes / no)
        order_space = spaces.Discrete(2)  # Cutting order: (xy / yx)
        return spaces.Dict({
            "piece_idx": piece_space,
            "cut_rotation": rotation_space,
            "cut_order": order_space
        })
4.4. Updating the CoreEnv¶
For the sake of completeness we also show two more minor modifications required in the CoreEnv,
which are not too important for this tutorial at the moment.
In short, the StructuredEnv
interface supports interaction patterns
beyond standard Gym environments to model for example hierarchical or multi-agent RL problems.
We will get back to this in our more advanced tutorials.
The code below defines that the current version of the environment requires only one actor (id 0) with a single policy (id 0) that is never done.
from maze.core.env.structured_env import ActorID
class Cutting2DCoreEnvironment(CoreEnv):
    ...

    def is_actor_done(self) -> bool:
        """Tell whether the just stepped actor is done.

        This is not the same as the done flag of the environment.

        :return: Always False in this version of the environment.
        """
        return False

    def actor_id(self) -> ActorID:
        """Identify the currently executed actor together with its policy id.

        Actor ids are unique only per policy (every policy has its own actor 0).
        Note that identities of done actors can not be reused in the same rollout.

        :return: The current actor as ActorID; step_key and agent_id are both fixed to 0 here.
        """
        return ActorID(step_key=0, agent_id=0)

    ...
4.5. Test Script¶
The following snippet will instantiate the environment and run it for 15 steps.
Note that (compared to the previous example) we are now:
working with observations and actions instead of MazeStates and MazeActions
able to sample actions from the action_space object
""" Test script CoreEnv """
from tutorial_maze_env.part03_maze_env.env.maze_env import maze_env_factory
def main():
    """Instantiate the cutting environment and step it with 15 randomly sampled actions."""
    # build the maze environment (core env plus observation and action interfaces)
    env = maze_env_factory(max_pieces_in_inventory=10,
                           raw_piece_size=[100, 100],
                           static_demand=(30, 15))

    # bring the environment into its initial state
    observation = env.reset()

    # interaction loop: sample an action, step the environment, report the outcome
    n_steps = 15
    for _ in range(n_steps):
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        print(f"reward {reward} | done {done} | info {info}")
if __name__ == "__main__":
    # script entry point: run the test loop only when executed directly
    main()
reward -1 | done False | info {'msg': 'valid_cut'}
reward 0 | done False | info {'msg': 'valid_cut'}
reward 0 | done False | info {'msg': 'valid_cut'}
reward 0 | done False | info {'error': 'piece_id_out_of_bounds'}
reward 0 | done False | info {'error': 'piece_id_out_of_bounds'}
...