diff --git a/config/four_rooms.ini b/config/four_rooms.ini
new file mode 100644
index 0000000000..0a931a2ff7
--- /dev/null
+++ b/config/four_rooms.ini
@@ -0,0 +1,62 @@
+[base]
+env_name = four_rooms
+
+[vec]
+total_agents = 4096
+num_buffers = 2
+num_threads = 8
+
+[env]
+size = 19
+# If <= 0, max_steps defaults to 4 * size. Positive values override it.
+max_steps = 0
+
+[policy]
+hidden_size = 128
+num_layers = 2
+expansion_factor = 1
+
+[train]
+total_timesteps = 100_000_000
+gamma = 0.99
+gae_lambda = 0.95
+learning_rate = 0.005
+minibatch_size = 32768
+horizon = 64
+ent_coef = 0.01
+
+[sweep]
+metric = score
+metric_distribution = linear
+goal = maximize
+max_runs = 100
+gpus = 1
+downsample = 5
+sweep_only = hidden_size,num_layers,total_timesteps,learning_rate
+
+[sweep.policy.hidden_size]
+distribution = uniform_pow2
+min = 64
+max = 1024
+mean = 256
+scale = auto
+
+[sweep.policy.num_layers]
+distribution = int_uniform
+min = 1
+max = 4
+mean = 2
+scale = auto
+
+[sweep.train.total_timesteps]
+distribution = log_normal
+min = 20_000_000
+max = 500_000_000
+mean = 100_000_000
+scale = auto
+
+[sweep.train.learning_rate]
+distribution = log_normal
+min = 0.0005
+max = 0.01
+scale = auto
diff --git a/ocean/four_rooms/binding.c b/ocean/four_rooms/binding.c
new file mode 100644
index 0000000000..7560c9ac28
--- /dev/null
+++ b/ocean/four_rooms/binding.c
@@ -0,0 +1,37 @@
+#include "four_rooms.h"
+
+#define OBS_SIZE (FOUR_ROOMS_VIEW_SIZE * FOUR_ROOMS_VIEW_SIZE * FOUR_ROOMS_OBS_CHANNELS)
+#define NUM_ATNS 1
+#define ACT_SIZES {FOUR_ROOMS_NUM_ACTIONS}
+#define OBS_TENSOR_T ByteTensor
+
+#define Env FourRooms
+#include "vecenv.h"
+
+// Smallest grid edge that leaves an interior cell in every room and in every doorway range.
+#define FOUR_ROOMS_MIN_SIZE 5
+
+// Fills in one environment from the `size` and `max_steps` kwargs and allocates its grid.
+// A non-positive max_steps selects the default of FOUR_ROOMS_TIMEOUT_SCALE * size.
+void my_init(Env* env, Dict* kwargs) {
+    env->num_agents = 1;
+    env->size = (int)dict_get(kwargs, "size")->value;
+    // Grids below the minimum have no valid doorway range or no free cell for the
+    // agent, so clamp instead of letting map generation misbehave.
+    if (env->size < FOUR_ROOMS_MIN_SIZE) {
+        env->size = FOUR_ROOMS_MIN_SIZE;
+    }
+    env->max_steps = (int)dict_get(kwargs, "max_steps")->value;
+    if (env->max_steps <= 0) {
+        env->max_steps = FOUR_ROOMS_TIMEOUT_SCALE * env->size;
+    }
+    // Widen before multiplying so the cell count is computed in size_t.
+    env->grid = (unsigned char*)calloc((size_t)env->size * (size_t)env->size, sizeof(unsigned char));
+}
+
+void my_log(Log* log, Dict* out) {  // copies the logged statistics into the output dict
+    dict_set(out, "perf", log->perf);
+    dict_set(out, "score", log->score);
+    dict_set(out, "episode_return", log->episode_return);
+    dict_set(out, "episode_length", log->episode_length);
+}
diff --git a/ocean/four_rooms/four_rooms.c b/ocean/four_rooms/four_rooms.c
new file mode 100644
index 0000000000..5ccf7626b9
--- /dev/null
+++ b/ocean/four_rooms/four_rooms.c
@@ -0,0 +1,48 @@
+#include "four_rooms.h"
+
+// Standalone viewer for a single environment. While LEFT SHIFT is held the agent
+// follows the arrow keys / WASD; otherwise it takes a random LEFT, RIGHT or FORWARD.
+int main(void) {
+    FourRooms env = {0};
+    env.size = 19;
+    env.max_steps = 0;
+    env.num_agents = 1;
+    env.rng = 0;
+    env.observations = (unsigned char*)calloc(
+        FOUR_ROOMS_VIEW_SIZE * FOUR_ROOMS_VIEW_SIZE * FOUR_ROOMS_OBS_CHANNELS,
+        sizeof(unsigned char)
+    );
+    env.actions = (float*)calloc(1, sizeof(float));
+    env.rewards = (float*)calloc(1, sizeof(float));
+    env.terminals = (float*)calloc(1, sizeof(float));
+    env.grid = (unsigned char*)calloc((size_t)env.size * (size_t)env.size, sizeof(unsigned char));
+    if (!env.observations || !env.actions || !env.rewards || !env.terminals || !env.grid) {
+        free(env.observations);
+        free(env.actions);
+        free(env.rewards);
+        free(env.terminals);
+        free(env.grid);
+        return 1;
+    }
+
+    c_reset(&env);
+    c_render(&env);
+    while (!WindowShouldClose()) {
+        if (IsKeyDown(KEY_LEFT_SHIFT)) {
+            env.actions[0] = DONE;
+            if (IsKeyDown(KEY_UP) || IsKeyDown(KEY_W)) env.actions[0] = FORWARD;
+            if (IsKeyDown(KEY_LEFT) || IsKeyDown(KEY_A)) env.actions[0] = LEFT;
+            if (IsKeyDown(KEY_RIGHT) || IsKeyDown(KEY_D)) env.actions[0] = RIGHT;
+        } else {
+            env.actions[0] = four_rooms_rand(&env, 3);
+        }
+        c_step(&env);
+        c_render(&env);
+    }
+    free(env.observations);
+    free(env.actions);
+    free(env.rewards);
+    free(env.terminals);
+    c_close(&env);
+    return 0;
+}
diff --git a/ocean/four_rooms/four_rooms.h b/ocean/four_rooms/four_rooms.h
new file mode 100644
index 0000000000..b3267dea57
--- /dev/null
+++ b/ocean/four_rooms/four_rooms.h
@@ -0,0 +1,425 @@
+#include <stdlib.h>
+#include <string.h>
+#include "raylib.h"
+
+// View edge length, channels per observed cell, number of actions, and the
+// factor applied to the grid size to get the default episode timeout.
+#define FOUR_ROOMS_VIEW_SIZE 7
+#define FOUR_ROOMS_OBS_CHANNELS 3
+#define FOUR_ROOMS_NUM_ACTIONS 7
+#define FOUR_ROOMS_TIMEOUT_SCALE 4
+
+// Action ids. c_step only acts on LEFT, RIGHT and FORWARD; other ids leave the agent in place.
+enum {
+    LEFT = 0,
+    RIGHT = 1,
+    FORWARD = 2,
+    PICKUP = 3,
+    DROP = 4,
+    TOGGLE = 5,
+    DONE = 6,
+};
+
+// Cell ids. The grid holds EMPTY/WALL/GOAL/AGENT; observations report
+// UNSEEN/EMPTY/WALL/GOAL in their first channel.
+enum {
+    UNSEEN = 0,
+    EMPTY = 1,
+    WALL = 2,
+    GOAL = 8,
+    AGENT = 10,
+};
+
+// Color ids reported in the second observation channel.
+enum {
+    COLOR_BLACK = 0,
+    COLOR_GREEN = 1,
+    COLOR_GREY = 5,
+};
+
+static const Color PUFF_RED = {187, 0, 0, 255};
+static const Color PUFF_BACKGROUND = {6, 24, 24, 255};
+static const Color PUFF_BACKGROUND2 = {18, 72, 72, 255};
+
+// Running sums over finished episodes; n counts the episodes added.
+typedef struct {
+    float perf;
+    float score;
+    float episode_return;
+    float episode_length;
+    float n;
+} Log;
+
+// State of a single-agent FourRooms environment.
+typedef struct {
+    Log log;
+    unsigned char* observations;  // VIEW_SIZE * VIEW_SIZE * OBS_CHANNELS bytes
+    float* actions;
+    float* rewards;
+    float* terminals;
+    int num_agents;
+    int size;                     // grid edge length in cells
+    int max_steps;                // episode timeout; <= 0 selects the default in c_reset
+    int tick;                     // steps taken in the current episode
+    float episode_return;
+    int agent_x, agent_y;
+    int agent_dir;                // 0 = +x, 1 = +y, 2 = -x, 3 = -y
+    int goal_x, goal_y;
+    unsigned char* grid;          // size * size cells, row-major
+    unsigned int rng;             // rand_r state
+    int texture_loaded;
+    Texture2D puffers;
+} FourRooms;
+
+// Returns a pseudo-random integer in [0, n) drawn from the environment's RNG.
+// A non-positive n yields 0 instead of dividing by zero.
+static inline int four_rooms_rand(FourRooms* env, int n) {
+    if (n <= 0) {
+        return 0;
+    }
+    return rand_r(&env->rng) % n;
+}
+
+// Row-major index of cell (x, y) in env->grid.
+static inline int grid_idx(FourRooms* env, int x, int y) {
+    return y * env->size + x;
+}
+
+// Accumulates the episode that just ended into env->log. Must be called while
+// rewards[0], tick and episode_return still describe that episode.
+void add_log(FourRooms* env) {
+    env->log.perf += (env->rewards[0] > 0) ? 1.0f : 0.0f;
+    env->log.score += env->rewards[0];
+    env->log.episode_length += env->tick;
+    env->log.episode_return += env->episode_return;
+    env->log.n++;
+}
+
+// Encodes a grid cell as (object, color, state). Anything that is not a wall
+// or the goal, including the agent's own cell, is reported as EMPTY.
+static inline void encode_cell(unsigned char object, unsigned char* object_idx,
+        unsigned char* color_idx, unsigned char* state) {
+    *state = 0;
+    if (object == WALL) {
+        *object_idx = WALL;
+        *color_idx = COLOR_GREY;
+    } else if (object == GOAL) {
+        *object_idx = GOAL;
+        *color_idx = COLOR_GREEN;
+    } else {
+        *object_idx = EMPTY;
+        *color_idx = COLOR_BLACK;
+    }
+}
+
+// Maps a cell of the egocentric view to world coordinates. The agent sits at
+// the bottom-center of the view and looks towards row 0.
+static inline void observation_to_world(FourRooms* env, int obs_x, int obs_y,
+        int* world_x, int* world_y) {
+    int forward_x = 0;
+    int forward_y = 0;
+    if (env->agent_dir == 0) forward_x = 1;
+    else if (env->agent_dir == 1) forward_y = 1;
+    else if (env->agent_dir == 2) forward_x = -1;
+    else forward_y = -1;
+
+    // "Right" is the forward vector rotated a quarter turn: (x, y) -> (-y, x).
+    int right_x = -forward_y;
+    int right_y = forward_x;
+    int right_offset = obs_x - FOUR_ROOMS_VIEW_SIZE / 2;
+    int forward_offset = FOUR_ROOMS_VIEW_SIZE - 1 - obs_y;
+
+    *world_x = env->agent_x + forward_x * forward_offset + right_x * right_offset;
+    *world_y = env->agent_y + forward_y * forward_offset + right_y * right_offset;
+}
+
+// Fills `visible` with 1 for every view cell the agent can see. Visibility
+// spreads sideways and one row ahead from visible cells and does not pass walls.
+static inline void compute_visibility(unsigned char view[FOUR_ROOMS_VIEW_SIZE][FOUR_ROOMS_VIEW_SIZE],
+        unsigned char visible[FOUR_ROOMS_VIEW_SIZE][FOUR_ROOMS_VIEW_SIZE]) {
+    memset(visible, 0, FOUR_ROOMS_VIEW_SIZE * FOUR_ROOMS_VIEW_SIZE * sizeof(unsigned char));
+    visible[FOUR_ROOMS_VIEW_SIZE - 1][FOUR_ROOMS_VIEW_SIZE / 2] = 1;
+
+    // MiniGrid propagates visibility from the agent at bottom-center after rotating the view.
+    for (int y = FOUR_ROOMS_VIEW_SIZE - 1; y >= 0; y--) {
+        for (int x = 0; x < FOUR_ROOMS_VIEW_SIZE - 1; x++) {
+            if (!visible[y][x] || view[y][x] == WALL) {
+                continue;
+            }
+            visible[y][x + 1] = 1;
+            if (y > 0) {
+                visible[y - 1][x] = 1;
+                visible[y - 1][x + 1] = 1;
+            }
+        }
+
+        for (int x = FOUR_ROOMS_VIEW_SIZE - 1; x > 0; x--) {
+            if (!visible[y][x] || view[y][x] == WALL) {
+                continue;
+            }
+            visible[y][x - 1] = 1;
+            if (y > 0) {
+                visible[y - 1][x] = 1;
+                visible[y - 1][x - 1] = 1;
+            }
+        }
+    }
+}
+
+// Writes the agent's egocentric view into env->observations, three bytes per
+// cell (object, color, state). Cells outside the grid are treated as walls.
+void generate_observation(FourRooms* env) {
+    unsigned char view[FOUR_ROOMS_VIEW_SIZE][FOUR_ROOMS_VIEW_SIZE];
+    unsigned char visible[FOUR_ROOMS_VIEW_SIZE][FOUR_ROOMS_VIEW_SIZE];
+
+    for (int y = 0; y < FOUR_ROOMS_VIEW_SIZE; y++) {
+        for (int x = 0; x < FOUR_ROOMS_VIEW_SIZE; x++) {
+            int world_x, world_y;
+            observation_to_world(env, x, y, &world_x, &world_y);
+            if (world_x < 0 || world_x >= env->size || world_y < 0 || world_y >= env->size) {
+                view[y][x] = WALL;
+            } else if (world_x == env->agent_x && world_y == env->agent_y) {
+                view[y][x] = EMPTY;
+            } else {
+                view[y][x] = env->grid[grid_idx(env, world_x, world_y)];
+            }
+        }
+    }
+
+    compute_visibility(view, visible);
+
+    for (int y = 0; y < FOUR_ROOMS_VIEW_SIZE; y++) {
+        for (int x = 0; x < FOUR_ROOMS_VIEW_SIZE; x++) {
+            int base_idx = (y * FOUR_ROOMS_VIEW_SIZE + x) * FOUR_ROOMS_OBS_CHANNELS;
+            if (!visible[y][x]) {
+                env->observations[base_idx] = UNSEEN;
+                env->observations[base_idx + 1] = COLOR_BLACK;
+                env->observations[base_idx + 2] = 0;
+                continue;
+            }
+
+            encode_cell(
+                view[y][x],
+                &env->observations[base_idx],
+                &env->observations[base_idx + 1],
+                &env->observations[base_idx + 2]
+            );
+        }
+    }
+}
+
+// Rebuilds the map: outer walls, a cross of dividers through the middle, and
+// one randomly placed doorway in each of the four divider segments.
+// Expects size >= 5 so every segment has an interior cell to open.
+void create_four_rooms_grid(FourRooms* env) {
+    int size = env->size;
+
+    memset(env->grid, EMPTY, (size_t)size * (size_t)size * sizeof(unsigned char));
+
+    for (int i = 0; i < size; i++) {
+        env->grid[i] = WALL;
+        env->grid[(size - 1) * size + i] = WALL;
+        env->grid[i * size] = WALL;
+        env->grid[i * size + size - 1] = WALL;
+    }
+
+    int room_w = size / 2;
+    int room_h = size / 2;
+
+    for (int y = 0; y < size; y++) {
+        env->grid[y * size + room_w] = WALL;
+    }
+
+    for (int x = 0; x < size; x++) {
+        env->grid[room_h * size + x] = WALL;
+    }
+
+    // MiniGrid samples doorway positions from [start + 1, end). The far segments
+    // are capped at size - 2 so that an even size cannot open the outer wall;
+    // for odd sizes the sampled range is unchanged.
+    int far_h = size - 2 - room_h;
+    int far_w = size - 2 - room_w;
+
+    int gap_y1 = 1 + four_rooms_rand(env, room_h - 1);
+    env->grid[gap_y1 * size + room_w] = EMPTY;
+
+    int gap_y2 = room_h + 1 + four_rooms_rand(env, far_h);
+    env->grid[gap_y2 * size + room_w] = EMPTY;
+
+    int gap_x1 = 1 + four_rooms_rand(env, room_w - 1);
+    env->grid[room_h * size + gap_x1] = EMPTY;
+
+    int gap_x2 = room_w + 1 + four_rooms_rand(env, far_w);
+    env->grid[room_h * size + gap_x2] = EMPTY;
+}
+
+// Starts a new episode: regenerates the map, places the agent and the goal on
+// distinct empty cells, picks a random heading and writes the first observation.
+void c_reset(FourRooms* env) {
+    if (env->max_steps <= 0) {
+        env->max_steps = FOUR_ROOMS_TIMEOUT_SCALE * env->size;
+    }
+
+    create_four_rooms_grid(env);
+
+    // Rejection-sample interior cells until an empty one comes up.
+    do {
+        env->agent_x = 1 + four_rooms_rand(env, env->size - 2);
+        env->agent_y = 1 + four_rooms_rand(env, env->size - 2);
+    } while (env->grid[grid_idx(env, env->agent_x, env->agent_y)] != EMPTY);
+
+    do {
+        env->goal_x = 1 + four_rooms_rand(env, env->size - 2);
+        env->goal_y = 1 + four_rooms_rand(env, env->size - 2);
+    } while (env->grid[grid_idx(env, env->goal_x, env->goal_y)] != EMPTY ||
+            (env->goal_x == env->agent_x && env->goal_y == env->agent_y));
+
+    env->grid[grid_idx(env, env->agent_x, env->agent_y)] = AGENT;
+    env->grid[grid_idx(env, env->goal_x, env->goal_y)] = GOAL;
+
+    env->agent_dir = four_rooms_rand(env, 4);
+    env->tick = 0;
+    env->episode_return = 0.0f;
+
+    generate_observation(env);
+}
+
+// Advances the episode by one action taken from actions[0]. Reaching the goal
+// pays 1 - 0.9 * tick / max_steps; running out of steps pays 0. Either way the
+// finished episode is logged and the environment resets before returning.
+void c_step(FourRooms* env) {
+    env->tick += 1;
+
+    int action = (int)env->actions[0];
+    env->terminals[0] = 0;
+    env->rewards[0] = 0.0;
+
+    // Clear the agent marker; it is written back once the move is resolved.
+    env->grid[grid_idx(env, env->agent_x, env->agent_y)] = EMPTY;
+
+    int new_x = env->agent_x;
+    int new_y = env->agent_y;
+    int new_dir = env->agent_dir;
+
+    if (action == LEFT) {
+        new_dir = (env->agent_dir + 3) % 4;
+    } else if (action == RIGHT) {
+        new_dir = (env->agent_dir + 1) % 4;
+    } else if (action == FORWARD) {
+        if (env->agent_dir == 0) new_x += 1;
+        else if (env->agent_dir == 1) new_y += 1;
+        else if (env->agent_dir == 2) new_x -= 1;
+        else if (env->agent_dir == 3) new_y -= 1;
+
+        if (new_x >= 0 && new_x < env->size && new_y >= 0 && new_y < env->size &&
+            env->grid[grid_idx(env, new_x, new_y)] != WALL) {
+            env->agent_x = new_x;
+            env->agent_y = new_y;
+        }
+    }
+
+    env->agent_dir = new_dir;
+
+    if (env->agent_x == env->goal_x && env->agent_y == env->goal_y) {
+        env->terminals[0] = 1;
+        env->rewards[0] = 1.0f - 0.9f * (float)env->tick / (float)env->max_steps;
+        env->episode_return += env->rewards[0];
+        add_log(env);
+        c_reset(env);
+        return;
+    }
+
+    env->grid[grid_idx(env, env->agent_x, env->agent_y)] = AGENT;
+
+    if (env->tick >= env->max_steps) {
+        env->terminals[0] = 1;
+        env->rewards[0] = 0.0;
+        env->episode_return += env->rewards[0];
+        add_log(env);
+        c_reset(env);
+        return;
+    }
+
+    env->episode_return += env->rewards[0];
+    generate_observation(env);
+}
+
+// Draws the grid, a translucent overlay over the cells covered by the agent's
+// view and the agent sprite. Opens the window and loads the sprite sheet on first use.
+void c_render(FourRooms* env) {
+    if (!IsWindowReady()) {
+        InitWindow(32*env->size, 32*env->size, "PufferLib FourRooms");
+        SetTargetFPS(10);
+        env->puffers = LoadTexture("resources/shared/puffers_128.png");
+        env->texture_loaded = 1;
+    }
+
+    if (IsKeyDown(KEY_ESCAPE)) {
+        exit(0);
+    }
+
+    BeginDrawing();
+    ClearBackground(PUFF_BACKGROUND);
+
+    int px = 32;
+
+    for (int y = 0; y < env->size; y++) {
+        for (int x = 0; x < env->size; x++) {
+            int cell = env->grid[y * env->size + x];
+            Color color = PUFF_BACKGROUND;
+
+            if (cell == WALL) color = PUFF_BACKGROUND2;
+            else if (cell == GOAL) color = PUFF_RED;
+
+            if (cell != EMPTY && cell != AGENT) {
+                DrawRectangle(x*px, y*px, px, px, color);
+            }
+        }
+    }
+
+    Color obs_overlay = (Color){180, 180, 180, 80};
+    for (int y = 0; y < FOUR_ROOMS_VIEW_SIZE; y++) {
+        for (int x = 0; x < FOUR_ROOMS_VIEW_SIZE; x++) {
+            int world_x, world_y;
+            observation_to_world(env, x, y, &world_x, &world_y);
+            if (world_x >= 0 && world_x < env->size && world_y >= 0 && world_y < env->size) {
+                DrawRectangle(world_x*px, world_y*px, px, px, obs_overlay);
+            }
+        }
+    }
+
+    // Facing -x draws the sprite at x = 128 of the sheet unrotated instead of turning 180 degrees.
+    int starting_sprite_x = 0;
+    int rotation = 90 * env->agent_dir;
+    if (rotation == 180) {
+        starting_sprite_x = 128;
+        rotation = 0;
+    }
+
+    DrawTexturePro(
+        env->puffers,
+        (Rectangle){starting_sprite_x, 0, 128, 128},
+        (Rectangle){
+            env->agent_x * px + px/2,
+            env->agent_y * px + px/2,
+            px,
+            px
+        },
+        (Vector2){px/2, px/2},
+        rotation,
+        WHITE
+    );
+
+    EndDrawing();
+}
+
+// Releases the render resources, if any were created, and the grid buffer.
+void c_close(FourRooms* env) {
+    if (env->texture_loaded) {
+        UnloadTexture(env->puffers);
+        CloseWindow();
+        env->texture_loaded = 0;
+    }
+    free(env->grid);  // free(NULL) is a no-op
+    env->grid = NULL;
+}