Skip to content

Commit

Permalink
Merge pull request #3 from openai/master
Browse files Browse the repository at this point in the history
Update my fork
  • Loading branch information
sujitahirrao committed Dec 16, 2020
2 parents 703dc59 + c4d0af3 commit 5771b8c
Show file tree
Hide file tree
Showing 16 changed files with 186 additions and 53 deletions.
12 changes: 5 additions & 7 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
dist: xenial
dist: focal
language: python
python:
- "3.7"
services:
- docker
env:
- PY_VER=3.5.6
- PY_VER=3.6.8
- PY_VER=3.7.3
- PY_VER=3.8.1

- PY_VER=3.6.12
- PY_VER=3.7.9
- PY_VER=3.8.6

install: "" # so travis doesn't do pip install requirements.txt
script:
Expand All @@ -22,4 +20,4 @@ deploy:
password: $TWINE_PASSWORD
on:
tags: true
condition: $PY_VER = 3.5.6
condition: $PY_VER = 3.6.12
2 changes: 2 additions & 0 deletions gym/envs/toy_text/blackjack.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
from gym import spaces
from gym.utils import seeding


def cmp(a, b):
    """Three-way comparison of ``a`` and ``b``.

    Returns 1.0 when ``a > b``, -1.0 when ``a < b`` and 0.0 when neither
    comparison holds.
    """
    is_greater = float(a > b)
    is_smaller = float(a < b)
    return is_greater - is_smaller


# 1 = Ace, 2-10 = Number cards, Jack/Queen/King = 10
deck = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10]

Expand Down
2 changes: 1 addition & 1 deletion gym/envs/toy_text/cliffwalking.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class CliffWalkingEnv(discrete.DiscreteEnv):
With inspiration from:
https://github.com/dennybritz/reinforcement-learning/blob/master/lib/envs/cliff_walking.py
The board is a 4x12 matrix, with (using Numpy matrix indexing):
The board is a 4x12 matrix, with (using NumPy matrix indexing):
[3, 0] as the start at bottom-left
[3, 11] as the goal at bottom-right
[3, 1..10] as the cliff at bottom-center
Expand Down
9 changes: 5 additions & 4 deletions gym/envs/toy_text/discrete.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from gym import Env, spaces
from gym.utils import seeding


def categorical_sample(prob_n, np_random):
"""
Sample from categorical distribution
Expand All @@ -22,7 +23,7 @@ class DiscreteEnv(Env):
- P: transitions (*)
- isd: initial state distribution (**)
(*) dictionary dict of dicts of lists, where
(*) dictionary of dicts of lists, where
P[s][a] == [(probability, nextstate, reward, done), ...]
(**) list or array of length nS
Expand All @@ -31,7 +32,7 @@ class DiscreteEnv(Env):
def __init__(self, nS, nA, P, isd):
self.P = P
self.isd = isd
self.lastaction = None # for rendering
self.lastaction = None # for rendering
self.nS = nS
self.nA = nA

Expand All @@ -53,7 +54,7 @@ def reset(self):
def step(self, a):
transitions = self.P[self.s][a]
i = categorical_sample([t[0] for t in transitions], self.np_random)
p, s, r, d= transitions[i]
p, s, r, d = transitions[i]
self.s = s
self.lastaction = a
return (int(s), r, d, {"prob" : p})
return (int(s), r, d, {"prob": p})
7 changes: 6 additions & 1 deletion gym/envs/toy_text/guessing_game.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ class GuessingGame(gym.Env):
The agent will need to use a memory of previously submitted actions and observations
in order to efficiently explore the available actions
The purpose is to have agents optimise their exploration parameters (e.g. how far to
The purpose is to have agents optimize their exploration parameters (e.g. how far to
explore from previous actions) based on previous experience. Because the goal changes
each episode a state-value or action-value function isn't able to provide any additional
benefit apart from being able to tell whether to increase or decrease the next guess.
Expand Down Expand Up @@ -58,6 +58,11 @@ def seed(self, seed=None):
return [seed]

def step(self, action):
if isinstance(action, (int, float)):
action = np.array([action])
elif isinstance(action, list):
action = np.array(action)

assert self.action_space.contains(action)

if action < self.number:
Expand Down
10 changes: 8 additions & 2 deletions gym/envs/toy_text/hotter_colder.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,12 @@ class HotterColder(gym.Env):
The reward is calculated as:
(min(action, self.number) + self.range) / (max(action, self.number) + self.range)
Ideally an agent will be able to recognise the 'scent' of a higher reward and
Ideally an agent will be able to recognize the 'scent' of a higher reward and
increase the rate at which it guesses in that direction until the reward reaches
its maximum
"""
def __init__(self):
self.range = 1000 # +/- value the randomly select number can be between
self.range = 1000 # +/- the value number can be between
self.bounds = 2000 # Action space bounds

self.action_space = spaces.Box(low=np.array([-self.bounds]), high=np.array([self.bounds]),
Expand All @@ -43,6 +43,11 @@ def seed(self, seed=None):
return [seed]

def step(self, action):
if isinstance(action, (int, float)):
action = np.array([action])
elif isinstance(action, list):
action = np.array(action)

assert self.action_space.contains(action)

if action < self.number:
Expand All @@ -65,4 +70,5 @@ def reset(self):
self.number = self.np_random.uniform(-self.range, self.range)
self.guess_count = 0
self.observation = 0

return self.observation
1 change: 1 addition & 0 deletions gym/envs/toy_text/nchain.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from gym import spaces
from gym.utils import seeding


class NChainEnv(gym.Env):
"""n-Chain environment
Expand Down
31 changes: 15 additions & 16 deletions gym/envs/toy_text/taxi.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,44 +25,42 @@ class TaxiEnv(discrete.DiscreteEnv):
Description:
There are four designated locations in the grid world indicated by R(ed), G(reen), Y(ellow), and B(lue). When the episode starts, the taxi starts off at a random square and the passenger is at a random location. The taxi drives to the passenger's location, picks up the passenger, drives to the passenger's destination (another one of the four specified locations), and then drops off the passenger. Once the passenger is dropped off, the episode ends.
Observations:
Observations:
There are 500 discrete states since there are 25 taxi positions, 5 possible locations of the passenger (including the case when the passenger is in the taxi), and 4 destination locations.
Passenger locations:
- 0: R(ed)
- 1: G(reen)
- 2: Y(ellow)
- 3: B(lue)
- 4: in taxi
Destinations:
- 0: R(ed)
- 1: G(reen)
- 2: Y(ellow)
- 3: B(lue)
Actions:
There are 6 discrete deterministic actions:
- 0: move south
- 1: move north
- 2: move east
- 3: move west
- 2: move east
- 3: move west
- 4: pickup passenger
- 5: dropoff passenger
Rewards:
- 5: drop off passenger
Rewards:
There is a default per-step reward of -1,
except for delivering the passenger, which is +20,
or executing "pickup" and "drop-off" actions illegally, which is -10.
Rendering:
- blue: passenger
- magenta: destination
- yellow: empty taxi
- green: full taxi
- other letters (R, G, Y and B): locations for passengers and destinations
state space is represented by:
(taxi_row, taxi_col, passenger_location, destination)
Expand All @@ -72,7 +70,7 @@ class TaxiEnv(discrete.DiscreteEnv):
def __init__(self):
self.desc = np.asarray(MAP, dtype='c')

self.locs = locs = [(0,0), (0,4), (4,0), (4,3)]
self.locs = locs = [(0, 0), (0, 4), (4, 0), (4, 3)]

num_states = 500
num_rows = 5
Expand All @@ -93,7 +91,7 @@ def __init__(self):
for action in range(num_actions):
# defaults
new_row, new_col, new_pass_idx = row, col, pass_idx
reward = -1 # default reward when there is no pickup/dropoff
reward = -1 # default reward when there is no pickup/dropoff
done = False
taxi_loc = (row, col)

Expand All @@ -108,7 +106,7 @@ def __init__(self):
elif action == 4: # pickup
if (pass_idx < 4 and taxi_loc == locs[pass_idx]):
new_pass_idx = 4
else: # passenger not at location
else: # passenger not at location
reward = -10
elif action == 5: # dropoff
if (taxi_loc == locs[dest_idx]) and pass_idx == 4:
Expand All @@ -117,7 +115,7 @@ def __init__(self):
reward = 20
elif (taxi_loc in locs) and pass_idx == 4:
new_pass_idx = locs.index(taxi_loc)
else: # dropoff at wrong location
else: # dropoff at wrong location
reward = -10
new_state = self.encode(
new_row, new_col, new_pass_idx, dest_idx)
Expand Down Expand Up @@ -172,7 +170,8 @@ def ul(x): return "_" if x == " " else x
outfile.write("\n".join(["".join(row) for row in out]) + "\n")
if self.lastaction is not None:
outfile.write(" ({})\n".format(["South", "North", "East", "West", "Pickup", "Dropoff"][self.lastaction]))
else: outfile.write("\n")
else:
outfile.write("\n")

# No need to return anything for human
if mode != 'human':
Expand Down
56 changes: 54 additions & 2 deletions gym/spaces/tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from collections import OrderedDict

import numpy as np
import pytest

from gym.spaces import utils
from gym.spaces import Tuple, Box, Discrete, MultiDiscrete, MultiBinary, Dict
from gym.spaces import Box, Dict, Discrete, MultiBinary, MultiDiscrete, Tuple, utils


@pytest.mark.parametrize(["space", "flatdim"], [
Expand Down Expand Up @@ -118,3 +118,55 @@ def compare_nested(left, right):
return res
else:
return left == right

'''
Expected flattened types are based on:
1. The type that the space is hardcoded as (i.e. multi_discrete=np.int64, discrete=np.int64, multi_binary=np.int8)
2. The type that the space is instantiated with (i.e. box=np.float32 by default unless instantiated with a different type)
3. The smallest type that the composite space (tuple, dict) can be represented as. In flatten, this is determined
internally by numpy when np.concatenate is called.
'''
@pytest.mark.parametrize(["original_space", "expected_flattened_dtype"], [
    (Discrete(3), np.int64),
    (Box(low=0., high=np.inf, shape=(2, 2)), np.float32),
    (Box(low=0., high=np.inf, shape=(2, 2), dtype=np.float16), np.float16),
    (Tuple([Discrete(5), Discrete(10)]), np.int64),
    (
        Tuple([
            Discrete(5),
            Box(low=np.array([0, 0]), high=np.array([1, 5]), dtype=np.float32),
        ]),
        np.float64,
    ),
    (Tuple((Discrete(5), Discrete(2), Discrete(2))), np.int64),
    (MultiDiscrete([2, 2, 100]), np.int64),
    (MultiBinary(10), np.int8),
    (
        Dict({
            "position": Discrete(5),
            "velocity": Box(low=np.array([0, 0]), high=np.array([1, 5]), dtype=np.float16),
        }),
        np.float64,
    ),
])
def test_dtypes(original_space, expected_flattened_dtype):
    """Check the dtypes produced by flatten_space, flatten and unflatten.

    For each parametrized space this verifies that:
      * the flattened space contains the flattened sample,
      * the flattened space has the expected dtype,
      * the flattened sample has the same dtype as the flattened space,
      * unflattening the sample gives back values of the original types.
    """
    flat_space = utils.flatten_space(original_space)

    sample = original_space.sample()
    flat_sample = utils.flatten(original_space, sample)
    restored_sample = utils.unflatten(original_space, flat_sample)

    assert flat_space.contains(flat_sample), (
        "Expected flattened_space to contain flattened_sample"
    )
    assert flat_space.dtype == expected_flattened_dtype, (
        "Expected flattened_space's dtype to equal "
        "{}"
    ).format(expected_flattened_dtype)

    # Note: the trailing space in this message is kept as in the original.
    assert flat_sample.dtype == flat_space.dtype, (
        "Expected flattened_space's dtype to equal "
        "flattened_sample's dtype "
    )

    compare_sample_types(original_space, sample, restored_sample)


def compare_sample_types(original_space, original_sample, unflattened_sample):
    """Recursively assert that an unflattened sample has the original's types.

    Args:
        original_space: the space the samples belong to.
        original_sample: a sample drawn from ``original_space``.
        unflattened_sample: the result of flattening and then unflattening
            ``original_sample``.

    Raises:
        AssertionError: if a ``Discrete`` sample is not an ``int`` after the
            round trip, or if a leaf sample's dtype differs from the
            original sample's dtype.
    """
    if isinstance(original_space, Discrete):
        assert isinstance(unflattened_sample, int), (
            "Expected unflattened_sample to be an int. unflattened_sample: "
            "{} original_sample: {}"
        ).format(unflattened_sample, original_sample)
    elif isinstance(original_space, Tuple):
        # Walk the sub-spaces directly, mirroring the Dict branch below,
        # instead of indexing through range(len(...)).
        for index, subspace in enumerate(original_space.spaces):
            compare_sample_types(subspace, original_sample[index], unflattened_sample[index])
    elif isinstance(original_space, Dict):
        for key, subspace in original_space.spaces.items():
            compare_sample_types(subspace, original_sample[key], unflattened_sample[key])
    else:
        # Remaining spaces yield array-like samples; compare their dtypes.
        assert unflattened_sample.dtype == original_sample.dtype, (
            "Expected unflattened_sample's dtype to equal "
            "original_sample's dtype. unflattened_sample: "
            "{} original_sample: {}"
        ).format(unflattened_sample, original_sample)
29 changes: 18 additions & 11 deletions gym/spaces/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,10 @@ def flatten(space, x):
``gym.spaces``.
"""
if isinstance(space, Box):
return np.asarray(x, dtype=np.float32).flatten()
return np.asarray(x, dtype=space.dtype).flatten()
elif isinstance(space, Discrete):
onehot = np.zeros(space.n, dtype=np.float32)
onehot[x] = 1.0
onehot = np.zeros(space.n, dtype=space.dtype)
onehot[x] = 1
return onehot
elif isinstance(space, Tuple):
return np.concatenate(
Expand All @@ -55,9 +55,9 @@ def flatten(space, x):
return np.concatenate(
[flatten(s, x[key]) for key, s in space.spaces.items()])
elif isinstance(space, MultiBinary):
return np.asarray(x).flatten()
return np.asarray(x, dtype=space.dtype).flatten()
elif isinstance(space, MultiDiscrete):
return np.asarray(x).flatten()
return np.asarray(x, dtype=space.dtype).flatten()
else:
raise NotImplementedError

Expand All @@ -73,7 +73,7 @@ def unflatten(space, x):
defined in ``gym.spaces``.
"""
if isinstance(space, Box):
return np.asarray(x, dtype=np.float32).reshape(space.shape)
return np.asarray(x, dtype=space.dtype).reshape(space.shape)
elif isinstance(space, Discrete):
return int(np.nonzero(x)[0][0])
elif isinstance(space, Tuple):
Expand All @@ -94,9 +94,9 @@ def unflatten(space, x):
]
return OrderedDict(list_unflattened)
elif isinstance(space, MultiBinary):
return np.asarray(x).reshape(space.shape)
return np.asarray(x, dtype=space.dtype).reshape(space.shape)
elif isinstance(space, MultiDiscrete):
return np.asarray(x).reshape(space.shape)
return np.asarray(x, dtype=space.dtype).reshape(space.shape)
else:
raise NotImplementedError

Expand Down Expand Up @@ -140,26 +140,33 @@ def flatten_space(space):
True
"""
if isinstance(space, Box):
return Box(space.low.flatten(), space.high.flatten())
return Box(space.low.flatten(), space.high.flatten(), dtype=space.dtype)
if isinstance(space, Discrete):
return Box(low=0, high=1, shape=(space.n, ))
return Box(low=0, high=1, shape=(space.n, ), dtype=space.dtype)
if isinstance(space, Tuple):
space = [flatten_space(s) for s in space.spaces]
return Box(
low=np.concatenate([s.low for s in space]),
high=np.concatenate([s.high for s in space]),
dtype=np.result_type(*[s.dtype for s in space])
)
if isinstance(space, Dict):
space = [flatten_space(s) for s in space.spaces.values()]
return Box(
low=np.concatenate([s.low for s in space]),
high=np.concatenate([s.high for s in space]),
dtype=np.result_type(*[s.dtype for s in space])
)
if isinstance(space, MultiBinary):
return Box(low=0, high=1, shape=(space.n, ))
return Box(low=0,
high=1,
shape=(space.n, ),
dtype=space.dtype
)
if isinstance(space, MultiDiscrete):
return Box(
low=np.zeros_like(space.nvec),
high=space.nvec,
dtype=space.dtype
)
raise NotImplementedError

0 comments on commit 5771b8c

Please sign in to comment.