Merge pull request #3 from openai/master

Update my fork
sujitahirrao · Dec 16, 2020 · 5771b8c · 5771b8c
2 parents 703dc59 + c4d0af3
commit 5771b8c
Show file tree

Hide file tree

Showing 16 changed files with 186 additions and 53 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -1,15 +1,13 @@
-dist: xenial
+dist: focal
 language: python
 python:
   - "3.7"
 services:
   - docker
 env:
-  - PY_VER=3.5.6
-  - PY_VER=3.6.8
-  - PY_VER=3.7.3
-  - PY_VER=3.8.1
-
+  - PY_VER=3.6.12
+  - PY_VER=3.7.9
+  - PY_VER=3.8.6
 
 install: "" # so travis doesn't do pip install requirements.txt
 script:
@@ -22,4 +20,4 @@ deploy:
     password: $TWINE_PASSWORD
     on:
         tags: true
-        condition: $PY_VER = 3.5.6
+        condition: $PY_VER = 3.6.12
diff --git a/gym/envs/toy_text/blackjack.py b/gym/envs/toy_text/blackjack.py
@@ -2,9 +2,11 @@
 from gym import spaces
 from gym.utils import seeding
 
+
 def cmp(a, b):
     return float(a > b) - float(a < b)
 
+
 # 1 = Ace, 2-10 = Number cards, Jack/Queen/King = 10
 deck = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10]
 

diff --git a/gym/envs/toy_text/cliffwalking.py b/gym/envs/toy_text/cliffwalking.py
@@ -22,7 +22,7 @@ class CliffWalkingEnv(discrete.DiscreteEnv):
     With inspiration from:
     https://github.com/dennybritz/reinforcement-learning/blob/master/lib/envs/cliff_walking.py
 
-    The board is a 4x12 matrix, with (using Numpy matrix indexing):
+    The board is a 4x12 matrix, with (using NumPy matrix indexing):
         [3, 0] as the start at bottom-left
         [3, 11] as the goal at bottom-right
         [3, 1..10] as the cliff at bottom-center

diff --git a/gym/envs/toy_text/discrete.py b/gym/envs/toy_text/discrete.py
@@ -3,6 +3,7 @@
 from gym import Env, spaces
 from gym.utils import seeding
 
+
 def categorical_sample(prob_n, np_random):
     """
     Sample from categorical distribution
@@ -22,7 +23,7 @@ class DiscreteEnv(Env):
     - P: transitions (*)
     - isd: initial state distribution (**)
 
-    (*) dictionary dict of dicts of lists, where
+    (*) dictionary of dicts of lists, where
       P[s][a] == [(probability, nextstate, reward, done), ...]
     (**) list or array of length nS
 
@@ -31,7 +32,7 @@ class DiscreteEnv(Env):
     def __init__(self, nS, nA, P, isd):
         self.P = P
         self.isd = isd
-        self.lastaction = None # for rendering
+        self.lastaction = None  # for rendering
         self.nS = nS
         self.nA = nA
 
@@ -53,7 +54,7 @@ def reset(self):
     def step(self, a):
         transitions = self.P[self.s][a]
         i = categorical_sample([t[0] for t in transitions], self.np_random)
-        p, s, r, d= transitions[i]
+        p, s, r, d = transitions[i]
         self.s = s
         self.lastaction = a
-        return (int(s), r, d, {"prob" : p})
+        return (int(s), r, d, {"prob": p})
diff --git a/gym/envs/toy_text/guessing_game.py b/gym/envs/toy_text/guessing_game.py
@@ -29,7 +29,7 @@ class GuessingGame(gym.Env):
     The agent will need to use a memory of previously submitted actions and observations
     in order to efficiently explore the available actions
 
-    The purpose is to have agents optimise their exploration parameters (e.g. how far to
+    The purpose is to have agents optimize their exploration parameters (e.g. how far to
     explore from previous actions) based on previous experience. Because the goal changes
     each episode a state-value or action-value function isn't able to provide any additional
     benefit apart from being able to tell whether to increase or decrease the next guess.
@@ -58,6 +58,11 @@ def seed(self, seed=None):
         return [seed]
 
     def step(self, action):
+        if isinstance(action, (int, float)):
+            action = np.array([action])
+        elif isinstance(action, list):
+            action = np.array(action)
+
         assert self.action_space.contains(action)
 
         if action < self.number:

diff --git a/gym/envs/toy_text/hotter_colder.py b/gym/envs/toy_text/hotter_colder.py
@@ -18,12 +18,12 @@ class HotterColder(gym.Env):
     The reward is calculated as:
     (min(action, self.number) + self.range) / (max(action, self.number) + self.range)
 
-    Ideally an agent will be able to recognise the 'scent' of a higher reward and
+    Ideally an agent will be able to recognize the 'scent' of a higher reward and
     increase the rate at which it guesses in that direction until the reward reaches
     its maximum
     """
     def __init__(self):
-        self.range = 1000  # +/- value the randomly select number can be between
+        self.range = 1000  # +/- bound of the randomly selected number
         self.bounds = 2000  # Action space bounds
 
         self.action_space = spaces.Box(low=np.array([-self.bounds]), high=np.array([self.bounds]),
@@ -43,6 +43,11 @@ def seed(self, seed=None):
         return [seed]
 
     def step(self, action):
+        if isinstance(action, (int, float)):
+            action = np.array([action])
+        elif isinstance(action, list):
+            action = np.array(action)
+
         assert self.action_space.contains(action)
 
         if action < self.number:
@@ -65,4 +70,5 @@ def reset(self):
         self.number = self.np_random.uniform(-self.range, self.range)
         self.guess_count = 0
         self.observation = 0
+
         return self.observation
diff --git a/gym/envs/toy_text/nchain.py b/gym/envs/toy_text/nchain.py
@@ -2,6 +2,7 @@
 from gym import spaces
 from gym.utils import seeding
 
+
 class NChainEnv(gym.Env):
     """n-Chain environment
 

diff --git a/gym/envs/toy_text/taxi.py b/gym/envs/toy_text/taxi.py
@@ -25,44 +25,42 @@ class TaxiEnv(discrete.DiscreteEnv):
     Description:
     There are four designated locations in the grid world indicated by R(ed), G(reen), Y(ellow), and B(lue). When the episode starts, the taxi starts off at a random square and the passenger is at a random location. The taxi drives to the passenger's location, picks up the passenger, drives to the passenger's destination (another one of the four specified locations), and then drops off the passenger. Once the passenger is dropped off, the episode ends.
 
-    Observations: 
+    Observations:
     There are 500 discrete states since there are 25 taxi positions, 5 possible locations of the passenger (including the case when the passenger is in the taxi), and 4 destination locations. 
-    
+
     Passenger locations:
     - 0: R(ed)
     - 1: G(reen)
     - 2: Y(ellow)
     - 3: B(lue)
     - 4: in taxi
-    
+
     Destinations:
     - 0: R(ed)
     - 1: G(reen)
     - 2: Y(ellow)
     - 3: B(lue)
-        
+
     Actions:
     There are 6 discrete deterministic actions:
     - 0: move south
     - 1: move north
-    - 2: move east 
-    - 3: move west 
+    - 2: move east
+    - 3: move west
     - 4: pickup passenger
-    - 5: dropoff passenger
-    
-    Rewards: 
+    - 5: drop off passenger
+
+    Rewards:
     There is a default per-step reward of -1,
     except for delivering the passenger, which is +20,
     or executing "pickup" and "drop-off" actions illegally, which is -10.
-    
 
     Rendering:
     - blue: passenger
     - magenta: destination
     - yellow: empty taxi
     - green: full taxi
     - other letters (R, G, Y and B): locations for passengers and destinations
-    
 
     state space is represented by:
         (taxi_row, taxi_col, passenger_location, destination)
@@ -72,7 +70,7 @@ class TaxiEnv(discrete.DiscreteEnv):
     def __init__(self):
         self.desc = np.asarray(MAP, dtype='c')
 
-        self.locs = locs = [(0,0), (0,4), (4,0), (4,3)]
+        self.locs = locs = [(0, 0), (0, 4), (4, 0), (4, 3)]
 
         num_states = 500
         num_rows = 5
@@ -93,7 +91,7 @@ def __init__(self):
                         for action in range(num_actions):
                             # defaults
                             new_row, new_col, new_pass_idx = row, col, pass_idx
-                            reward = -1 # default reward when there is no pickup/dropoff
+                            reward = -1  # default reward when there is no pickup/dropoff
                             done = False
                             taxi_loc = (row, col)
 
@@ -108,7 +106,7 @@ def __init__(self):
                             elif action == 4:  # pickup
                                 if (pass_idx < 4 and taxi_loc == locs[pass_idx]):
                                     new_pass_idx = 4
-                                else: # passenger not at location
+                                else:  # passenger not at location
                                     reward = -10
                             elif action == 5:  # dropoff
                                 if (taxi_loc == locs[dest_idx]) and pass_idx == 4:
@@ -117,7 +115,7 @@ def __init__(self):
                                     reward = 20
                                 elif (taxi_loc in locs) and pass_idx == 4:
                                     new_pass_idx = locs.index(taxi_loc)
-                                else: # dropoff at wrong location
+                                else:  # dropoff at wrong location
                                     reward = -10
                             new_state = self.encode(
                                 new_row, new_col, new_pass_idx, dest_idx)
@@ -172,7 +170,8 @@ def ul(x): return "_" if x == " " else x
         outfile.write("\n".join(["".join(row) for row in out]) + "\n")
         if self.lastaction is not None:
             outfile.write("  ({})\n".format(["South", "North", "East", "West", "Pickup", "Dropoff"][self.lastaction]))
-        else: outfile.write("\n")
+        else:
+            outfile.write("\n")
 
         # No need to return anything for human
         if mode != 'human':

diff --git a/gym/spaces/tests/test_utils.py b/gym/spaces/tests/test_utils.py
@@ -1,9 +1,9 @@
 from collections import OrderedDict
+
 import numpy as np
 import pytest
 
-from gym.spaces import utils
-from gym.spaces import Tuple, Box, Discrete, MultiDiscrete, MultiBinary, Dict
+from gym.spaces import Box, Dict, Discrete, MultiBinary, MultiDiscrete, Tuple, utils
 
 
 @pytest.mark.parametrize(["space", "flatdim"], [
@@ -118,3 +118,55 @@ def compare_nested(left, right):
         return res
     else:
         return left == right
+
+'''
+Expected flattened types are based on:
+1. The type that the space is hardcoded as (i.e. multi_discrete=np.int64, discrete=np.int64, multi_binary=np.int8)
+2. The type that the space is instantiated with (i.e. box=np.float32 by default unless instantiated with a different type)
+3. The smallest type that the composite space (tuple, dict) can be represented as. In flatten, this is determined
+   internally by numpy when np.concatenate is called.
+'''
+@pytest.mark.parametrize(["original_space", "expected_flattened_dtype"], [
+    (Discrete(3), np.int64),
+    (Box(low=0., high=np.inf, shape=(2, 2)), np.float32),
+    (Box(low=0., high=np.inf, shape=(2, 2), dtype=np.float16), np.float16),
+    (Tuple([Discrete(5), Discrete(10)]), np.int64),
+    (Tuple([Discrete(5), Box(low=np.array([0, 0]), high=np.array([1, 5]), dtype=np.float32)]), np.float64),
+    (Tuple((Discrete(5), Discrete(2), Discrete(2))), np.int64),
+    (MultiDiscrete([2, 2, 100]), np.int64),
+    (MultiBinary(10), np.int8),
+    (Dict({"position": Discrete(5),
+           "velocity": Box(low=np.array([0, 0]), high=np.array([1, 5]), dtype=np.float16)}), np.float64),
+])
+def test_dtypes(original_space, expected_flattened_dtype):
+    flattened_space = utils.flatten_space(original_space)
+
+    original_sample = original_space.sample()
+    flattened_sample = utils.flatten(original_space, original_sample)
+    unflattened_sample = utils.unflatten(original_space, flattened_sample)
+
+    assert flattened_space.contains(flattened_sample), "Expected flattened_space to contain flattened_sample"
+    assert flattened_space.dtype == expected_flattened_dtype, "Expected flattened_space's dtype to equal " \
+                                                              "{}".format(expected_flattened_dtype)
+
+    assert flattened_sample.dtype == flattened_space.dtype, "Expected flattened_space's dtype to equal " \
+                                                            "flattened_sample's dtype "
+
+    compare_sample_types(original_space, original_sample, unflattened_sample)
+
+
+def compare_sample_types(original_space, original_sample, unflattened_sample):
+    if isinstance(original_space, Discrete):
+        assert isinstance(unflattened_sample, int), "Expected unflattened_sample to be an int. unflattened_sample: " \
+                                                    "{} original_sample: {}".format(unflattened_sample, original_sample)
+    elif isinstance(original_space, Tuple):
+        for index in range(len(original_space)):
+            compare_sample_types(original_space.spaces[index], original_sample[index], unflattened_sample[index])
+    elif isinstance(original_space, Dict):
+        for key, space in original_space.spaces.items():
+            compare_sample_types(space, original_sample[key], unflattened_sample[key])
+    else:
+        assert unflattened_sample.dtype == original_sample.dtype, "Expected unflattened_sample's dtype to equal " \
+                                                                  "original_sample's dtype. unflattened_sample: " \
+                                                                  "{} original_sample: {}".format(unflattened_sample,
+                                                                                                  original_sample)
diff --git a/gym/spaces/utils.py b/gym/spaces/utils.py
@@ -43,10 +43,10 @@ def flatten(space, x):
     ``gym.spaces``.
     """
     if isinstance(space, Box):
-        return np.asarray(x, dtype=np.float32).flatten()
+        return np.asarray(x, dtype=space.dtype).flatten()
     elif isinstance(space, Discrete):
-        onehot = np.zeros(space.n, dtype=np.float32)
-        onehot[x] = 1.0
+        onehot = np.zeros(space.n, dtype=space.dtype)
+        onehot[x] = 1
         return onehot
     elif isinstance(space, Tuple):
         return np.concatenate(
@@ -55,9 +55,9 @@ def flatten(space, x):
         return np.concatenate(
             [flatten(s, x[key]) for key, s in space.spaces.items()])
     elif isinstance(space, MultiBinary):
-        return np.asarray(x).flatten()
+        return np.asarray(x, dtype=space.dtype).flatten()
     elif isinstance(space, MultiDiscrete):
-        return np.asarray(x).flatten()
+        return np.asarray(x, dtype=space.dtype).flatten()
     else:
         raise NotImplementedError
 
@@ -73,7 +73,7 @@ def unflatten(space, x):
     defined in ``gym.spaces``.
     """
     if isinstance(space, Box):
-        return np.asarray(x, dtype=np.float32).reshape(space.shape)
+        return np.asarray(x, dtype=space.dtype).reshape(space.shape)
     elif isinstance(space, Discrete):
         return int(np.nonzero(x)[0][0])
     elif isinstance(space, Tuple):
@@ -94,9 +94,9 @@ def unflatten(space, x):
         ]
         return OrderedDict(list_unflattened)
     elif isinstance(space, MultiBinary):
-        return np.asarray(x).reshape(space.shape)
+        return np.asarray(x, dtype=space.dtype).reshape(space.shape)
     elif isinstance(space, MultiDiscrete):
-        return np.asarray(x).reshape(space.shape)
+        return np.asarray(x, dtype=space.dtype).reshape(space.shape)
     else:
         raise NotImplementedError
 
@@ -140,26 +140,33 @@ def flatten_space(space):
         True
     """
     if isinstance(space, Box):
-        return Box(space.low.flatten(), space.high.flatten())
+        return Box(space.low.flatten(), space.high.flatten(), dtype=space.dtype)
     if isinstance(space, Discrete):
-        return Box(low=0, high=1, shape=(space.n, ))
+        return Box(low=0, high=1, shape=(space.n, ), dtype=space.dtype)
     if isinstance(space, Tuple):
         space = [flatten_space(s) for s in space.spaces]
         return Box(
             low=np.concatenate([s.low for s in space]),
             high=np.concatenate([s.high for s in space]),
+            dtype=np.result_type(*[s.dtype for s in space])
         )
     if isinstance(space, Dict):
         space = [flatten_space(s) for s in space.spaces.values()]
         return Box(
             low=np.concatenate([s.low for s in space]),
             high=np.concatenate([s.high for s in space]),
+            dtype=np.result_type(*[s.dtype for s in space])
         )
     if isinstance(space, MultiBinary):
-        return Box(low=0, high=1, shape=(space.n, ))
+        return Box(low=0,
+                   high=1,
+                   shape=(space.n, ),
+                   dtype=space.dtype
+                   )
     if isinstance(space, MultiDiscrete):
         return Box(
             low=np.zeros_like(space.nvec),
             high=space.nvec,
+            dtype=space.dtype
         )
     raise NotImplementedError