exploration.py
# -*- coding: utf-8 -*-
import inspect
import logging

import numpy as np

import schedule

log = logging.getLogger(__name__)

class LinearRandom(schedule.LinearSchedule):
    def __init__(self, env, eps_begin, eps_end, nsteps):
        """
        Args:
            env: gym environment
            eps_begin: float
                initial exploration rate
            eps_end: float
                final exploration rate
            nsteps: int
                number of steps taken to linearly decay eps_begin to eps_end
        """
        self.env = env
        super(LinearRandom, self).__init__(eps_begin, eps_end, nsteps)

    def get_action(self, best_action, **kwargs):
        """
        Returns a random action with prob epsilon, otherwise returns the best_action

        Args:
            best_action: int
                best action according to some policy

        Returns:
            an action
        """
        return (
            best_action if self.do_greedy(**kwargs)  # p(1 - epsilon)
            else self.random_action(**kwargs)  # p(epsilon)
        )

    def do_greedy(self, **kwargs):
        return np.random.rand() > self.epsilon

    def random_action(self, **kwargs):
        return self.env.action_space.sample()


class LinearInverseCount(LinearRandom):
    def random_action(self, state_action_counts, **kwargs):
        inverse_square_counts = 1.0 / np.sqrt(state_action_counts)
        probs = inverse_square_counts / inverse_square_counts.sum()
        actions = range(len(state_action_counts))
        return np.random.choice(actions, p=probs)
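
# Example for LinearInverseCount.random_action with hypothetical counts:
#   state_action_counts = [1, 4, 16] -> 1/sqrt(counts) = [1.0, 0.5, 0.25]
#   -> probs = [4/7, 2/7, 1/7], so less-visited actions are sampled more often.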


class ShapedRandom(LinearRandom):
    def do_greedy(self, episode_num, episode_step, avg_episode_length, **kwargs):
        """
        Determines whether to return the greedy action or a random action

        Args:
            episode_num: int
                index of the current episode
            episode_step: int
                step index within the current episode
            avg_episode_length: float
                running average episode length (may be NaN early in training)

        Returns:
            bool: True == do greedy
        """
        if np.isnan(avg_episode_length):
            return super(ShapedRandom, self).do_greedy()
        if episode_num % 2 == 0:
            # even episodes: exploration ramps up as the episode progresses
            eps_modifier = float(episode_step) / avg_episode_length
        else:
            # odd episodes: explore only once the episode runs longer than average
            eps_modifier = (float(episode_step) - avg_episode_length) / avg_episode_length
        prob_explore = max(0.0, min(1.0, eps_modifier * self.epsilon))
        return np.random.rand() > prob_explore


class ShapedInverseCount(ShapedRandom, LinearInverseCount):
    pass


class NoExploration(LinearRandom):
    def do_greedy(self, **kwargs):
        return True


class StochasticRandom(LinearRandom):
    def do_greedy(self, **kwargs):
        return False

    def random_action(self, q_values, **kwargs):
        e_q = np.exp(q_values - np.max(q_values))
        softmax = e_q / e_q.sum()
        actions = range(len(q_values))
        return np.random.choice(actions, p=softmax)


class StochasticInverseCount(LinearRandom):
    def do_greedy(self, **kwargs):
        return False

    def random_action(self, q_values, state_action_counts, beta, **kwargs):
        # Boltzmann (softmax) distribution over q_values, shifted for numerical stability
        e_q = np.exp(q_values - np.max(q_values))
        softmax = e_q / e_q.sum()
        # count-based term: favour actions with few visits
        inverse_square_counts = 1.0 / np.sqrt(state_action_counts)
        ic_probs = inverse_square_counts / inverse_square_counts.sum()
        # mix the two distributions with weight beta and renormalize;
        # beta = 0 recovers pure softmax exploration
        probs = softmax + (beta * ic_probs)
        probs = probs / probs.sum()
        actions = range(len(q_values))
        return np.random.choice(actions, p=probs)


EXPLORATION_MAP = {
    key: value for key, value in globals().items()
    if inspect.isclass(value) and issubclass(value, schedule.LinearSchedule)
}
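
# --- Usage sketch (not part of the original module) ---
# Assumes a gym-style environment and that schedule.LinearSchedule sets
# self.epsilon from eps_begin and decays it over nsteps; the argument values
# below are illustrative only.
#
#     import gym
#
#     env = gym.make("CartPole-v1")
#     exploration_cls = EXPLORATION_MAP["LinearRandom"]
#     exploration = exploration_cls(env, eps_begin=1.0, eps_end=0.1, nsteps=100000)
#     action = exploration.get_action(best_action=0)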