import sys
import logging
import itertools
import inspect
import numpy as np
np.random.seed(0)
import scipy.optimize
import gym
logging.basicConfig(level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(message)s',
        stream=sys.stdout, datefmt='%H:%M:%S')
env = gym.make('CliffWalking-v0')
for key in vars(env):
    logging.info('%s: %s', key, vars(env)[key])
logging.info('type = %s', inspect.getmro(type(env)))
00:00:01 [INFO] env: <StepAPICompatibility<PassiveEnvChecker<CliffWalkingEnv<CliffWalking-v0>>>>
00:00:01 [INFO] _action_space: None
00:00:01 [INFO] _observation_space: None
00:00:01 [INFO] _reward_range: None
00:00:01 [INFO] _metadata: None
00:00:01 [INFO] new_step_api: True
00:00:01 [INFO] _has_reset: False
00:00:01 [INFO] _disable_render_order_enforcing: False
00:00:01 [INFO] type = (<class 'gym.wrappers.order_enforcing.OrderEnforcing'>, <class 'gym.core.Wrapper'>, <class 'gym.core.Env'>, <class 'typing.Generic'>, <class 'object'>)
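The attributes above belong to the outermost wrapper, which is why most of them are still None. Should a newer Gym release stop forwarding attributes such as nS and nA through its wrapper chain, they can be read from the base environment instead. A minimal sketch, assuming the unwrapped CliffWalking environment still exposes these counts:

# Sketch, assuming wrapper attribute passthrough is unavailable:
# read the tabular model sizes from the unwrapped base environment.
base_env = env.unwrapped
logging.info('nS = %s, nA = %s', base_env.nS, base_env.nA)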
def evaluate_bellman(env, policy, gamma=1., verbose=True):
    if verbose:
        logging.info('policy = %s', policy)
    # Build the linear system (I - gamma * P_pi) v = r_pi over all non-terminal states.
    a, b = np.eye(env.nS), np.zeros(env.nS)
    for state in range(env.nS - 1):
        for action in range(env.nA):
            pi = policy[state][action]
            for p, next_state, reward, terminated in env.P[state][action]:
                a[state, next_state] -= (pi * gamma * p)
                b[state] += (pi * reward * p)
    v = np.linalg.solve(a, b)
    # Back out the action values from the state values.
    q = np.zeros((env.nS, env.nA))
    for state in range(env.nS - 1):
        for action in range(env.nA):
            for p, next_state, reward, terminated in env.P[state][action]:
                q[state][action] += ((reward + gamma * v[next_state]) * p)
    if verbose:
        logging.info('state values = %s', v)
        logging.info('action values = %s', q)
    return v, q
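evaluate_bellman() solves the Bellman expectation equations exactly rather than iteratively. For a policy $\pi$,

$$v_\pi(s) = \sum_a \pi(a \mid s) \sum_{s'} p(s' \mid s, a)\,\bigl[r(s, a, s') + \gamma\, v_\pi(s')\bigr],$$

which is the linear system $(I - \gamma P_\pi)\, v_\pi = r_\pi$. The loops above accumulate exactly its coefficient matrix into a and the expected-reward vector into b, and np.linalg.solve solves it. Action values then follow from $q_\pi(s, a) = \sum_{s'} p(s' \mid s, a)\,\bigl[r(s, a, s') + \gamma\, v_\pi(s')\bigr]$.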
Evaluate Random Policy
policy = np.ones((env.nS, env.nA)) / env.nA
state_values, action_values = evaluate_bellman(env, policy)
00:00:01 [INFO] policy = [[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 ...
 [0.25 0.25 0.25 0.25]]
00:00:02 [INFO] state values = [-65104.83759925 -65038.5827873  -64886.60152707 -64615.82843643
 -64172.32462725 -63470.85252575 -62382.03961363 -60720.87142909
 ...
 -64507.02529817 -63734.75894194 -45570.55257159      0.        ]
00:00:02 [INFO] action values = [[-6.51058376e+04 -6.50395828e+04 -6.51680924e+04 -6.51058376e+04]
 [-6.50395828e+04 -6.48876015e+04 -6.51213092e+04 -6.51058376e+04]
 ...
 [-5.13309495e+04 -1.00000000e+00 -6.54751304e+04 -6.54751304e+04]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]]
Evaluate Optimal Policy
actions = np.ones(env.nS, dtype=int)  # default: move right (action 1)
actions[36] = 0                       # start state (bottom-left corner): move up
actions[11::12] = 2                   # rightmost column: move down
policy = np.eye(env.nA)[actions]      # one-hot rows give a deterministic policy
state_values, action_values = evaluate_bellman(env, policy)
00:00:02 [INFO] policy = [[0. 1. 0. 0.]
 [0. 1. 0. 0.]
 ...
 [0. 0. 1. 0.]]
00:00:02 [INFO] state values = [ -14.  -13.  -12.  -11.  -10.   -9.   -8.   -7.   -6.   -5.   -4.   -3.
  -13.  -12.  -11.  -10.   -9.   -8.   -7.   -6.   -5.   -4.   -3.   -2.
  -12.  -11.  -10.   -9.   -8.   -7.   -6.   -5.   -4.   -3.   -2.   -1.
  -13. -113. -113. -113. -113. -113. -113. -113. -113. -113.   -1.    0.]
00:00:02 [INFO] action values = [[ -15.  -14.  -14.  -15.]
 [ -14.  -13.  -13.  -15.]
 ...
 [  -3.   -1. -113. -113.]
 [   0.    0.    0.    0.]]
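The -113 entries are the cliff penalty: stepping off the edge costs -100 and teleports back to the start state, whose value under this policy is -13. To make the handcrafted policy easier to read, here is a small sketch (not part of the original) that prints the actions array as CliffWalking's 4 x 12 grid, assuming the usual encoding 0=up, 1=right, 2=down, 3=left:

# Sketch: render the deterministic policy as an arrow grid.
ARROWS = '^>v<'  # assumed encoding: 0=up, 1=right, 2=down, 3=left
for row in np.reshape([ARROWS[a] for a in actions], (4, 12)):
    logging.info('%s', ''.join(row))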
Evaluate a Randomly Generated Policy
policy = np.random.uniform(size=(env.nS, env.nA))
policy = policy / np.sum(policy, axis=1, keepdims=True)  # normalize rows to probabilities
state_values, action_values = evaluate_bellman(env, policy)
00:00:02 [INFO] policy = [[0.2275677  0.29655611 0.24993822 0.22593797]
 [0.1766031  0.26924493 0.18241092 0.37174105]
 ...
 [0.44246285 0.18904247 0.24132682 0.12716787]]
00:00:02 [INFO] state values = [-77866.55601456 -77779.04500406 -77608.74917718 -76950.38881904
 -76823.37081231 -76070.41137287 -74859.34590728 -73007.62459752
 ...
 -77599.0123007  -77746.21397371 -55070.21974709      0.        ]
00:00:02 [INFO] action values = [[-7.78675560e+04 -7.77800450e+04 -7.79673884e+04 -7.78675560e+04]
 [-7.77800450e+04 -7.76097492e+04 -7.78475840e+04 -7.78675560e+04]
 ...
 [-6.32929535e+04 -1.00000000e+00 -7.82846490e+04 -7.82846490e+04]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]]
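Solve the Bellman Optimality Equations
The Bellman optimality equations can be solved as a linear program: $v_*$ is the componentwise-smallest $v$ satisfying $v(s) \ge r(s, a) + \gamma \sum_{s'} p(s' \mid s, a)\, v(s')$ for every state-action pair, so

$$\min_v \sum_s v(s) \quad \text{s.t.} \quad v(s) - \gamma \sum_{s'} p(s' \mid s, a)\, v(s') \ge r(s, a) \;\; \forall s, a.$$

The function below builds exactly this program and hands it to scipy.optimize.linprog, with the constraints rearranged into SciPy's $A_{ub} x \le b_{ub}$ form as $(\gamma P - I)\, v \le -r$.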
def optimal_bellman(env, gamma=1., verbose=True):
    # Collect transition probabilities and expected rewards of the tabular model.
    p = np.zeros((env.nS, env.nA, env.nS))
    r = np.zeros((env.nS, env.nA))
    for state in range(env.nS - 1):
        for action in range(env.nA):
            for prob, next_state, reward, terminated in env.P[state][action]:
                p[state, action, next_state] += prob
                r[state, action] += (reward * prob)
    # Minimize sum_s v(s) subject to (gamma * P - I) v <= -r.
    c = np.ones(env.nS)
    a_ub = gamma * p.reshape(-1, env.nS) - \
            np.repeat(np.eye(env.nS), env.nA, axis=0)
    b_ub = -r.reshape(-1)
    bounds = [(None, None),] * env.nS  # state values are unbounded
    res = scipy.optimize.linprog(c, a_ub, b_ub, bounds=bounds,
            method='interior-point')  # SciPy >= 1.11 removed this method; use method='highs' there
    v = res.x
    q = r + gamma * np.dot(p, v)
    if verbose:
        logging.info('optimal state values = %s', v)
        logging.info('optimal action values = %s', q)
    return v, q
optimal_state_values, optimal_action_values = optimal_bellman(env)
00:00:02 [INFO] optimal state values = [-1.40000000e+01 -1.30000000e+01 -1.20000000e+01 -1.10000000e+01
 -1.00000000e+01 -9.00000000e+00 -8.00000000e+00 -7.00000000e+00
 ...
 -4.00000000e+00 -9.99999999e-01  1.82224030e-11]
00:00:02 [INFO] optimal action values = [[ -14.99999999  -13.99999999  -13.99999999  -14.99999999]
 [ -13.99999999  -13.          -13.          -14.99999999]
 ...
 [  -3.           -1.         -113.         -113.        ]
 [   0.            0.            0.            0.        ]]
optimal_actions = optimal_action_values.argmax(axis=1)
logging.info('optimal policy = %s', optimal_actions)
00:00:03 [INFO] optimal policy = [2 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 2 0 0 0 0 0 0 0 0 0 0 1 0]
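Ties in the optimal action values (right and down are equally good away from the cliff) are broken only by tiny interior-point residuals, which is why the greedy policy above sometimes picks down (2) where right (1) would do equally well. As a final sanity check, the Bellman optimality condition $v_*(s) = \max_a q_*(s, a)$ should hold up to the solver tolerance; a short verification sketch (the atol value is an assumption sized to absorb interior-point error):

# Verify v*(s) = max_a q*(s, a) up to solver tolerance.
assert np.allclose(optimal_state_values,
                   optimal_action_values.max(axis=1), atol=1e-6)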