import json
import csv
import sys
import os
from collections import deque
from tqdm import tqdm
import numpy as np
import torch
from datetime import datetime as dt
from lib.agents import Agent_DQN as dqn
from lib.agents import Agent_DoubleDQN as ddqn
from lib.agents import qNetwork as qN
from lib.envs import envUnity
from lib.envs import envGym
from lib.utils import ReplayBuffer as RB
from torch.nn import functional as F
def trainAgentGymEpsGreedy(configAgent):
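    """Train a DQN/DoubleDQN agent on a Gym environment with an
    epsilon-greedy policy.

    Builds an online and a target Q-network from ``configAgent``,
    seeds a replay buffer, then trains for ``nIterations`` episodes,
    checkpointing to ``saveFolder`` whenever the greedy score improves.

    Returns a dict with per-episode scores, sliding (100-episode mean)
    scores, and checkpoint locations.
    """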
try:
now = dt.now().strftime(r'%Y-%m-%d--%H-%M-%S')
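        # Map activation-function names from the config to torch callables.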
functionMaps = {
'relu': F.relu,
'relu6': F.relu6,
'elu': F.elu,
'celu': F.celu,
'selu': F.selu,
'prelu': F.prelu,
'rrelu': F.rrelu,
'glu': F.glu,
'tanh': torch.tanh,
'hardtanh': F.hardtanh
}
# Config parameters
# --------------------------
memorySize = configAgent['memorySize']
envName = configAgent['envName']
nIterations = configAgent['nIterations']
initMemoryIterations = configAgent['initMemoryIterations']
eps0 = configAgent['eps0']
epsDecay = configAgent['epsDecay']
minEps = configAgent['minEps']
maxSteps = configAgent['maxSteps']
nSamples = configAgent['nSamples']
Tau = configAgent['Tau']
inpSize = configAgent['inpSize']
outSize = configAgent['outSize']
hiddenSizes = configAgent['hiddenSizes']
hiddenActivations = configAgent['hiddenActivations']
lr = configAgent['lr']
N = configAgent['N']
sigma = configAgent['sigma']
loadFolder = configAgent['loadFolder']
saveFolder = configAgent['saveFolder']
hiddenActivations = [functionMaps[m] for m in hiddenActivations]
slidingScore = deque(maxlen=100)
prevBest = -np.inf
allResults = {
"scores": [],
"slidingScores": [],
"saveLocations": [],
}
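        # Two Q-networks: QNslow serves as the target network and QNfast as
        # the online network. The input size is inpSize*N to accommodate the
        # N observations stacked by the Env1D wrapper.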
QNslow = qN.qNetworkDiscrete(
inpSize*N, outSize, hiddenSizes, activations=hiddenActivations, lr=lr)
QNfast = qN.qNetworkDiscrete(
inpSize*N, outSize, hiddenSizes, activations=hiddenActivations, lr=lr)
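        # Replay buffer holding up to memorySize transitions.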
memoryBuffer = RB.SimpleReplayBuffer(memorySize)
with envGym.Env1D(envName, N=N, showEnv=False) as env:
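            # Note: the agent is constructed on 'cuda:0'; a CUDA-capable
            # GPU is assumed here.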
if configAgent['agentType'] == 'DQN':
agent = dqn.Agent_DQN(
env, memoryBuffer, QNslow, QNfast, numActions=outSize, gamma=1, device='cuda:0')
if configAgent['agentType'] == 'DoubleDQN':
agent = ddqn.Agent_DoubleDQN(
env, memoryBuffer, QNslow, QNfast, numActions=outSize, gamma=1, device='cuda:0')
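            # Optionally warm-start from a previously saved checkpoint.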
if loadFolder:
                agent.load(loadFolder, 'agent_0')
agent.eval()
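            # Exploration policy: a fresh agent explores with a high sigma
            # (1), while a warm-started agent stays near-greedy (0.01).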
if not loadFolder:
def policy(m): return [agent.sigmaMaxAction(m, 1)]
else:
def policy(m): return [agent.sigmaMaxAction(m, 0.01)]
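            # Discard any stale transitions and sync the two networks
            # before generating fresh memory.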
agent.memory.clear()
agent.fastUpdate()
print('Generating some initial memory ...')
for i in tqdm(range(initMemoryIterations)):
score = agent.memoryUpdateEpisode(
policy, maxSteps=maxSteps, minScoreToAdd=None)
tqdm.write(f'score = {score}')
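            # Epsilon-greedy schedule: start at eps0 and decay geometrically
            # towards minEps.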
eps = eps0
print('Optimizing model ...')
for i in tqdm(range(nIterations)):
eps = max(minEps, epsDecay*eps) # decrease epsilon
                # Rebuild the policy closure so it uses the decayed epsilon
def policy(m): return [agent.sigmaMaxAction(m, eps)]
agent.memoryUpdateEpisode(policy, maxSteps=maxSteps)
agent.step(nSamples=nSamples, sigma=sigma)
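                # Soft (Polyak) update: blend the online weights into the
                # target network with mixing factor Tau.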
agent.softUpdate(Tau)
                # Evaluate the current greedy policy for one episode
results = env.episode(
lambda m: [agent.maxAction(m)], maxSteps)[0]
s, a, r, ns, f = zip(*results)
score = sum(r)
slidingScore.append(score)
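                # Every 20 episodes, hard-sync the networks via fastUpdate()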
if i % 20 == 0:
agent.fastUpdate()
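                # Log and checkpoint whenever the greedy score improves on
                # the previous best.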
                if score > prevBest:
                    tqdm.write('score = {}, max = {}, sliding score = {}, eps = {}'.format(
                        score, max(r), np.mean(slidingScore), eps))
                    if saveFolder:
                        folder = os.path.join(saveFolder, f'{now}_{i:05d}_{int(score)}')
                        os.makedirs(folder)
                        agent.save(folder, 'agent_0')
                        allResults['saveLocations'].append((score, folder))
                        with open(os.path.join(folder, 'configAgent.json'), 'w') as fOut:
                            json.dump(configAgent, fOut)
                    # Update the best score even when no saveFolder is given,
                    # so each improvement is only logged once.
                    prevBest = score
allResults['scores'].append(score)
allResults['slidingScores'].append(np.mean(slidingScore))
return allResults
    except Exception as e:
        # Re-raise with context about where the failure occurred.
        raise type(e)(
            'trainAgentGymEpsGreedy - ERROR - ' + str(e)
        ).with_traceback(sys.exc_info()[2])
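

# Example usage: a minimal sketch. The key names below mirror the lookups
# performed inside trainAgentGymEpsGreedy, but every value (including the
# environment name and network sizes) is an assumption and should be
# adapted to your environment and hardware.
if __name__ == '__main__':
    exampleConfig = {
        'agentType': 'DoubleDQN',           # or 'DQN'
        'envName': 'CartPole-v1',           # assumed Gym environment id
        'memorySize': 10000,
        'nIterations': 500,
        'initMemoryIterations': 50,
        'eps0': 1.0,
        'epsDecay': 0.995,
        'minEps': 0.01,
        'maxSteps': 500,
        'nSamples': 64,
        'Tau': 0.005,
        'inpSize': 4,                       # observation size of the env
        'outSize': 2,                       # number of discrete actions
        'hiddenSizes': [64, 64],
        'hiddenActivations': ['relu', 'relu'],
        'lr': 1e-3,
        'N': 1,                             # number of stacked observations
        'sigma': 0.1,
        'loadFolder': None,                 # no warm start
        'saveFolder': 'results/checkpoints',
    }
    results = trainAgentGymEpsGreedy(exampleConfig)
    print('final sliding score:', results['slidingScores'][-1])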