import os
import sys

import numpy as np
import torch
class Agent_DoubleDQN:

    def __init__(self, env, memory, qNetworkSlow, qNetworkFast, numActions, gamma, device='cpu'):
        '''A class allowing the training of the double DQN

        This class is intended to be used by functions within the ``lib.agents.trainAgents``
        module. It is intended to be an improvement over the plain DQN. Details of the idea
        behind double DQN are presented in the original paper:

        Deep Reinforcement Learning with Double Q-learning
        https://arxiv.org/pdf/1509.06461.pdf

        See the ``step()`` function for the details of its implementation.

        Parameters
        ----------
        env : instance of an Env class
            The environment that will be used for generating the result of a particular
            action in the current state
        memory : instance of the Memory class
            The replay buffer that allows one to store and retrieve previously encountered
            states that can be used for training.
        qNetworkSlow : neural network instance
            This is a neural network instance that can be used for converting a state into
            a set of Q-values. This is the slower version, used for making a prediction,
            and is never trained directly. Its parameters are updated slowly over time so
            that it converges to the right values.
        qNetworkFast : neural network instance
            This is the instance of the faster network that is trained by the Q-learning
            algorithm. This is the main network that implements the Bellman equation.
        numActions : int
            The number of discrete actions that the current environment can accept.
        gamma : float
            The discount factor. Currently not used.
        device : str, optional
            The device on which to run the algorithm, by default 'cpu'. If you want to run
            the optimization on a particular GPU, you may specify that, for example with
            'cuda:0'.

        Raises
        ------
        Exception
            Any error raised during initialization is re-raised with a message identifying
            this method.
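
        Examples
        --------
        A minimal training sketch (not part of the original module): it assumes that
        ``env``, ``memory``, and the two networks already implement the interfaces used
        by this class, and that the environment has 4 discrete actions::

            agent = Agent_DoubleDQN(
                env, memory, qNetworkSlow, qNetworkFast,
                numActions=4, gamma=0.99, device='cuda:0')
            agent.eval()

            for episode in range(200):
                # gather experience with an epsilon-greedy policy, then learn
                score = agent.memoryUpdateEpisode(agent.epsGreedyAction, maxSteps=1000)
                agent.step(nSamples=100)
                agent.softUpdate(tau=0.1)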
        '''
        try:
            # Honor the requested device only when CUDA is available;
            # otherwise fall back to the CPU.
            if torch.cuda.is_available():
                self.device = device
            else:
                self.device = 'cpu'

            self.env = env
            self.memory = memory
            self.qNetworkSlow = qNetworkSlow.to(self.device)
            self.qNetworkFast = qNetworkFast.to(self.device)
            self.gamma = torch.as_tensor(gamma).float().to(self.device)
            self.numActions = numActions
        except Exception as e:
            raise type(e)(
                'lib.agents.Agent_DQN.Agent_DQN.__init__ - ERROR - ' + str(e)
            ).with_traceback(sys.exc_info()[2])

        return

    def randomAction(self, state):
        '''returns a set of random actions for the given states

        Given the number of available actions, this function is going to
        return a set of random actions with one action per supplied state.
        For example, if ``state.shape == (10, ?)`` then the result will be
        a vector of size 10. This is in accordance with the reduction in
        the dimensionality of the maxAction space.

        Parameters
        ----------
        state : {nd_array or tensor}
            numpy array or tensor containing the state. The columns
            represent the different parts of the state.

        Returns
        -------
        tensor
            A 1-d tensor of random actions, one per supplied state.
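
        Examples
        --------
        A small sketch, assuming ``agent`` is an initialized instance and the
        state is a batch of 10 states of dimension 8::

            states = np.random.rand(10, 8).astype(np.float32)
            actions = agent.randomAction(states)   # 1-d tensor with 10 entries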
        '''
        try:
            r, c = state.shape
            result = np.random.randint(0, self.numActions, size=r).astype(np.float32)
            result = torch.as_tensor(result).to(self.device)
            return result
        except Exception as e:
            raise type(e)(
                'lib.agents.Agent_DQN.Agent_DQN.randomAction - ERROR - ' + str(e)
            ).with_traceback(sys.exc_info()[2])

    def sigmaMaxAction(self, state, sigma=0):
        '''returns the action that maximizes the noisy Q function

        Given a set of states, this function is going to return the action
        that maximizes the value of the Q network for each of the supplied
        states, after adding Gaussian noise to the layers. This is an
        alternative to using an $\\epsilon$-greedy policy, and has been
        shown to provide better results under most circumstances.

        Parameters
        ----------
        state : {nd_array or tensor}
            numpy array or tensor containing the state. The columns
            represent the different parts of the state.
        sigma : float, optional
            The amount of Gaussian noise to add to the layers of the slow
            network, by default 0.

        Returns
        -------
        tensor
            The actions that maximize the noisy Q-values of the supplied states.
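
        Examples
        --------
        A small sketch, assuming ``agent`` is an initialized instance; a nonzero
        ``sigma`` jitters the slow network before the greedy choice is made::

            actions = agent.sigmaMaxAction(states, sigma=0.1)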
        '''
        try:
            state = torch.as_tensor(state).float().to(self.device)
            qVals = self.qNetworkSlow(state, sigma)
            result = torch.argmax(qVals, dim=1)
            result = result.to(dtype=torch.float32, device=self.device)
            return result
        except Exception as e:
            raise type(e)(
                'lib.agents.Agent_DQN.Agent_DQN.sigmaMaxAction - ERROR - ' + str(e)
            ).with_traceback(sys.exc_info()[2])

    def maxAction(self, state):
        '''returns the action that maximizes the Q function

        Given a set of states, this function is going to return the action
        that maximizes the value of the Q network for each of the supplied
        states.

        Parameters
        ----------
        state : {nd_array or tensor}
            numpy array or tensor containing the state. The columns
            represent the different parts of the state.

        Returns
        -------
        tensor
            The actions that maximize the Q-values of the supplied states.
        '''
        try:
            state = torch.as_tensor(state).float().to(self.device)
            qVals = self.qNetworkSlow(state)
            result = torch.argmax(qVals, dim=1)
            result = result.to(dtype=torch.float32, device=self.device)
            return result
        except Exception as e:
            raise type(e)(
                'lib.agents.Agent_DQN.Agent_DQN.maxAction - ERROR - ' + str(e)
            ).with_traceback(sys.exc_info()[2])

    def epsGreedyAction(self, state, eps=0.999):
        '''epsilon greedy action

        This is the epsilon greedy action. In general, this is going to
        select the maximum action ``eps`` fraction of the time, while
        selecting a random action the rest of the time. It is assumed
        that ``eps`` is a value between 0 and 1.

        Parameters
        ----------
        state : {nd_array or tensor}
            numpy array or tensor containing the state. The columns
            represent the different parts of the state.
        eps : float, optional
            Determines the fraction of times the max action will be selected
            in comparison to a random action. (the default is 0.999)

        Returns
        -------
        tensor
            The 1d tensor that has an action for each state provided.
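
        Examples
        --------
        A small sketch, assuming ``agent`` is an initialized instance and
        ``states`` is a batch of states; roughly 95% of the returned actions
        will be greedy and the rest random::

            actions = agent.epsGreedyAction(states, eps=0.95)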
        '''
        try:
            ma = self.maxAction(state)
            ra = self.randomAction(state)
            # Select the greedy action with probability ``eps`` and the
            # random action with probability ``1 - eps``.
            p = np.random.choice([1, 0], size=len(ma), p=[eps, 1 - eps]).astype(np.float32)
            p = torch.as_tensor(p).to(self.device)
            result = ma * p + ra * (1 - p)
            return result
        except Exception as e:
            raise type(e)(
                'lib.agents.Agent_DQN.Agent_DQN.epsGreedyAction - ERROR - ' + str(e)
            ).with_traceback(sys.exc_info()[2])

    def memoryUpdateEpisode(self, policy, maxSteps=1000, minScoreToAdd=None):
        '''update the memory

        Given a particular policy, this function is going to use the policy
        to generate a series of memories and update the memory buffer.
        Generating memories is easier to do using this function than an
        external function ...

        Parameters
        ----------
        policy : {function}
            This is a function that takes a state and returns an action. This
            defines how the agent will explore the environment by changing the
            exploration/exploitation scale.
        maxSteps : {number}, optional
            The maximum number of steps that one should have within an episode.
            (the default is 1000)
        minScoreToAdd : {number}, optional
            If provided, the episode is added to the memory buffer only when its
            total score is at least this value. (the default is None, which adds
            every episode)

        Returns
        -------
        number
            The total reward accumulated over the generated episode.
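
        Examples
        --------
        A small sketch, assuming ``agent`` is an initialized instance; the
        agent's own epsilon-greedy policy is used for exploration::

            score = agent.memoryUpdateEpisode(
                lambda state: agent.epsGreedyAction(state, eps=0.9),
                maxSteps=500, minScoreToAdd=None)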
        '''
        try:
            allResults = self.env.episode(policy, maxSteps=maxSteps)
            s, a, r, ns, f = zip(*allResults[0])
            score = np.sum(r)

            if (minScoreToAdd is None) or (score >= minScoreToAdd):
                self.memory.appendAllAgentResults(allResults)

            return score
        except Exception as e:
            raise type(e)(
                'lib.agents.Agent_DQN.Agent_DQN.memoryUpdateEpisode - ERROR - ' + str(e)
            ).with_traceback(sys.exc_info()[2])

    def step(self, nSamples=100, sigma=0):
        '''optimize the fast Q network via the Bellman equations

        This function is going to obtain a number of samples from the
        replay memory, and train the fast network over this dataset.
        The idea behind this is that the Q network for the next step
        will not automatically pick the next best value. It will possibly
        pick the best value that the fast network thinks it will pick, and
        so the original DQN will overestimate the possible Q value. This
        should reduce that overestimation.

        This will optimize based upon the following idea::

            Given:
                Qf = fast network
                Qs = slow network
                s  = current state
                a  = action that maximizes the current state
                r  = reward
                s' = next state

                a' = argmax_a Qf(s', a)
                Qf(s, a) = r + Qs(s', a')

        Parameters
        ----------
        nSamples : int, optional
            The number of samples to retrieve from the replay memory for
            training, by default 100
        sigma : float, optional
            The amount by which the fast network should be jittered so that the
            network introduces some Gaussian noise in the learning process, by
            default 0, which does not introduce noise in the learning algorithm.

        Raises
        ------
        Exception
            Any error raised during the optimization step is re-raised with a
            message identifying this method.
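
        Examples
        --------
        The double-DQN target can be illustrated on a toy batch (a sketch with
        made-up tensors, independent of the replay memory)::

            qNext   = torch.tensor([[1.0, 3.0], [2.0, 0.5]])  # Qs(s') from the slow network
            aPrime  = torch.tensor([[1], [0]])                 # a' = argmax of the fast network
            rewards = torch.tensor([0.0, 1.0])
            dones   = torch.tensor([0.0, 1.0])
            target  = rewards + qNext.gather(1, aPrime).squeeze(1) * (1 - dones)
            # target is tensor([3., 1.])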
        '''
        try:
            data = self.memory.sample(nSamples)
            states, actions, rewards, nextStates, dones = zip(*data)

            states = torch.as_tensor(states).float().to(self.device)
            actions = torch.as_tensor(actions).float().to(self.device)
            rewards = torch.as_tensor(rewards).float().to(self.device)
            nextStates = torch.as_tensor(nextStates).float().to(self.device)
            dones = torch.as_tensor(dones).float().to(self.device)

            # We cannot just assume that the best next action is the one
            # obtained from the max of the slow network. It is in fact the
            # one that is chosen by the fast network for the next step.
            newQVals = self.qNetworkFast(nextStates)
            newActions = torch.argmax(newQVals, dim=1)

            self.qNetworkFast.train()

            # Note that ``max`` returns both the values and the positions.
            qVal = self.qNetworkFast(states, sigma).max(dim=1)[0]
            # Evaluate the fast network's chosen next actions with the slow
            # network, taking one Q-value per sampled transition.
            qValHat = rewards + self.qNetworkSlow(nextStates).gather(
                1, newActions.unsqueeze(1)).squeeze(1) * (1 - dones)

            self.qNetworkFast.step(qValHat, qVal)
            self.qNetworkFast.eval()
        except Exception as e:
            raise type(e)(
                'lib.agents.Agent_DQN.Agent_DQN.step - ERROR - ' + str(e)
            ).with_traceback(sys.exc_info()[2])

        return

    def checkTrainingMode(self):
        '''prints whether the networks are in training or eval mode

        This function allows us to determine whether the networks are in
        training or evaluation mode. This is important for several things,
        specifically to make sure that the networks are not going to be
        randomly evaluated, as well as making sure that things like batch
        normalization and dropout are properly applied.
        '''
        try:
            print('qNetworkSlow is in train mode:', self.qNetworkSlow.training)
            print('qNetworkFast is in train mode:', self.qNetworkFast.training)
            return
        except Exception as e:
            raise type(e)(
                'lib.agents.Agent_DQN.Agent_DQN.checkTrainingMode - ERROR - ' + str(e)
            ).with_traceback(sys.exc_info()[2])

    def eval(self):
        '''put both the networks in eval mode

        This will allow us to make sure that the networks do not get
        randomly evaluated in training mode for some reason.
        '''
        try:
            self.qNetworkFast.eval()
            self.qNetworkSlow.eval()
            return
        except Exception as e:
            raise type(e)(
                'lib.agents.Agent_DQN.Agent_DQN.eval - ERROR - ' + str(e)
            ).with_traceback(sys.exc_info()[2])

    def softUpdate(self, tau=0.1):
        '''update the slow network slightly

        This is going to update the slow network slightly. The amount is
        dictated by ``tau``, which should be a number between 0 and 1. A
        fraction ``tau`` of the fast network's weights is blended into the
        slow network's weights. This is done to provide stability to the
        network.

        Parameters
        ----------
        tau : {number}, optional
            This parameter determines how much of the fast network's weights
            will be blended into the slow network's weights (the default is 0.1)
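
        Examples
        --------
        With ``tau=0.1``, a slow-network weight of 0.0 paired with a fast-network
        weight of 1.0 becomes ``0.1 * 1.0 + 0.9 * 0.0 == 0.1``::

            agent.softUpdate(tau=0.1)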
        '''
        # v1 iterates over the fast network's parameters, v2 over the slow
        # network's parameters; only the slow network is modified here.
        for v1, v2 in zip(self.qNetworkFast.parameters(), self.qNetworkSlow.parameters()):
            v2.data.copy_(tau * v1 + (1 - tau) * v2)
        return

    def fastUpdate(self, tau=1):
        '''update the fast network from the slow network

        This is going to update the fast network. The amount is dictated by
        ``tau``, which should be a number between 0 and 1. A fraction ``tau``
        of the slow network's weights is blended into the fast network's
        weights. This is done to provide stability to the network.

        Parameters
        ----------
        tau : {number}, optional
            This parameter determines how much of the slow network's weights
            will be blended into the fast network's weights (the default is 1)
        '''
        # v1 iterates over the fast network's parameters, v2 over the slow
        # network's parameters; only the fast network is modified here.
        for v1, v2 in zip(self.qNetworkFast.parameters(), self.qNetworkSlow.parameters()):
            v1.data.copy_(tau * v2 + (1 - tau) * v1)
        return

    def save(self, folder, name):
        '''save the model

        This function allows one to save the model, in a folder that is
        specified, with the fast and the slow qNetworks, as well as the
        memory buffer. Sometimes there may be more than a single agent,
        and under those circumstances, the name will come in handy. If the
        supplied folder does not exist, it will be generated.

        Parameters
        ----------
        folder : {str}
            folder into which the model should be saved.
        name : {str}
            A name to associate the current model with. It is
            absolutely possible to save a number of models within
            the same folder.
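
        Examples
        --------
        A small sketch, assuming ``agent`` is an initialized instance; the folder
        and name are illustrative only::

            agent.save('./results/models', 'agent_001')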
        '''
        try:
            if not os.path.exists(folder):
                os.makedirs(folder)

            torch.save(
                self.qNetworkFast.state_dict(),
                os.path.join(folder, f'{name}.qNetworkFast'))
            torch.save(
                self.qNetworkSlow.state_dict(),
                os.path.join(folder, f'{name}.qNetworkSlow'))

            self.memory.save(folder, name)
        except Exception as e:
            raise type(e)(
                'lib.agents.Agent_DQN.Agent_DQN.save - ERROR - ' + str(e)
            ).with_traceback(sys.exc_info()[2])

        return

    def load(self, folder, name, map_location=None):
        '''load the model

        An agent saved with the ``save()`` command can be safely loaded with this
        command. This will load both the qNetworks, as well as the memory buffer.
        There is a possibility that one may not want to load the model into the
        same device. In that case, the user should supply the device into which
        the model should be loaded.

        Parameters
        ----------
        folder : {str}
            folder from which the model should be loaded.
        name : {str}
            The name associated with the model to load. It is absolutely possible
            to save a number of models within the same folder, and hence the name
            can retrieve the model that is important.
        map_location : {str}, optional
            The device in which to load the file. This is a string like 'cpu',
            'cuda:0', etc. (the default is None, which results in the model being
            loaded to the original device)
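
        Examples
        --------
        A small sketch, loading onto the CPU a model previously written with
        ``save()`` (folder and name are illustrative only)::

            agent.load('./results/models', 'agent_001', map_location='cpu')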
        '''
        try:
            # ``map_location`` is an argument of ``torch.load``, not of
            # ``load_state_dict``.
            self.qNetworkSlow.load_state_dict(
                torch.load(
                    os.path.join(folder, f'{name}.qNetworkSlow'),
                    map_location=map_location))
            self.qNetworkFast.load_state_dict(
                torch.load(
                    os.path.join(folder, f'{name}.qNetworkFast'),
                    map_location=map_location))

            self.memory.load(folder, name)
        except Exception as e:
            raise type(e)(
                'lib.agents.Agent_DQN.Agent_DQN.load - ERROR - ' + str(e)
            ).with_traceback(sys.exc_info()[2])

        return