2017-09-27 14 views

자율 주행 자동차 프로그램을 위해 아래 코드를 작성하고 있는데, choose_action 함수에 문제가 있습니다. 최적의 액션들 중에서 하나를 무작위로 선택하고 싶습니다.

에이전트는 다음 단계("else: action = maxQaction")에서 가장 높은 Q 값을 가진 액션들 중 하나를 무작위로 선택해야 합니다.

하지만 지금 작성한 방식으로는 매번 같은 액션을 선택하게 됩니다. 가장 높은 Q 값을 가진 액션들 중에서 무작위로 선택하는 방법을 제안해 주실 수 있을까요? 아마도 리스트를 사용하면 될 것 같습니다. 조언을 주시면

import random 
import math 
from environment import Agent, Environment 
from planner import RoutePlanner 
from simulator import Simulator 
import itertools 

class LearningAgent(Agent):
    """A Q-learning agent that learns to drive in the Smartcab world.

    The Q-table ``self.Q`` maps a state tuple
    ``(light, left, right, oncoming, waypoint)`` to a dictionary of
    ``{action: Q-value}``. Actions are chosen epsilon-greedily, and ties
    between equally good actions are broken at random.
    """

    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        """Set up the agent, its planner and a zero-initialised Q-table.

        Args:
            env: The environment the agent lives in.
            learning: Whether the agent is expected to learn.
            epsilon: Random exploration factor.
            alpha: Learning factor (also used as the epsilon decay rate).
        """
        super(LearningAgent, self).__init__(env)     # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict()           # Q-table: state tuple -> {action: Q-value}
        self.epsilon = epsilon    # Random exploration factor
        self.alpha = alpha        # Learning factor

        # Possible values of every state feature, listed in the same order
        # in which build_state() assembles the state tuple.
        self.states = [
            ['red', 'green'],                     # light
            ['left', 'right', 'forward', None],   # vehicle to the left
            ['left', 'right', 'forward', None],   # vehicle to the right
            ['left', 'right', 'forward', None],   # oncoming vehicle
            ['left', 'right', 'forward'],         # waypoint
        ]

        # Number of training trials started so far; drives the epsilon decay.
        self.x = 0

        # Template row of the Q-table: every valid action starts at 0.0.
        self.q_maker = dict((k, 0.0) for k in self.valid_actions)

        # Pre-populate the Q-table with every combination of feature values.
        for prod_state in itertools.product(*self.states):
            self.Q[prod_state] = self.q_maker.copy()

    def reset(self, destination=None, testing=False):
        """Prepare the agent for a new trial.

        Called at the beginning of each trial. 'testing' is set to True if
        testing trials are being used once training trials have completed.

        Args:
            destination: The new location to route to.
            testing: When True, exploration and learning are switched off.
        """
        # Select the destination as the new location to route to.
        # NOTE(review): this call follows the standard Smartcab project
        # template; confirm that RoutePlanner exposes route_to().
        self.planner.route_to(destination)

        if testing:
            # No exploration and no learning during testing trials.
            self.epsilon = 0.0
            self.alpha = 0.0
        else:
            # Exponential decay of the exploration factor. This must not run
            # in the testing branch: with alpha == 0 it would evaluate to
            # exp(0) == 1.0 and make the agent act fully at random.
            self.x += 1
            self.epsilon = math.exp(-self.alpha * self.x)

        return None

    def build_state(self):
        """Build the agent's state from the environment.

        The next waypoint, the intersection inputs, and the deadline are all
        features available to the agent; the deadline is deliberately left
        out of the state.

        Returns:
            The tuple ``(light, left, right, oncoming, waypoint)``.
        """
        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(self)            # Intersection light and traffic
        deadline = self.env.get_deadline(self)   # Remaining deadline (not part of the state)

        # The order matches self.states so the tuple is a key of the
        # pre-populated Q-table.
        state = (inputs['light'], inputs['left'], inputs['right'],
                 inputs['oncoming'], waypoint)

        return state

    def get_maxQ(self, state):
        """Return the maximum Q-value over all actions for the given state.

        Raises:
            KeyError: If 'state' has no entry in the Q-table.
        """
        return max(self.Q[state].values())

    def createQ(self, state):
        """Ensure the Q-table has an entry for 'state' when learning.

        Called when a state is generated by the agent. When the agent is not
        learning the Q-table is left untouched; otherwise a missing state
        gets a fresh row with every action initialised to 0.0.
        """
        if not self.learning:
            return

        if state not in self.Q:
            self.Q[state] = self.q_maker.copy()

    def choose_action(self, state):
        """Choose the action to take in the given state.

        When not learning, a random action is chosen. When learning, a random
        action is chosen with probability 'epsilon'; otherwise one of the
        actions sharing the highest Q-value for the current state is picked
        at random.

        Returns:
            One of the valid actions.
        """
        # Set the agent state
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()

        # Exploration: either the agent is not learning at all, or the
        # epsilon "coin flip" says to explore.
        if not self.learning or random.random() < self.epsilon:
            return random.choice(self.valid_actions)

        # Exploitation: collect every action tied for the best Q-value and
        # choose among them at random, so the agent is not biased towards
        # whichever action happens to come first in the dictionary.
        best_q = self.get_maxQ(state)
        best_actions = [act for act, q in self.Q[state].items() if q == best_q]
        return random.choice(best_actions)


# Suggested body for choose_action (expects 'self' and 'state' from the
# enclosing method): explore at random, otherwise pick at random among the
# actions that share the highest Q-value.
if not self.learning or random.random() < self.epsilon:
    action = random.choice(self.valid_actions)
else:
    maxQ = self.get_maxQ(state)
    # Build the list of actions whose Q-value matches the maximum
    maxQactions = [act for act, q in self.Q[state].items() if q == maxQ]
    action = random.choice(maxQactions)  # choose one randomly

감사합니다. – user3476463