Example #1
double getAdjustedReward( int action, int state ) {
/*
  Although the getEntryMatrix() routine is normally used to extract
  matrix entries, we provide this routine for the immediate reward
  (utilities) matrix for two purposes:  First, the pomdp-solve code
  itself no longer deals with utilities that are specified in
  terms of costs.  This routine will mask this fact by multiplying all
  immediate rewards by -1.  Second, it is often desirable to have
  non-negative rewards. Any problem can be converted to one of these
  by adding the appropriate offset.  The routine can do this as well.
  Note that the actual value functions will be skewed and require some
  form of rescaling to make sense.

  To force only non-negative rewards, set the global flag
  gRequireNonNegativeRewards before calling this routine. 

  If you want cost utilities and do not mind negative values,
  then you should access the immediate rewards directly with:

     getEntryMatrix( Q, a, state ) 
*/
  double reward;

  reward = getEntryMatrix( Q, action, state );

  if( gValueType == COST_value_type )
    reward *= -1.0;

  if (( gRequireNonNegativeRewards )
      && ( gMinimumImmediateReward < 0.0 ))
    reward -= gMinimumImmediateReward;

  return ( reward );

}  /* getAdjustedReward */
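
A minimal usage sketch for the routine above (hypothetical driver code; it assumes the pomdp-solve globals and getAdjustedReward() are visible through a header such as "mdp.h", which may differ in the actual build):

#include <stdio.h>
#include "mdp.h"   /* assumed header exposing the globals and routines above */

/* Dump the adjusted immediate rewards for every (state, action) pair.
   If the model was specified with costs, the values come back negated;
   if gRequireNonNegativeRewards was set, they are shifted by the offset. */
void dumpAdjustedRewards( void ) {
  int a, state;

  for ( a = 0; a < gNumActions; a++ )
    for ( state = 0; state < gNumStates; state++ )
      printf( "Q_adj( s=%d, a=%d ) = %.6lf\n",
              state, a, getAdjustedReward( a, state ));
}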
Example #2
void 
displayMDPSlice( int state ) {
/*
   Shows the transition and observation probabilities (and rewards) for
   the given state.
*/
   int a, j, obs;

   if(( state < 0 ) || ( state >= gNumStates ) || ( gNumStates < 1 ))
      return;

   printf( "MDP slice for state: %d\n", state );

   for( a = 0; a < gNumActions; a++ )
      for( j = P[a]->row_start[state]; 
          j < P[a]->row_start[state] +  P[a]->row_length[state];
          j++ ) 
         printf( "\tP( s=%d | s=%d, a=%d ) = %.6lf\n",
                P[a]->col[j], state, a, P[a]->mat_val[j] );
  
   if( gProblemType == POMDP_problem_type )
      for( a = 0; a < gNumActions; a++ )
         for( obs = R[a]->row_start[state]; 
             obs < R[a]->row_start[state] +  R[a]->row_length[state];
             obs++ ) 
            printf( "\tP( o=%d | s=%d, a=%d ) = %.6lf\n",
                   R[a]->col[obs], state, a, R[a]->mat_val[obs] );
   
   for( a = 0; a < gNumActions; a++ )
      printf( "\tQ( s=%d, a=%d ) = %5.6lf\n",
             state, a, getEntryMatrix( Q, a, state ));
   
}  /* displayMDPSlice */
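
A short hypothetical driver that prints a slice for every state after loading a model; it assumes readMDP() from the pomdp-solve file parser and the usual globals are declared in an "mdp.h" header:

#include <stdio.h>
#include "mdp.h"   /* assumed header for readMDP() and the globals */

int main( int argc, char **argv ) {
  int state;

  if (( argc < 2 ) || ( ! readMDP( argv[1] ))) {
    fprintf( stderr, "Usage: %s <pomdp-file>\n", argv[0] );
    return 1;
  }

  for ( state = 0; state < gNumStates; state++ )
    displayMDPSlice( state );

  return 0;
}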
Example #3
void setPossibleObservations( double epsilon ) {
/*
  Sets the global arrays to precomputed values to determine whether or
  not each observation is possible for a given action.  Also stores
  how many observations are possible for each action.
*/
  int a, z, j, cur_state;
  int all_zero_prob_obs;

  for ( a = 0; a < gNumActions; a++ ) {

    for ( z = 0; z < gNumObservations; z++ ) {
      
      /* We want to check for the case where an observation is
         impossible.  */

      all_zero_prob_obs = TRUE;
      for ( cur_state = 0; cur_state < gNumStates; cur_state++)
        for ( j = P[a]->row_start[cur_state]; 
              j < P[a]->row_start[cur_state] 
                + P[a]->row_length[cur_state];
              j++ ) 
          if ( ! Equal( getEntryMatrix( R[a], P[a]->col[j], z ),
                        0.0, epsilon )) {
            all_zero_prob_obs = FALSE;
      
            /* Yeah, it's a 'goto'; just so I can say I used one. */
            goto END_LOOP;
          }
      
    END_LOOP:
      
      if ( all_zero_prob_obs )
        gObservationPossible[a][z] = FALSE;
      
      else  {
        gObservationPossible[a][z] = TRUE;
        gNumPossibleObservations[a]++;
      }  /* if observation is possible */
      
    } /* for z */
  
  } /* for a */

  /* A little sanity check. */
  for ( a = 0; a < gNumActions; a++ )
    Assert( gNumPossibleObservations[a] > 0,
            "Bad POMDP. No observations possible for some action." );

}  /* setPossibleObservations */
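
A hypothetical sketch of how the precomputed tables are typically consulted afterwards (the reporting loop and function name are illustrative only):

#include <stdio.h>

/* List, per action, which observations can occur with non-zero
   probability, using the tables filled in by setPossibleObservations(). */
void reportPossibleObservations( double epsilon ) {
  int a, z;

  setPossibleObservations( epsilon );

  for ( a = 0; a < gNumActions; a++ ) {
    printf( "Action %d: %d possible observation(s):",
            a, gNumPossibleObservations[a] );
    for ( z = 0; z < gNumObservations; z++ )
      if ( gObservationPossible[a][z] )
        printf( " %d", z );
    printf( "\n" );
  }
}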
Example #4
int 
transformBeliefState( double *pi,
		      double *pi_hat,
		      int a,
		      int obs ) {
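/*
   Computes the Bayesian belief update: fills pi_hat[] with the belief
   that results from starting in belief pi, taking action a and
   receiving observation obs.  Returns 0 when the problem is not a
   POMDP or the observation has zero probability under (pi, a), and 1
   otherwise.
*/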
   double denom;
   int i, j, cur_state, next_state;

   if( gProblemType != POMDP_problem_type )
      return( 0 );

   /* zero out all elements since we will accumulate probabilities
      as we loop */
   for( i = 0; i < gNumStates; i++ )
      pi_hat[i] = 0.0;

   for( cur_state = 0; cur_state < gNumStates; cur_state++ ) {

      for( j = P[a]->row_start[cur_state]; 
	  j < P[a]->row_start[cur_state] +  P[a]->row_length[cur_state];
	  j++ ) {

         next_state = P[a]->col[j];

         pi_hat[next_state] += pi[cur_state] * P[a]->mat_val[j] 
            * getEntryMatrix( R[a], next_state, obs );

      } /* for j */
   }  /* for cur_state */

   /* Normalize */
   denom = 0.0;
   for( i = 0; i < gNumStates; i++ )
      denom += pi_hat[i];
   
   if( IS_ZERO( denom ))
      return( 0 );

   for( i = 0; i < gNumStates; i++ )
      pi_hat[i] /= denom;

   return( 1 );
}  /* transformBeliefState */
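
A hypothetical sketch of one step of belief tracking with the routine above; it assumes the parser has set up gInitialBelief and the other globals (names taken as assumptions about the pomdp-solve build):

#include <stdio.h>
#include <stdlib.h>
#include "mdp.h"   /* assumed header for the globals and routines above */

/* Update the initial belief for one (action, observation) pair.
   Mathematically: pi_hat(s') is proportional to
   O(obs | s', a) * sum_s T(s' | s, a) * pi(s). */
void beliefUpdateDemo( int a, int obs ) {
  double *pi_hat;
  int state;

  pi_hat = (double *) malloc( gNumStates * sizeof( *pi_hat ));

  if ( transformBeliefState( gInitialBelief, pi_hat, a, obs ))
    for ( state = 0; state < gNumStates; state++ )
      printf( "b'( s=%d ) = %.6lf\n", state, pi_hat[state] );
  else
    printf( "Observation %d has zero probability under action %d.\n",
            obs, a );

  free( pi_hat );
}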
Example #5
REAL_VALUE getImmediateReward( int action, int cur_state, int next_state,
			   int obs ) {
#if USE_DECISION_TREE
  return dtGet(action, cur_state, next_state, obs);
#else
  Imm_Reward_List temp = gImmRewardList;
  REAL_VALUE return_value = 0.0;

  assert(( action >= 0) && (action < gNumActions)
	 && (cur_state >= 0) && (cur_state < gNumStates)
	 && (next_state >= 0) && (next_state < gNumStates));

  while( temp != NULL ) {
    
    if((( temp->action == WILDCARD_SPEC )
	|| ( temp->action == action ))) {

      switch( temp->type ) {
      case ir_value:

	if( gProblemType == POMDP_problem_type ) {
	  if((( temp->next_state == WILDCARD_SPEC )
	      || ( temp->next_state == next_state))
	     && ((temp->obs == WILDCARD_SPEC)
		 || (temp->obs == obs ))
	     && ((temp->cur_state == WILDCARD_SPEC)
		 || (temp->cur_state == cur_state ))) {

	    
	    return_value = temp->rep.value;
	    
	  }  /* if we have a match */
	}  /* if POMDP */

	else {  /* then it is an MDP */
	  if((( temp->cur_state == WILDCARD_SPEC )
	      || ( temp->cur_state == cur_state))
	     && ((temp->next_state == WILDCARD_SPEC)
		 || (temp->next_state == next_state ))) {
	    
	    return_value = temp->rep.value;
	    
	  }  /* if we have a match */
	}
	     break;
    
      case ir_vector:

	if( gProblemType == POMDP_problem_type ) {
	  if((( temp->next_state == WILDCARD_SPEC )
	      || ( temp->next_state == next_state))
	     && ((temp->cur_state == WILDCARD_SPEC)
		 || (temp->cur_state == cur_state ))) {
	    
	    return_value = temp->rep.vector[obs];
	  }
	}  /* if POMDP */

	else {  /* it is an MDP */
	  if(( temp->cur_state == WILDCARD_SPEC )
	     || ( temp->cur_state == cur_state)) {
	    
	    return_value = temp->rep.vector[next_state];
	  }
	}

	break;
    
      case ir_matrix:
	if( gProblemType == POMDP_problem_type )  {
	  if(( temp->cur_state == WILDCARD_SPEC )
	     || (temp->cur_state == cur_state ))
	    return_value = getEntryMatrix( temp->rep.matrix, next_state,
					obs );
	}
	else
	  return_value = getEntryMatrix( temp->rep.matrix, cur_state,
					next_state );

	break;

      default:
	fprintf( stderr,
		"** ERR ** Unrecognized IR_Type in getImmediateReward().\n");
	exit( -1 );
	break;
      }  /* switch */

    
    }  /* If we have a partially matching node */

    temp = temp->next;
  }  /* while */

  return( return_value );
#endif /* if USE_DECISION_TREE / else */
}  /* getImmediateReward */
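
A hypothetical sketch showing how the per-tuple rewards above are typically folded into an expected immediate reward for a (state, action) pair, weighting by the transition and observation probabilities (the helper name is made up; it relies on the sparse-matrix globals used throughout these examples):

/* Expected immediate reward for taking 'action' in 'cur_state',
   marginalizing over next states (and observations, for a POMDP). */
double expectedImmediateReward( int action, int cur_state ) {
  double total = 0.0;
  int j, z, next_state;

  for ( j = P[action]->row_start[cur_state];
        j < P[action]->row_start[cur_state]
          + P[action]->row_length[cur_state];
        j++ ) {

    next_state = P[action]->col[j];

    if ( gProblemType == POMDP_problem_type ) {
      for ( z = 0; z < gNumObservations; z++ )
        total += P[action]->mat_val[j]
          * getEntryMatrix( R[action], next_state, z )
          * getImmediateReward( action, cur_state, next_state, z );
    }
    else
      total += P[action]->mat_val[j]
        * getImmediateReward( action, cur_state, next_state, 0 );

  } /* for j */

  return ( total );
}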
Example #6
double 
getImmediateReward( int action, int cur_state, int next_state,
		    int obs ) {
#if USE_DECISION_TREE
  double dt_return_value;
#endif
#if !USE_DECISION_TREE || CHECK_DECISION_TREE
  Imm_Reward_List temp = gImmRewardList;
  double return_value = 0.0;

  assert(( action >= 0) && (action < gNumActions)
	 && (cur_state >= 0) && (cur_state < gNumStates)
	 && (next_state >= 0) && (next_state < gNumStates));

  while( temp != NULL ) {
    
    if((( temp->action == WILDCARD_SPEC )
	|| ( temp->action == action ))) {

      switch( temp->type ) {
      case ir_value:

	if( gProblemType == POMDP_problem_type ) {
	  if((( temp->next_state == WILDCARD_SPEC )
	      || ( temp->next_state == next_state))
	     && ((temp->obs == WILDCARD_SPEC)
		 || (temp->obs == obs ))
	     && ((temp->cur_state == WILDCARD_SPEC)
		 || (temp->cur_state == cur_state ))) {

	    
	    return_value = temp->rep.value;
	    
	  }  /* if we have a match */
	}  /* if POMDP */

	else {  /* then it is an MDP */
	  if((( temp->cur_state == WILDCARD_SPEC )
	      || ( temp->cur_state == cur_state))
	     && ((temp->next_state == WILDCARD_SPEC)
		 || (temp->next_state == next_state ))) {
	    
	    return_value = temp->rep.value;
	    
	  }  /* if we have a match */
	}
	     break;
    
      case ir_vector:

	if( gProblemType == POMDP_problem_type ) {
	  if((( temp->next_state == WILDCARD_SPEC )
	      || ( temp->next_state == next_state))
	     && ((temp->cur_state == WILDCARD_SPEC)
		 || (temp->cur_state == cur_state ))) {
	    
	    return_value = temp->rep.vector[obs];
	  }
	}  /* if POMDP */

	else {  /* it is an MDP */
	  if(( temp->cur_state == WILDCARD_SPEC )
	     || ( temp->cur_state == cur_state)) {
	    
	    return_value = temp->rep.vector[next_state];
	  }
	}

	break;
    
      case ir_matrix:
	if( gProblemType == POMDP_problem_type )  {
	  if(( temp->cur_state == WILDCARD_SPEC )
	     || (temp->cur_state == cur_state ))
	    return_value = getEntryMatrix( temp->rep.matrix, next_state,
					obs );
	}
	else
	  return_value = getEntryMatrix( temp->rep.matrix, cur_state,
					next_state );

	break;

      default:
	fprintf( stderr,
		"** ERR ** Unrecognized IR_Type in getImmediateReward().\n");
	exit( -1 );
	break;
      }  /* switch */

    
    }  /* If we have a partially matching node */

    temp = temp->next;
  }  /* while */
#endif /* if !USE_DECISION_TREE || CHECK_DECISION_TREE */


#if USE_DECISION_TREE
  dt_return_value = dtGet(action, cur_state, next_state, obs);
#if CHECK_DECISION_TREE
  if (return_value != dt_return_value) {
    fprintf(stderr,
	    "ERROR: getImmediateReward: decision-tree value and pattern-match value disagree\n"
	    "  action=%d cur_state=%d next_state=%d obs=%d\n"
	    "  decision-tree value=%g pattern-match value=%g difference=%g\n",
	    action, cur_state, next_state, obs,
	    dt_return_value, return_value, fabs(dt_return_value - return_value));
    exit(EXIT_FAILURE);
  }
#endif
  return dt_return_value;
#else /* if USE_DECISION_TREE / else */
  return( return_value );
#endif /* if USE_DECISION_TREE / else */
  
}  /* getImmediateReward */
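
When both USE_DECISION_TREE and CHECK_DECISION_TREE are defined, every call already compares the two representations, so a hypothetical exhaustive self-test for a POMDP model only needs to touch every tuple once:

/* Cross-check the decision-tree and list-based reward representations
   over all (action, state, next state, observation) tuples.  Relies on
   the disagreement check inside getImmediateReward() above, which
   prints a diagnostic and exits on any mismatch. */
void crossCheckImmediateRewards( void ) {
  int a, cur_state, next_state, obs;

  for ( a = 0; a < gNumActions; a++ )
    for ( cur_state = 0; cur_state < gNumStates; cur_state++ )
      for ( next_state = 0; next_state < gNumStates; next_state++ )
        for ( obs = 0; obs < gNumObservations; obs++ )
          (void) getImmediateReward( a, cur_state, next_state, obs );
}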
Example #7
AlphaList 
projectVector( AlphaList node, int a, int z ) 
{
  AlphaList proj_node;
  double *alpha;
  int j, cur_state;

  Assert ( node != NULL,
           "Bad parameters" );

  /* If this observation is impossible, then there is no projection,
     so we return NULL. Note that we SHOULD NOT get here in the normal
     course of events, since the projectList() routine handles this
     case specially, which should result in this function *not* being
     called when the observation is impossible. This is especially
     important due to the assumption we make below about dividing the
     immediate reward into one piece per observation.
  */
  Assert ( gObservationPossible[a][z],
		 "Shouldn't be projecting vector when obs is not possible" );
  
  alpha = newAlpha();

  /* Set projection values */
  for ( cur_state = 0; cur_state < gNumStates; cur_state++) {
    alpha[cur_state] = 0.0;
    
    for ( j = P[a]->row_start[cur_state]; 
          j < P[a]->row_start[cur_state] +  P[a]->row_length[cur_state];
          j++ ) 

      alpha[cur_state] +=  P[a]->mat_val[j] 
        * getEntryMatrix( R[a], P[a]->col[j], z )
        * node->alpha[P[a]->col[j]];
    
    alpha[cur_state] *= gDiscount;
      
    /* Now we add a piece of the immediate rewards. It may seem a
       little odd to add only a portion of the immediate rewards
       here.  In fact, the actual values here don't make complete
       sense, but the effect is that one vector from each
       observation's projection will sum to the actual new alpha
       vector. Without adding this, we would need the extra step of
       adding the immediate rewards afterwards, making the code not
       as nice. It turns out that adding this constant vector does
       not change any of the properties of the sets that we are
       interested in. IMPORTANT: Because of the way we define the
       projection set for impossible observations, we can also use
       the 1/|Z| weighting of rewards here.  If we did not define
       the impossible observation projections to exist at all, then
       it would not be enough to use gNumObservations in the
       denominator, since some observations are not possible, meaning
       the sum would consist of fewer vectors than there are
       observations.  In that case we would need to use the
       precomputed total of non-zero probability observations for
       each action. */
    alpha[cur_state] 
      += getAdjustedReward( a, cur_state ) 
      / ((double) gNumObservations);
      
  }   /* for cur_state */

  /* Create a node for this vector. Note that the action we want to
     associate with this vector should be the same as that of the
     original vector.  The 'a' defining how it was projected is an
     attribute of the projected list as a whole.  */
  proj_node = newAlphaNode( alpha, node->action );

  /* We will also store the action and observation for each
     individual vector.  This will come in handy when we enumerate
     vectors, because we can easily identify which action and
     observation each vector came from. */
  proj_node->obs = z;
  
  /* Indicate which vector it originated from. */
  proj_node->prev_source = node;

  return ( proj_node );

}  /* projectVector */
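
A hypothetical sketch of why the 1/|Z| reward share above works out: picking one projected vector per observation for a fixed action and summing them component-wise yields a complete backed-up alpha vector (newAlpha() is the allocator used above; the array-of-nodes interface here is illustrative only):

/* Sum one projected vector per observation (all projected with the
   same action).  Each projection already carries the discount and a
   1/|Z| share of the immediate reward, so the component-wise sum is
   the full backed-up alpha vector for that choice of vectors. */
double *combineProjections( AlphaList *chosen_per_obs ) {
  double *alpha;
  int cur_state, z;

  alpha = newAlpha();

  for ( cur_state = 0; cur_state < gNumStates; cur_state++ ) {
    alpha[cur_state] = 0.0;
    for ( z = 0; z < gNumObservations; z++ )
      alpha[cur_state] += chosen_per_obs[z]->alpha[cur_state];
  }

  return ( alpha );
}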