double getAdjustedReward( int action, int state ) {
  /*
    Although the getEntryMatrix() routine is normally used to extract
    matrix entries, we provide this routine for the immediate reward
    (utilities) matrix for two purposes: First, the pomdp-solve code
    itself no longer deals with utilities that are specified in terms
    of costs.  This routine will mask this fact by multiplying all
    immediate rewards by -1.  Second, it is often desirable to have
    non-negative rewards.  Any problem can be converted to one of
    these by adding the appropriate offset.  The routine can do this
    as well.  Note that the actual value functions will be skewed and
    require some form of rescaling to make sense.

    To force only non-negative rewards, set the global flag
    gRequireNonNegativeRewards before calling this routine.

    If you want cost utilities and do not mind negative values, then
    you should access the immediate rewards directly with:

      getEntryMatrix( Q, a, state )
  */
  double reward;

  reward = getEntryMatrix( Q, action, state );

  if ( gValueType == COST_value_type )
    reward *= -1.0;

  if (( gRequireNonNegativeRewards )
      && ( gMinimumImmediateReward < 0.0 ))
    reward -= gMinimumImmediateReward;

  return ( reward );

}  /* getAdjustedReward */
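/*
  A minimal usage sketch (hypothetical driver code, not part of the
  original pomdp-solve API; it assumes gMinimumImmediateReward has
  already been computed while reading the model).  Setting the flag
  and then checking every adjusted entry:

    int a, s;

    gRequireNonNegativeRewards = TRUE;
    for ( a = 0; a < gNumActions; a++ )
      for ( s = 0; s < gNumStates; s++ )
        Assert( getAdjustedReward( a, s ) >= 0.0,
                "Adjusted rewards should be non-negative." );

  Note that adding the constant offset -gMinimumImmediateReward to
  every immediate reward shifts the infinite-horizon discounted value
  function by -gMinimumImmediateReward / (1 - gDiscount), which is
  the rescaling referred to in the comment above.
*/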
void displayMDPSlice( int state ) {
  /*
    Shows the transition and observation probabilities (and rewards)
    for the given state.
  */
  int a, j, obs;

  if (( state < 0 ) || ( state >= gNumStates ) || ( gNumStates < 1 ))
    return;

  printf( "MDP slice for state: %d\n", state );

  for ( a = 0; a < gNumActions; a++ )
    for ( j = P[a]->row_start[state];
          j < P[a]->row_start[state] + P[a]->row_length[state];
          j++ )
      printf( "\tP( s=%d | s=%d, a=%d ) = %.6lf\n",
              P[a]->col[j], state, a, P[a]->mat_val[j] );

  if ( gProblemType == POMDP_problem_type )
    for ( a = 0; a < gNumActions; a++ )
      for ( obs = R[a]->row_start[state];
            obs < R[a]->row_start[state] + R[a]->row_length[state];
            obs++ )
        printf( "\tP( o=%d | s=%d, a=%d ) = %.6lf\n",
                R[a]->col[obs], state, a, R[a]->mat_val[obs] );

  for ( a = 0; a < gNumActions; a++ )
    printf( "\tQ( s=%d, a=%d ) = %5.6lf\n",
            state, a, getEntryMatrix( Q, a, state ));

}  /* displayMDPSlice */
void setPossibleObservations( double epsilon ) {
  /*
    Sets the global arrays to precomputed values to determine whether
    or not each observation is possible for a given action.  Also
    stores how many observations are possible for each action.
  */
  int a, z, j, cur_state;
  int all_zero_prob_obs;

  for ( a = 0; a < gNumActions; a++ ) {
    for ( z = 0; z < gNumObservations; z++ ) {

      /* We want to check for the case where an observation is
         impossible. */
      all_zero_prob_obs = TRUE;

      for ( cur_state = 0; cur_state < gNumStates; cur_state++ )
        for ( j = P[a]->row_start[cur_state];
              j < P[a]->row_start[cur_state] + P[a]->row_length[cur_state];
              j++ )
          if ( ! Equal( getEntryMatrix( R[a], P[a]->col[j], z ),
                        0.0, epsilon )) {
            all_zero_prob_obs = FALSE;

            /* Yeah, it's a 'goto'; just so I can say I used one. */
            goto END_LOOP;
          }

    END_LOOP:

      if ( all_zero_prob_obs )
        gObservationPossible[a][z] = FALSE;

      else {
        gObservationPossible[a][z] = TRUE;
        gNumPossibleObservations[a]++;
      }  /* if observation is possible */

    }  /* for z */
  }  /* for a */

  /* A little sanity check. */
  for ( a = 0; a < gNumActions; a++ )
    Assert( gNumPossibleObservations[a] > 0,
            "Bad POMDP. No observations possible for some action." );

}  /* setPossibleObservations */
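/*
  A usage sketch for the tables computed above (hypothetical caller;
  the epsilon value and processObservation() are only illustrative).
  Enumeration code can consult gObservationPossible to skip
  observations that no reachable next state can generate under a
  given action:

    setPossibleObservations( 1e-9 );

    for ( z = 0; z < gNumObservations; z++ )
      if ( gObservationPossible[a][z] )
        processObservation( a, z );   // hypothetical per-observation work

  gNumPossibleObservations[a] counts the iterations that survive this
  test, which is what projectVector() below would need in place of
  gNumObservations if impossible observations had no projections at
  all.
*/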
int transformBeliefState( double *pi, double *pi_hat, int a, int obs ) {
  double denom;
  int i, j, cur_state, next_state;

  if ( gProblemType != POMDP_problem_type )
    return ( 0 );

  /* Zero out all elements since we will accumulate probabilities as
     we loop. */
  for ( i = 0; i < gNumStates; i++ )
    pi_hat[i] = 0.0;

  for ( cur_state = 0; cur_state < gNumStates; cur_state++ ) {

    for ( j = P[a]->row_start[cur_state];
          j < P[a]->row_start[cur_state] + P[a]->row_length[cur_state];
          j++ ) {

      next_state = P[a]->col[j];

      pi_hat[next_state] += pi[cur_state] * P[a]->mat_val[j]
        * getEntryMatrix( R[a], next_state, obs );

    }  /* for j */
  }  /* for cur_state */

  /* Normalize */
  denom = 0.0;
  for ( i = 0; i < gNumStates; i++ )
    denom += pi_hat[i];

  if ( IS_ZERO( denom ))
    return ( 0 );

  for ( i = 0; i < gNumStates; i++ )
    pi_hat[i] /= denom;

  return ( 1 );

}  /* transformBeliefState */
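/*
  Illustrative example (a hypothetical helper, not in the original
  source; it assumes <stdlib.h> is available for malloc/free).
  Starting from the uniform belief, it applies one Bayes update

    pi_hat(s') = Pr(z|s',a) * sum_s Pr(s'|s,a) * pi(s) / Pr(z|pi,a),

  which is exactly what transformBeliefState() computes; a return
  value of 0 flags an observation with zero probability under the
  current belief.
*/
static int updateUniformBelief( double *pi_hat, int a, int obs ) {
  double *pi;
  int s, valid;

  pi = (double *) malloc( gNumStates * sizeof( double ));

  /* Start from the uniform belief over all states. */
  for ( s = 0; s < gNumStates; s++ )
    pi[s] = 1.0 / (double) gNumStates;

  valid = transformBeliefState( pi, pi_hat, a, obs );

  free( pi );
  return ( valid );
}  /* updateUniformBelief */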
double getImmediateReward( int action, int cur_state,
                           int next_state, int obs ) {
#if USE_DECISION_TREE
  double dt_return_value;
#endif

#if !USE_DECISION_TREE || CHECK_DECISION_TREE
  Imm_Reward_List temp = gImmRewardList;
  double return_value = 0.0;

  assert(( action >= 0 ) && ( action < gNumActions )
         && ( cur_state >= 0 ) && ( cur_state < gNumStates )
         && ( next_state >= 0 ) && ( next_state < gNumStates ));

  while ( temp != NULL ) {

    if (( temp->action == WILDCARD_SPEC )
        || ( temp->action == action )) {

      switch ( temp->type ) {

      case ir_value:

        if ( gProblemType == POMDP_problem_type ) {

          if ((( temp->next_state == WILDCARD_SPEC )
               || ( temp->next_state == next_state ))
              && (( temp->obs == WILDCARD_SPEC )
                  || ( temp->obs == obs ))
              && (( temp->cur_state == WILDCARD_SPEC )
                  || ( temp->cur_state == cur_state ))) {

            return_value = temp->rep.value;

          }  /* if we have a match */
        }  /* if POMDP */

        else {  /* then it is an MDP */

          if ((( temp->cur_state == WILDCARD_SPEC )
               || ( temp->cur_state == cur_state ))
              && (( temp->next_state == WILDCARD_SPEC )
                  || ( temp->next_state == next_state ))) {

            return_value = temp->rep.value;

          }  /* if we have a match */
        }
        break;

      case ir_vector:

        if ( gProblemType == POMDP_problem_type ) {

          if ((( temp->next_state == WILDCARD_SPEC )
               || ( temp->next_state == next_state ))
              && (( temp->cur_state == WILDCARD_SPEC )
                  || ( temp->cur_state == cur_state ))) {

            return_value = temp->rep.vector[obs];
          }
        }  /* if POMDP */

        else {  /* it is an MDP */

          if (( temp->cur_state == WILDCARD_SPEC )
              || ( temp->cur_state == cur_state )) {

            return_value = temp->rep.vector[next_state];
          }
        }
        break;

      case ir_matrix:

        if ( gProblemType == POMDP_problem_type ) {

          if (( temp->cur_state == WILDCARD_SPEC )
              || ( temp->cur_state == cur_state ))
            return_value = getEntryMatrix( temp->rep.matrix,
                                           next_state, obs );
        }

        else
          return_value = getEntryMatrix( temp->rep.matrix,
                                         cur_state, next_state );
        break;

      default:
        fprintf( stderr,
                 "** ERR ** Unrecognized IR_Type in getImmediateReward().\n" );
        exit( -1 );
        break;

      }  /* switch */
    }  /* if we have a partially matching node */

    temp = temp->next;

  }  /* while */
#endif  /* if !USE_DECISION_TREE || CHECK_DECISION_TREE */

#if USE_DECISION_TREE
  dt_return_value = dtGet( action, cur_state, next_state, obs );

#if CHECK_DECISION_TREE
  if ( return_value != dt_return_value ) {
    fprintf( stderr,
             "ERROR: getImmediateReward: decision-tree value and pattern match values disagree\n"
             "  action=%d cur_state=%d next_state=%d obs=%d\n"
             "  decision-tree value=%g pattern match value=%g difference=%g\n",
             action, cur_state, next_state, obs,
             dt_return_value, return_value,
             fabs( dt_return_value - return_value ));
    exit( EXIT_FAILURE );
  }
#endif

  return dt_return_value;
#else  /* if USE_DECISION_TREE / else */
  return ( return_value );
#endif  /* if USE_DECISION_TREE / else */

}  /* getImmediateReward */
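/*
  Note on the matching semantics above (with a hypothetical
  problem-file fragment for illustration): the while loop does not
  break on the first match, so an entry later in gImmRewardList
  overrides any earlier, more general one.  Assuming the parser
  appends reward specifications in the order they appear in the
  problem file, the fragment

    R: * : * : * : *   1.0
    R: 0 : * : 2 : *   5.0

  makes getImmediateReward( 0, s, 2, z ) return 5.0 for every s and
  z, while all other combinations fall back to the default of 1.0.
*/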
AlphaList projectVector( AlphaList node, int a, int z ) {
  AlphaList proj_node;
  double *alpha;
  int j, cur_state;

  Assert( node != NULL, "Bad parameters" );

  /*
    If this observation is impossible, then there is no projection,
    so we return NULL.  Note that we SHOULD NOT get here in the
    normal course of events, since the projectList() routine handles
    this case specially, which should result in this function *not*
    being called when the observation is impossible.  This is
    especially important due to the assumption we make below about
    dividing the immediate reward into one piece per observation.
  */
  Assert( gObservationPossible[a][z],
          "Shouldn't be projecting vector when obs is not possible" );

  alpha = newAlpha();

  /* Set projection values */
  for ( cur_state = 0; cur_state < gNumStates; cur_state++ ) {

    alpha[cur_state] = 0.0;
    for ( j = P[a]->row_start[cur_state];
          j < P[a]->row_start[cur_state] + P[a]->row_length[cur_state];
          j++ )
      alpha[cur_state] += P[a]->mat_val[j]
        * getEntryMatrix( R[a], P[a]->col[j], z )
        * node->alpha[P[a]->col[j]];

    alpha[cur_state] *= gDiscount;

    /*
      Now we add a piece of the immediate rewards.  This may seem a
      little odd, to add only a portion of the immediate rewards
      here.  In fact, the actual values here don't make complete
      sense, but the effect will be that one vector from each
      observation's projection will sum to be the actual new alpha
      vector.  Without adding this, we would need the extra step of
      adding the immediate rewards, making the code not as nice.  It
      turns out that adding this constant vector does not change any
      of the properties of the sets that we are interested in.

      IMPORTANT: Because of the way we define the projection set for
      impossible observations, we can also use the 1/|Z| weighting of
      rewards here.  If we did not define the impossible observation
      projections to exist at all, then it would not be enough to use
      gNumObservations in the denominator, since some observations
      are not possible, meaning the sum would consist of fewer
      vectors than there are observations.  If this were the case, we
      would need to use the precomputed total of non-zero probability
      observations for each action.
    */
    alpha[cur_state] += getAdjustedReward( a, cur_state )
      / ((double) gNumObservations);

  }  /* for cur_state */

  /*
    Create a node for this vector.  Note that the action we want to
    associate with this vector should be the same as the original
    vector's.  The 'a' defining how it was projected is an attribute
    of the projected list as a whole.
  */
  proj_node = newAlphaNode( alpha, node->action );

  /*
    We will also store the action and observation for each individual
    vector.  This will come in handy when we enumerate vectors,
    because we can easily identify which action and observation a
    vector came from.
  */
  proj_node->obs = z;

  /* Indicate which vector it originated from. */
  proj_node->prev_source = node;

  return ( proj_node );

}  /* projectVector */
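/*
  For reference, the projection computed above in standard POMDP
  notation (this restates the loop; nothing here is new behavior):

    alpha_proj(s) = r(a,s) / |Z|
                    + gDiscount * sum_{s'} Pr(s'|s,a) Pr(z|s',a) alpha(s')

  Because sum_z Pr(z|s',a) = 1, choosing one projected vector per
  observation z (each built from some successor vector alpha_z) and
  summing over all z gives

    r(a,s) + gDiscount * sum_{s'} Pr(s'|s,a) sum_z Pr(z|s',a) alpha_z(s'),

  which is a complete dynamic-programming backup.  This is why the
  1/|Z| reward share removes the need for a separate step that adds
  the immediate rewards to the summed projections.
*/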