Exemple #1
0
/*
 * Generate 624 more random values.  This function is called when the
 * state vector has been exhausted.  It generates another batch of
 * pseudo-random values.  The performance of this function is critical
 * to the performance of the Mersenne Twist PRNG, so it has been
 * highly optimized.
 */
void mts_refresh(
    register mt_state*	state)		/* State for the PRNG */
    {
    register int	i;		/* Index into the state */
    register uint32_t*
			state_ptr;	/* Next place to get from state */
    register uint32_t
			value1;		/* Scratch val picked up from state */
    register uint32_t
			value2;		/* Scratch val picked up from state */

    /*
     * Start by making sure a random seed has been set.  If not, set
     * one.
     */
    if (!state->initialized)
	{
	mts_seed32(state, DEFAULT_SEED32_OLD);
	return;				/* Seed32 calls us recursively */
	}

    /*
     * Now generate the new pseudorandom values by applying the
     * recurrence relation.  We use two loops and a final
     * 2-statement sequence so that we can handle the wraparound
     * explicitly, rather than having to use the relatively slow
     * modulus operator.
     *
     * In essence, the recurrence relation concatenates bits
     * chosen from the current random value (last time around)
     * with the immediately preceding one.  Then it
     * matrix-multiplies the concatenated bits with a value
     * RECURRENCE_OFFSET away and a constant matrix.  The matrix
     * multiplication reduces to a shift and two XORs.
     *
     * Some comments on the optimizations are in order:
     *
     * Strictly speaking, none of the optimizations should be
     * necessary.  All could conceivably be done by a really good
     * compiler.  However, the compilers available to me aren't quite
     * smart enough, so hand optimization needs to be done.
     *
     * Shawn Cokus was the first to achieve a major speedup.  In the
     * original code, the first value given to COMBINE_BITS (in my
     * characterization) was re-fetched from the state array, rather
     * than being carried in a scratch variable.  Cokus noticed that
     * the first argument to COMBINE_BITS could be saved in a register
     * in the previous loop iteration, getting rid of the need for an
     * expensive memory reference.
     *
     * Cokus also switched to using pointers to access the state
     * array and broke the original loop into two so that he could
     * avoid using the expensive modulus operator.  Cokus used three
     * pointers; Richard J. Wagner noticed that the offsets between
     * the three were constant, so that they could be collapsed into a
     * single pointer and constant-offset accesses.  This is clearly
     * faster on x86 architectures, and is the same cost on RISC
     * machines.  A secondary benefit is that Cokus' version was
     * register-starved on the x86, while Wagner's version was not.
     *
     * I made several smaller improvements to these observations.
     * First, I reversed the contents of the state vector.  In the
     * current version of the code, this change doesn't directly
     * affect the performance of the refresh loop, but it has the nice
     * side benefit that an all-zero state structure represents an
     * uninitialized generator.  It also slightly speeds up the
     * random-number routines, since they can compare the state
     * pointer against zero instead of against a constant (this makes
     * the biggest difference on RISC machines).
     *
     * Second, I returned to Matsumoto and Nishimura's original
     * technique of using a lookup table to decide whether to xor the
     * constant vector A (MATRIX_A in this code) with the newly
     * computed value.  Cokus and Wagner had used the ?: operator,
     * which requires a test and branch.  Modern machines don't like
     * branches, so the table lookup is faster.
     *
     * Third, in the Cokus and Wagner versions the loop ends with a
     * statement similar to "value1 = value2", which is necessary to
     * carry the fetched value into the next loop iteration.  I
     * recognized that if the loop were unrolled so that it generates
     * two values per iteration, a bit of variable renaming would get
     * rid of that assignment.  A nice side effect is that the
     * overhead of loop control becomes only half as large.
     *
     * It is possible to improve the code's performance somewhat
     * further.  In particular, since the second loop's loop count
     * factors into 2*2*3*3*11, it could be unrolled yet further.
     * That's easy to do, too: just change the "/ 2" into a division
     * by whatever factor you choose, and then use cut-and-paste to
     * duplicate the code in the body.  To remove a few more cycles,
     * fix the code to decrement state_ptr by the unrolling factor, and
     * adjust the various offsets appropriately.  However, the payoff
     * will be small.  At the moment, the x86 version of the loop is
     * 25 instructions, of which 3 are involved in loop control
     * (including the decrementing of state_ptr).  Further unrolling by
     * a factor of 2 would thus produce only about a 6% speedup.
     *
     * The logical extension of the unrolling
     * approach would be to remove the loops and create 624
     * appropriate copies of the body.  However, I think that doing
     * the latter is a bit excessive!
     *
     * I suspect that a superior optimization would be to simplify the
     * mathematical operations involved in the recurrence relation.
     * However, I have no idea whether such a simplification is
     * feasible.
     */
    state_ptr = &state->statevec[MT_STATE_SIZE - 1];
    value1 = *state_ptr;
    for (i = (MT_STATE_SIZE - RECURRENCE_OFFSET) / 2;  --i >= 0;  )
	{
	state_ptr -= 2;
	value2 = state_ptr[1];
	value1 = COMBINE_BITS(value1, value2);
	state_ptr[2] =
	  MATRIX_MULTIPLY(state_ptr[-RECURRENCE_OFFSET + 2], value1);
	value1 = state_ptr[0];
	value2 = COMBINE_BITS(value2, value1);
	state_ptr[1] =
	  MATRIX_MULTIPLY(state_ptr[-RECURRENCE_OFFSET + 1], value2);
	}
    value2 = *--state_ptr;
    value1 = COMBINE_BITS(value1, value2);
    state_ptr[1] =
      MATRIX_MULTIPLY(state_ptr[-RECURRENCE_OFFSET + 1], value1);

    for (i = (RECURRENCE_OFFSET - 1) / 2;  --i >= 0;  )
	{
	state_ptr -= 2;
	value1 = state_ptr[1];
	value2 = COMBINE_BITS(value2, value1);
	state_ptr[2] =
	  MATRIX_MULTIPLY(state_ptr[MT_STATE_SIZE - RECURRENCE_OFFSET + 2],
	    value2);
	value2 = state_ptr[0];
	value1 = COMBINE_BITS(value1, value2);
	state_ptr[1] =
	  MATRIX_MULTIPLY(state_ptr[MT_STATE_SIZE - RECURRENCE_OFFSET + 1],
	    value1);
	}

    /*
     * The final entry in the table requires the "previous" value
     * to be gotten from the other end of the state vector, so it
     * must be handled specially.
     */
    value1 = COMBINE_BITS(value2, state->statevec[MT_STATE_SIZE - 1]);
    *state_ptr =
      MATRIX_MULTIPLY(state_ptr[MT_STATE_SIZE - RECURRENCE_OFFSET], value1);

    /*
     * Now that refresh is complete, reset the state pointer to allow more
     * pseudorandom values to be fetched from the state array.
     */
    state->stateptr = MT_STATE_SIZE;
    }
Exemple #2
0
/**
   Refresh the state for next set of random numbers.
 */
void mts_refresh(register mt_state* state /**< State for the PRNG */
		 ){
    register int	i;		/* Index into the state */
    register mt_u32bit_t*
			state_ptr;	/* Next place to get from state */
    register mt_u32bit_t
			value1;		/* Scratch val picked up from state */
    register mt_u32bit_t
			value2;		/* Scratch val picked up from state */

    /*
     * Start by making sure a random seed has been set.  If not, set
     * one.
     */
    if (!state->initialized)
	{
	mts_seed32(state, DEFAULT_SEED32_OLD);
	return;				/* Seed32 calls us recursively */
	}

    state_ptr = &state->statevec[MT_STATE_SIZE - 1];
    value1 = *state_ptr;
    for (i = (MT_STATE_SIZE - RECURRENCE_OFFSET) / 2;  --i >= 0;  )
	{
	state_ptr -= 2;
	value2 = state_ptr[1];
	value1 = COMBINE_BITS(value1, value2);
	state_ptr[2] =
	  MATRIX_MULTIPLX(state_ptr[-RECURRENCE_OFFSET + 2], value1);
	value1 = state_ptr[0];
	value2 = COMBINE_BITS(value2, value1);
	state_ptr[1] =
	  MATRIX_MULTIPLX(state_ptr[-RECURRENCE_OFFSET + 1], value2);
	}
    value2 = *--state_ptr;
    value1 = COMBINE_BITS(value1, value2);
    state_ptr[1] =
      MATRIX_MULTIPLX(state_ptr[-RECURRENCE_OFFSET + 1], value1);

    for (i = (RECURRENCE_OFFSET - 1) / 2;  --i >= 0;  )
	{
	state_ptr -= 2;
	value1 = state_ptr[1];
	value2 = COMBINE_BITS(value2, value1);
	state_ptr[2] =
	  MATRIX_MULTIPLX(state_ptr[MT_STATE_SIZE - RECURRENCE_OFFSET + 2],
	    value2);
	value2 = state_ptr[0];
	value1 = COMBINE_BITS(value1, value2);
	state_ptr[1] =
	  MATRIX_MULTIPLX(state_ptr[MT_STATE_SIZE - RECURRENCE_OFFSET + 1],
	    value1);
	}

    /*
     * The final entry in the table requires the "previous" value
     * to be gotten from the other end of the state vector, so it
     * must be handled specially.
     */
    value1 = COMBINE_BITS(value2, state->statevec[MT_STATE_SIZE - 1]);
    *state_ptr =
      MATRIX_MULTIPLX(state_ptr[MT_STATE_SIZE - RECURRENCE_OFFSET], value1);

    /*
     * Now that refresh is complete, reset the state pointer to allow more
     * pseudorandom values to be fetched from the state array.
     */
    state->stateptr = MT_STATE_SIZE;
    }