/* * Generate 624 more random values. This function is called when the * state vector has been exhausted. It generates another batch of * pseudo-random values. The performance of this function is critical * to the performance of the Mersenne Twist PRNG, so it has been * highly optimized. */ void mts_refresh( register mt_state* state) /* State for the PRNG */ { register int i; /* Index into the state */ register uint32_t* state_ptr; /* Next place to get from state */ register uint32_t value1; /* Scratch val picked up from state */ register uint32_t value2; /* Scratch val picked up from state */ /* * Start by making sure a random seed has been set. If not, set * one. */ if (!state->initialized) { mts_seed32(state, DEFAULT_SEED32_OLD); return; /* Seed32 calls us recursively */ } /* * Now generate the new pseudorandom values by applying the * recurrence relation. We use two loops and a final * 2-statement sequence so that we can handle the wraparound * explicitly, rather than having to use the relatively slow * modulus operator. * * In essence, the recurrence relation concatenates bits * chosen from the current random value (last time around) * with the immediately preceding one. Then it * matrix-multiplies the concatenated bits with a value * RECURRENCE_OFFSET away and a constant matrix. The matrix * multiplication reduces to a shift and two XORs. * * Some comments on the optimizations are in order: * * Strictly speaking, none of the optimizations should be * necessary. All could conceivably be done by a really good * compiler. However, the compilers available to me aren't quite * smart enough, so hand optimization needs to be done. * * Shawn Cokus was the first to achieve a major speedup. In the * original code, the first value given to COMBINE_BITS (in my * characterization) was re-fetched from the state array, rather * than being carried in a scratch variable. Cokus noticed that * the first argument to COMBINE_BITS could be saved in a register * in the previous loop iteration, getting rid of the need for an * expensive memory reference. * * Cokus also switched to using pointers to access the state * array and broke the original loop into two so that he could * avoid using the expensive modulus operator. Cokus used three * pointers; Richard J. Wagner noticed that the offsets between * the three were constant, so that they could be collapsed into a * single pointer and constant-offset accesses. This is clearly * faster on x86 architectures, and is the same cost on RISC * machines. A secondary benefit is that Cokus' version was * register-starved on the x86, while Wagner's version was not. * * I made several smaller improvements to these observations. * First, I reversed the contents of the state vector. In the * current version of the code, this change doesn't directly * affect the performance of the refresh loop, but it has the nice * side benefit that an all-zero state structure represents an * uninitialized generator. It also slightly speeds up the * random-number routines, since they can compare the state * pointer against zero instead of against a constant (this makes * the biggest difference on RISC machines). * * Second, I returned to Matsumoto and Nishimura's original * technique of using a lookup table to decide whether to xor the * constant vector A (MATRIX_A in this code) with the newly * computed value. Cokus and Wagner had used the ?: operator, * which requires a test and branch. Modern machines don't like * branches, so the table lookup is faster. * * Third, in the Cokus and Wagner versions the loop ends with a * statement similar to "value1 = value2", which is necessary to * carry the fetched value into the next loop iteration. I * recognized that if the loop were unrolled so that it generates * two values per iteration, a bit of variable renaming would get * rid of that assignment. A nice side effect is that the * overhead of loop control becomes only half as large. * * It is possible to improve the code's performance somewhat * further. In particular, since the second loop's loop count * factors into 2*2*3*3*11, it could be unrolled yet further. * That's easy to do, too: just change the "/ 2" into a division * by whatever factor you choose, and then use cut-and-paste to * duplicate the code in the body. To remove a few more cycles, * fix the code to decrement state_ptr by the unrolling factor, and * adjust the various offsets appropriately. However, the payoff * will be small. At the moment, the x86 version of the loop is * 25 instructions, of which 3 are involved in loop control * (including the decrementing of state_ptr). Further unrolling by * a factor of 2 would thus produce only about a 6% speedup. * * The logical extension of the unrolling * approach would be to remove the loops and create 624 * appropriate copies of the body. However, I think that doing * the latter is a bit excessive! * * I suspect that a superior optimization would be to simplify the * mathematical operations involved in the recurrence relation. * However, I have no idea whether such a simplification is * feasible. */ state_ptr = &state->statevec[MT_STATE_SIZE - 1]; value1 = *state_ptr; for (i = (MT_STATE_SIZE - RECURRENCE_OFFSET) / 2; --i >= 0; ) { state_ptr -= 2; value2 = state_ptr[1]; value1 = COMBINE_BITS(value1, value2); state_ptr[2] = MATRIX_MULTIPLY(state_ptr[-RECURRENCE_OFFSET + 2], value1); value1 = state_ptr[0]; value2 = COMBINE_BITS(value2, value1); state_ptr[1] = MATRIX_MULTIPLY(state_ptr[-RECURRENCE_OFFSET + 1], value2); } value2 = *--state_ptr; value1 = COMBINE_BITS(value1, value2); state_ptr[1] = MATRIX_MULTIPLY(state_ptr[-RECURRENCE_OFFSET + 1], value1); for (i = (RECURRENCE_OFFSET - 1) / 2; --i >= 0; ) { state_ptr -= 2; value1 = state_ptr[1]; value2 = COMBINE_BITS(value2, value1); state_ptr[2] = MATRIX_MULTIPLY(state_ptr[MT_STATE_SIZE - RECURRENCE_OFFSET + 2], value2); value2 = state_ptr[0]; value1 = COMBINE_BITS(value1, value2); state_ptr[1] = MATRIX_MULTIPLY(state_ptr[MT_STATE_SIZE - RECURRENCE_OFFSET + 1], value1); } /* * The final entry in the table requires the "previous" value * to be gotten from the other end of the state vector, so it * must be handled specially. */ value1 = COMBINE_BITS(value2, state->statevec[MT_STATE_SIZE - 1]); *state_ptr = MATRIX_MULTIPLY(state_ptr[MT_STATE_SIZE - RECURRENCE_OFFSET], value1); /* * Now that refresh is complete, reset the state pointer to allow more * pseudorandom values to be fetched from the state array. */ state->stateptr = MT_STATE_SIZE; }
/** Refresh the state for next set of random numbers. */ void mts_refresh(register mt_state* state /**< State for the PRNG */ ){ register int i; /* Index into the state */ register mt_u32bit_t* state_ptr; /* Next place to get from state */ register mt_u32bit_t value1; /* Scratch val picked up from state */ register mt_u32bit_t value2; /* Scratch val picked up from state */ /* * Start by making sure a random seed has been set. If not, set * one. */ if (!state->initialized) { mts_seed32(state, DEFAULT_SEED32_OLD); return; /* Seed32 calls us recursively */ } state_ptr = &state->statevec[MT_STATE_SIZE - 1]; value1 = *state_ptr; for (i = (MT_STATE_SIZE - RECURRENCE_OFFSET) / 2; --i >= 0; ) { state_ptr -= 2; value2 = state_ptr[1]; value1 = COMBINE_BITS(value1, value2); state_ptr[2] = MATRIX_MULTIPLX(state_ptr[-RECURRENCE_OFFSET + 2], value1); value1 = state_ptr[0]; value2 = COMBINE_BITS(value2, value1); state_ptr[1] = MATRIX_MULTIPLX(state_ptr[-RECURRENCE_OFFSET + 1], value2); } value2 = *--state_ptr; value1 = COMBINE_BITS(value1, value2); state_ptr[1] = MATRIX_MULTIPLX(state_ptr[-RECURRENCE_OFFSET + 1], value1); for (i = (RECURRENCE_OFFSET - 1) / 2; --i >= 0; ) { state_ptr -= 2; value1 = state_ptr[1]; value2 = COMBINE_BITS(value2, value1); state_ptr[2] = MATRIX_MULTIPLX(state_ptr[MT_STATE_SIZE - RECURRENCE_OFFSET + 2], value2); value2 = state_ptr[0]; value1 = COMBINE_BITS(value1, value2); state_ptr[1] = MATRIX_MULTIPLX(state_ptr[MT_STATE_SIZE - RECURRENCE_OFFSET + 1], value1); } /* * The final entry in the table requires the "previous" value * to be gotten from the other end of the state vector, so it * must be handled specially. */ value1 = COMBINE_BITS(value2, state->statevec[MT_STATE_SIZE - 1]); *state_ptr = MATRIX_MULTIPLX(state_ptr[MT_STATE_SIZE - RECURRENCE_OFFSET], value1); /* * Now that refresh is complete, reset the state pointer to allow more * pseudorandom values to be fetched from the state array. */ state->stateptr = MT_STATE_SIZE; }