/*
 * Select next block to sample.
 *
 * Uses linear probing algorithm for picking next block.
 */
static BlockNumber
system_rows_nextsampleblock(SampleScanState *node)
{
	SystemRowsSamplerData *sampler = (SystemRowsSamplerData *) node->tsm_state;
	TableScanDesc scan = node->ss.ss_currentScanDesc;
	HeapScanDesc hscan = (HeapScanDesc) scan;

	/* First call within scan? */
	if (sampler->doneblocks == 0)
	{
		/* First scan within query? */
		if (sampler->step == 0)
		{
			/* Initialize now that we have scan descriptor */
			SamplerRandomState randstate;

			/* If relation is empty, there's nothing to scan */
			if (hscan->rs_nblocks == 0)
				return InvalidBlockNumber;

			/* We only need an RNG during this setup step */
			sampler_random_init_state(sampler->seed, randstate);

			/* Compute nblocks/firstblock/step only once per query */
			sampler->nblocks = hscan->rs_nblocks;

			/* Choose random starting block within the relation */
			/* (Actually this is the predecessor of the first block visited) */
			sampler->firstblock = sampler_random_fract(randstate) *
				sampler->nblocks;

			/* Find relative prime as step size for linear probing */
			sampler->step = random_relative_prime(sampler->nblocks, randstate);
		}

		/* Reinitialize lb */
		sampler->lb = sampler->firstblock;
	}

	/* If we've read all blocks or returned all needed tuples, we're done */
	if (++sampler->doneblocks > sampler->nblocks ||
		sampler->donetuples >= sampler->ntuples)
		return InvalidBlockNumber;

	/*
	 * It's probably impossible for scan->rs_nblocks to decrease between scans
	 * within a query; but just in case, loop until we select a block number
	 * less than scan->rs_nblocks.  We don't care if scan->rs_nblocks has
	 * increased since the first scan.
	 */
	do
	{
		/* Advance lb, using uint64 arithmetic to forestall overflow */
		sampler->lb = ((uint64) sampler->lb + sampler->step) % sampler->nblocks;
	} while (sampler->lb >= hscan->rs_nblocks);

	return sampler->lb;
}
Example #2
0
static uint32
random_relative_prime(uint32 n, SamplerRandomState randstate)
{
	/* Pick random starting number, with some limits on what it can be. */
	uint32 r = (uint32) sampler_random_fract(randstate) * n/2 + n/4,
		   t;

	/*
	 * This should only take 2 or 3 iterations as the probability of 2 numbers
	 * being relatively prime is ~61%.
	 */
	while ((t = gcd(r, n)) > 1)
	{
		CHECK_FOR_INTERRUPTS();
		r /= t;
	}

	return r;
}
Example #3
0
/*
 * Reset state (called by ReScan).
 */
Datum
tsm_system_time_reset(PG_FUNCTION_ARGS)
{
	TableSampleDesc	   *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
	SystemSamplerData  *sampler = (SystemSamplerData *) tsdesc->tsmdata;

	sampler->lt = InvalidOffsetNumber;
	sampler->start_time = GetCurrentTimestamp();
	sampler->end_time = TimestampTzPlusMilliseconds(sampler->start_time,
													sampler->time);
	sampler->estblocks = 2;
	sampler->doneblocks = 0;

	sampler_random_init_state(sampler->seed, sampler->randstate);
	sampler->step = random_relative_prime(sampler->nblocks, sampler->randstate);
	sampler->lb = sampler_random_fract(sampler->randstate) * (sampler->nblocks / sampler->step);

	PG_RETURN_VOID();
}
Example #4
0
/*
 * Initializes the state.
 */
Datum
tsm_system_time_init(PG_FUNCTION_ARGS)
{
	TableSampleDesc	   *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
	uint32				seed = PG_GETARG_UINT32(1);
	int32				time = PG_ARGISNULL(2) ? -1 : PG_GETARG_INT32(2);
	HeapScanDesc		scan = tsdesc->heapScan;
	SystemSamplerData  *sampler;

	if (time < 1)
		ereport(ERROR,
				(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
				 errmsg("invalid time limit"),
				 errhint("Time limit must be positive integer value.")));

	sampler = palloc0(sizeof(SystemSamplerData));

	/* Remember initial values for reinit */
	sampler->seed = seed;
	sampler->nblocks = scan->rs_nblocks;
	sampler->lt = InvalidOffsetNumber;
	sampler->estblocks = 2;
	sampler->doneblocks = 0;
	sampler->time = time;
	sampler->start_time = GetCurrentTimestamp();
	sampler->end_time = TimestampTzPlusMilliseconds(sampler->start_time,
													sampler->time);

	sampler_random_init_state(sampler->seed, sampler->randstate);

	/* Find relative prime as step size for linear probing. */
	sampler->step = random_relative_prime(sampler->nblocks, sampler->randstate);
	/*
	 * Randomize start position so that blocks close to step size don't have
	 * higher probability of being chosen on very short scan.
	 */
	sampler->lb = sampler_random_fract(sampler->randstate) * (sampler->nblocks / sampler->step);

	tsdesc->tsmdata = (void *) sampler;

	PG_RETURN_VOID();
}
/*
 * Pick a random value less than and relatively prime to n, if possible
 * (else return 1).
 */
static uint32
random_relative_prime(uint32 n, SamplerRandomState randstate)
{
	uint32		r;

	/* Safety check to avoid infinite loop or zero result for small n. */
	if (n <= 1)
		return 1;

	/*
	 * This should only take 2 or 3 iterations as the probability of 2 numbers
	 * being relatively prime is ~61%; but just in case, we'll include a
	 * CHECK_FOR_INTERRUPTS in the loop.
	 */
	do
	{
		CHECK_FOR_INTERRUPTS();
		r = (uint32) (sampler_random_fract(randstate) * n);
	} while (r == 0 || gcd(r, n) > 1);

	return r;
}
Example #6
0
/*
 * Get next tuple from current block.
 *
 * This method implements the main logic in bernoulli sampling.
 * The algorithm simply generates new random number (in 0.0-1.0 range) and if
 * it falls within user specified probability (in the same range) return the
 * tuple offset.
 *
 * It is ok here to return tuple offset without knowing if tuple is visible
 * and not check it via examinetuple. The reason for that is that we do the
 * coinflip (random number generation) for every tuple in the table. Since all
 * tuples have same probability of being returned the visible and invisible
 * tuples will be returned in same ratio as they have in the actual table.
 * This means that there is no skew towards either visible or invisible tuples
 * and the  number returned visible tuples to from the executor node is the
 * fraction of visible tuples which was specified in input.
 *
 * This is faster than doing the coinflip in the examinetuple because we don't
 * have to do visibility checks on uninteresting tuples.
 *
 * If we reach end of the block return InvalidOffsetNumber which tells
 * SampleScan to go to next block.
 */
Datum
tsm_bernoulli_nexttuple(PG_FUNCTION_ARGS)
{
	TableSampleDesc		   *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
	OffsetNumber			maxoffset = PG_GETARG_UINT16(2);
	BernoulliSamplerData   *sampler =
		(BernoulliSamplerData *) tsdesc->tsmdata;
	OffsetNumber			tupoffset = sampler->lt;
	float4					probability = sampler->probability;

	if (tupoffset == InvalidOffsetNumber)
		tupoffset = FirstOffsetNumber;
	else
		tupoffset++;

	/*
	 * Loop over tuple offsets until the random generator returns value that
	 * is within the probability of returning the tuple or until we reach
	 * end of the block.
	 *
	 * (This is our implementation of bernoulli trial)
	 */
	while (sampler_random_fract(sampler->randstate) > probability)
	{
		tupoffset++;

		if (tupoffset > maxoffset)
			break;
	}

	if (tupoffset > maxoffset)
		/* Tell SampleScan that we want next block. */
		tupoffset = InvalidOffsetNumber;

	sampler->lt = tupoffset;

	PG_RETURN_UINT16(tupoffset);
}