Example #1
0
REALNUM_TYPE dataset::sfexcl_split(set &seed, UNSIGNED_2B_TYPE Mode, REALNUM_TYPE f, bool ifForceDistCalc, UNSIGNED_1B_TYPE toTest, UNSIGNED_1B_TYPE toTrain, UNSIGNED_1B_TYPE nSeed, REALNUM_TYPE Kmetric)
//Sphere-exclusion split of the data points into 'train' and 'test',
//with some extra features added to the basic published version.
//ref: Golbraikh A, Tropsha A et al, J Comp-Aid Mol Design 2003, 17: 241-253
//
//The sphere radius R is chosen in one of three ways, selected through Mode:
//1) default:          R = f * (V/N)^(1/D), where V is the product of the per-descriptor ranges (h - l)
//                     and f is interpreted as Dissimilarity [0.2 .. 5.2];
//2) SFEXCL_R_BYDIST:  R = distMin + f * (distAv - distMin); the average distance is used instead of
//                     the maximum one because distMax is too big to be effective
//                     (f was suggested to vary within [0 .. 0.25] for the max.dist-based formula);
//3) SFEXCL_R_BYUSER:  R = f.
//In all cases R is raised to distMin if it came out smaller.
//
//Parameters:
//seed				the set of user-supplied points for seeding; it is modified: random/min/max-activity
//					seeds may be added to it and its elements are removed one by one as they are used as sphere centers
//Mode				bit flags (SFEXCL_...), see the header-file
//f					factor used for calculating R (see above)
//ifForceDistCalc	if true, the distance matrix is recalculated even if its dimensions already match the data
//toTest, toTrain	#points from the sphere that are placed, respectively, into test and training set,
//					which is done alternatingly (toTrain #points to training set, then toTest #points to test set, and repeated);
//					the order (train first or test first) is set by SFEXCL_SPLIT_FIRST2TRN
//nSeed				number of points to be seeded randomly in addition to 'seed'
//Kmetric			passed through to calc_dist() as the metric parameter
//
//Returns the radius R that was used, or 0 if there are fewer than 2 data points or fewer than 2 descriptors.
//
//NOTE(review): user-supplied seed elements are assumed to be valid point indices (< N) - TODO confirm at call sites.
{
	UNSIGNED_4B_TYPE C, N = patt.RowNO(), D = patt.ColNO();
	SIGNED_4B_TYPE Z, Z1, el, el1;
	REALNUM_TYPE R, mnD, rtD;

	if ((N < 2) || (D < 2)) return 0;

	//(re)calculate distances if the matrix does not fit the data or if explicitly requested
	if ( (N != dist.ColNO()) || (!dist.IsSquare()) || ifForceDistCalc )
	{//!may be affected by changed metric-function
		if (Mode & SFEXCL_METRIC_COSINE)
			calc_dist(0, Kmetric, 1); //cosine-metric
		else if (Mode & SFEXCL_METRIC_CORR)
			calc_dist(0, Kmetric, 2); //similar to cosine-metric, but mean-centered
		else if (Mode & SFEXCL_METRIC_TANIMOTO)
			calc_dist(0, Kmetric, 3); //Tanimoto, though strange to apply it to non-discrete data : )
		else
			calc_dist(0, Kmetric, 0); //Euclidean-like
	}

	//----------------------
	//prepare seeding set
	if (nSeed)
	{//presumably rand_split() fills 'test' with the randomly picked points, which are then merged into the seeds
		if ((Mode & SFEXCL_SEED_BYACTS) == SFEXCL_SEED_BYACTS)
			rand_split((UNSIGNED_4B_TYPE)nSeed, nSeed);
		else
			rand_split((UNSIGNED_4B_TYPE)nSeed, 1);
		seed |= test;
	}

	//NOTE(review): sact[] is presumably the list of point indices ordered by ascending activity,
	//and GetRandomNumber(n) presumably returns a value from [0 .. n-1] - confirm
	if ((Mode & SFEXCL_SEED_MINACT) == SFEXCL_SEED_MINACT)
	{//C counts the points tied for the lowest activity, one of them is seeded at random
		C = 0;
		while (++C < N) if (act[sact[C]] > act[sact[0]]) break;
		Z = GetRandomNumber(C);
		seed.PutInSet(sact[Z]);
	}

	if ((Mode & SFEXCL_SEED_MAXACT) == SFEXCL_SEED_MAXACT)
	{//C counts the points tied for the highest activity (mirrors the min-activity case,
	 //so that sact[0] is also considered when all activities are equal)
		C = 0;
		while (++C < N) if (act[sact[N - 1 - C]] < act[sact[N - 1]]) break;
		Z = GetRandomNumber(C) + 1;	//1 .. C, counted from the end of sact[]
		seed.PutInSet(sact[N - Z]);
	}

	//----------------------
	//calculate the sphere radius
	if ((Mode & SFEXCL_R_BYDIST) == SFEXCL_R_BYDIST)
		R = f * (distAv - distMin) + distMin; //distMax is too big to use effectively!
	else
	{
		if ((Mode & SFEXCL_R_BYUSER) == SFEXCL_R_BYUSER)
			R = f;
		else
		{//default
			REALNUM_TYPE l, h, V = 1;
			for (C = 0; C < D; C++)
			{//calculate volume of the descriptors' space; the 1/D-power is taken per factor to avoid overflow of the product
				patt.GetColScale(C, C, l, h);
				V *= pow(h - l, 1.0/D);
			}
			V /= pow(N, 1.0/D);
			R = f * V;
		}
	}

	if (R < distMin) R = distMin;

	//----------------------
	//prepare R-neighborhood-subsets to speed up splitting:
	//Rneibs[i] holds all points that are closer than R to point i
	lneib Rneibs(N);
	for (C = 0; C < N - 1; C++)
	for (Z = C + 1; Z < SIGNED_4B_TYPE(N); Z++)
		if (dist(C, Z) < R)
		{
			Rneibs[C].PutInSet(Z);
			Rneibs[Z].PutInSet(C);
		}
	//----------------------
	test.Dump();
	train = seed;		//all seeds go to the training set
	C = train.Size();	//C is the number of points assigned so far (train + test)
	SIGNED_4B_TYPE nspnts = 0;
	//pnts[0 .. N-C-1]	unassigned points (the "live" part of the array shrinks as C grows)
	//pnts_i[p]			position of point p within pnts[], or N once p has been assigned
	//spnts[0 .. nspnts-1]	sphere centers used so far
	apvector<SIGNED_4B_TYPE> pnts_i(N), pnts(N - C), spnts(N), vecRneibs;
	for (Z1 = Z = 0; Z < SIGNED_4B_TYPE(N); Z++)
	{
		if (seed.IsInSet(Z)) continue;
		pnts[Z1++] = Z;
	}
	pnts.rand_shuffle(); //randomize datapoint positions
	for (Z = 0; Z < SIGNED_4B_TYPE(N - C); Z++)	pnts_i[pnts[Z]] = Z; //store randomized positions

	while (C < N)
	{//go on as long as there are points to exhaust

		if (seed.IsEmpty())
		{//no seeds left, the next sphere center has to be chosen from the unassigned points; Z would store its position in pnts[]
			if ( ((Mode & SFEXCL_NEXTSF_RAND) == SFEXCL_NEXTSF_RAND) || (nspnts == 0) )
			//get random seeding point from the rest of the data
				Z = 0; //GetRandomNumber( N - C ); //array has been randomized already anyway
			else
			{//el would store -> pnts[]
				//mnD is the best step-2 value so far: starts at 0 for maximum search, at a huge value for minimum search
				mnD = 0;
				if ((Mode & SFEXCL_NEXTSF_STEP2_MIN) == SFEXCL_NEXTSF_STEP2_MIN) mnD = distMax * N;
				if ((Mode & SFEXCL_NEXTSF_SPHERES) == SFEXCL_NEXTSF_SPHERES)
					for (el = Z = 0; Z + C < N; Z++)
					{//step 1: for each unassigned point get sum/min/max of its distances to all sphere centers
						rtD = 0;
						if ( ((Mode & SFEXCL_NEXTSF_STEP1_SUMDIST) != SFEXCL_NEXTSF_STEP1_SUMDIST) &&
							((Mode & SFEXCL_NEXTSF_STEP1_MIN) == SFEXCL_NEXTSF_STEP1_MIN) )
							rtD = distMax * N;

						for (Z1 = 0; Z1 < nspnts; Z1++)
						{
							if ((Mode & SFEXCL_NEXTSF_STEP1_SUMDIST) == SFEXCL_NEXTSF_STEP1_SUMDIST)
								rtD += dist(pnts[Z], spnts[Z1]);
							else
								if ( ((Mode & SFEXCL_NEXTSF_STEP1_MIN) == SFEXCL_NEXTSF_STEP1_MIN) ^ (dist(pnts[Z], spnts[Z1]) > rtD) )
									rtD = dist(pnts[Z], spnts[Z1]);
						}

						//step 2: keep the point with the smallest/largest step-1 value
						if ( ((Mode & SFEXCL_NEXTSF_STEP2_MIN) == SFEXCL_NEXTSF_STEP2_MIN) ^ ( mnD < rtD ) )
						{
							mnD = rtD;
							el = Z;
						}
					}
				else //if ((Mode & SFEXCL_NEXTSF_SPHERES) == SFEXCL_NEXTSF_SPHERES)
					//el is reset to 0 so that a valid position in pnts[] is used even if the step-2 condition never fires
					//(e.g. all distances are 0); otherwise the stale point index of the previous sphere center would be taken
					for (el = Z = 0; Z < nspnts; Z++)
					{//step 1: for each sphere center find its nearest/farthest unassigned point
						el1 = 0;
						for (Z1 = 1; Z1 + C < N; Z1++)
						if ( ((Mode & SFEXCL_NEXTSF_STEP1_MIN) == SFEXCL_NEXTSF_STEP1_MIN) ^
							(dist(spnts[Z], pnts[Z1]) > dist(spnts[Z], pnts[el1])) )
							el1 = Z1;

						//step 2: among those, keep the one with the smallest/largest distance
						if ( ((Mode & SFEXCL_NEXTSF_STEP2_MIN) == SFEXCL_NEXTSF_STEP2_MIN) ^ ( mnD < dist(spnts[Z], pnts[el1]) ) )
						{
							mnD = dist(spnts[Z], pnts[el1]);
							el = el1;
						}
					}
				Z = el;
			}//if ((Mode & SFEXCL_NEXTSF_RAND) == SFEXCL_NEXTSF_RAND) .. else

			//the chosen point becomes a sphere center and goes to the training set;
			//it is removed from pnts[] by moving the last live element into its slot
			el = pnts[Z];
			train.PutInSet(el);
			C++;
			pnts[Z] = pnts[N - C];
			pnts_i[el] = N;
			pnts_i[pnts[Z]] = Z;
		}//if (seed.IsEmpty())
		else
		{//take the next seed as the sphere center (it is already in 'train' and counted in C)
			seed.GetElement(el);
			seed.RemoveFromSet(el);
		}

		spnts[nspnts++] = el;

		//now let's get the points within R-distance from el and distribute them between train and test
		Rneibs[el].GetList(vecRneibs);
		vecRneibs.rand_shuffle(); //to randomize the order of points!
		for (Z1 = el1 = 0; el1 < vecRneibs.length(); el1++)
		{//Z1 counts the position within the current toTrain/toTest cycle
			Z = vecRneibs[el1];
			if (test.IsInSet(Z) || train.IsInSet(Z)) continue; //already assigned
			C++;
			//remove Z from pnts[] by moving the last live element into its slot
			pnts[pnts_i[Z]] = pnts[N - C];
			pnts_i[pnts[N - C]] = pnts_i[Z];
			pnts_i[Z] = N;
			if ((Mode & SFEXCL_SPLIT_FIRST2TRN) == SFEXCL_SPLIT_FIRST2TRN)
			{
				if (Z1++ < toTrain)	train.PutInSet(Z); else	test.PutInSet(Z);
			}
			else
				if (Z1++ < toTest)	test.PutInSet(Z);	else	train.PutInSet(Z);
			if (Z1 == toTrain + toTest) Z1 = 0;
		}
	}//global-loop while (C < N)
	return (R);
}