REALNUM_TYPE dataset::sfexcl_split(set &seed, UNSIGNED_2B_TYPE Mode, REALNUM_TYPE f, bool ifForceDistCalc, UNSIGNED_1B_TYPE toTest, UNSIGNED_1B_TYPE toTrain, UNSIGNED_1B_TYPE nSeed, REALNUM_TYPE Kmetric)
//Sphere-exclusion split of the dataset into training and test sets, with some
//extra features added to the basic published version.
//ref: Golbraikh A, Tropsha A et al, J Comp-Aid Mol Design 2003, 17: 241-253
//
//The sphere radius R is obtained in one of three ways, selected by Mode:
//1) default:          R = f * (V/N)^(1/D), V being the volume of the descriptor
//                     space (product of the column ranges), N = #points, D = #descriptors;
//                     f is interpreted as Dissimilarity [0.2 .. 5.2];
//2) SFEXCL_R_BYDIST:  R = distMin + f * (distAv - distMin)
//                     (distMax is deliberately not used - it is too big to be effective);
//3) SFEXCL_R_BYUSER:  R = f.
//In every case R is raised to distMin if it came out smaller.
//
//Parameters:
//seed            user-supplied seeding points (in/out). Randomly chosen seeds and the
//                min/max-activity seeds requested through Mode are added to it, and its
//                elements are removed one by one as they are used as sphere centres.
//Mode            bit-flags, see the header-file (metric, seeding, radius, next-sphere
//                selection and split-order options).
//f               radius factor, see above.
//ifForceDistCalc recalculate the distance matrix even if its size matches the data.
//toTest, toTrain #points from each sphere that are placed, respectively, into test and
//                training set, which is done alternatingly; SFEXCL_SPLIT_FIRST2TRN makes
//                the training portion go first, otherwise the test portion goes first.
//nSeed           number of points to be seeded randomly in addition to 'seed'.
//Kmetric         passed through to calc_dist() as the metric parameter.
//
//Returns the radius R that was used, or 0 if there are fewer than 2 points or
//fewer than 2 descriptors (in which case nothing is modified).
//
//Side effects: may recalculate 'dist'; overwrites the member sets 'train' and 'test'.
//Every sphere centre is placed into 'train'.
{
	UNSIGNED_4B_TYPE C, N = patt.RowNO(), D = patt.ColNO();
	SIGNED_4B_TYPE Z, Z1, el = 0, el1 = 0;
	REALNUM_TYPE R, mnD, rtD, curD;

	if ((N < 2) || (D < 2)) return 0;

	if ( (N != dist.ColNO()) || (!dist.IsSquare()) || ifForceDistCalc )
	{//!may be affected by changed metric-function
		if (Mode & SFEXCL_METRIC_COSINE)
			calc_dist(0, Kmetric, 1);	//cosine-metric
		else if (Mode & SFEXCL_METRIC_CORR)
			calc_dist(0, Kmetric, 2);	//similar to cosine-metric, but mean-centered
		else if (Mode & SFEXCL_METRIC_TANIMOTO)
			calc_dist(0, Kmetric, 3);	//Tanimoto, though strange to apply it to non-discrete data : )
		else
			calc_dist(0, Kmetric, 0);	//Euclidean-like
	}

	//----------------------
	//prepare seeding set
	if (nSeed)
	{//rand_split() result is picked up from 'test' (presumably it fills 'test' with nSeed points - confirm against rand_split)
		if ((Mode & SFEXCL_SEED_BYACTS) == SFEXCL_SEED_BYACTS)
			rand_split((UNSIGNED_4B_TYPE)nSeed, nSeed);
		else
			rand_split((UNSIGNED_4B_TYPE)nSeed, 1);
		seed |= test;
	}

	if ((Mode & SFEXCL_SEED_MINACT) == SFEXCL_SEED_MINACT)
	{//pick randomly among the leading sact[] entries that are tied with sact[0]
	 //(sact[] is presumably the activity-sorted index - confirm)
		C = 0;
		while (++C < N)
			if (act[sact[C]] > act[sact[0]]) break;
		Z = GetRandomNumber(C);
		seed.PutInSet(sact[Z]);
	}

	if ((Mode & SFEXCL_SEED_MAXACT) == SFEXCL_SEED_MAXACT)
	{//same as above, but for the trailing sact[] entries tied with sact[N-1]
		C = 1;
		while (++C < N)
			if (act[sact[N - C]] < act[sact[N - 1]]) break;
		Z = GetRandomNumber(--C) + 1;
		seed.PutInSet(sact[N - Z]);
	}

	//----------------------
	//sphere radius
	if ((Mode & SFEXCL_R_BYDIST) == SFEXCL_R_BYDIST)
		R = f * (distAv - distMin) + distMin;	//distMax is too big to use effectively!
	else if ((Mode & SFEXCL_R_BYUSER) == SFEXCL_R_BYUSER)
		R = f;
	else
	{//default: R = f * (V/N)^(1/D); the D-th root is taken per column to keep V in range
		REALNUM_TYPE l, h, V = 1;
		for (C = 0; C < D; C++)
		{//calculate volume of the descriptors' space
			patt.GetColScale(C, C, l, h);
			V *= pow(h - l, 1.0/D);
		}
		V /= pow(N, 1.0/D);
		R = f * V;
	}

	if (R < distMin) R = distMin;

	//----------------------
	//prepare R-neighborhood-subsets to speed up splitting
	lneib Rneibs(N);
	for (C = 0; C < N - 1; C++)
		for (Z = C + 1; Z < SIGNED_4B_TYPE(N); Z++)
			if (dist(C, Z) < R)
			{
				Rneibs[C].PutInSet(Z);
				Rneibs[Z].PutInSet(C);
			}

	//----------------------
	test.Dump();
	train = seed;
	C = train.Size();	//from here on C counts the points already assigned to train or test

	//Mode flags that are tested inside the loops below
	const bool nextRandom   = (Mode & SFEXCL_NEXTSF_RAND) == SFEXCL_NEXTSF_RAND;
	const bool nextSpheres  = (Mode & SFEXCL_NEXTSF_SPHERES) == SFEXCL_NEXTSF_SPHERES;
	const bool step1Sum     = (Mode & SFEXCL_NEXTSF_STEP1_SUMDIST) == SFEXCL_NEXTSF_STEP1_SUMDIST;
	const bool step1Min     = (Mode & SFEXCL_NEXTSF_STEP1_MIN) == SFEXCL_NEXTSF_STEP1_MIN;
	const bool step2Min     = (Mode & SFEXCL_NEXTSF_STEP2_MIN) == SFEXCL_NEXTSF_STEP2_MIN;
	const bool firstToTrain = (Mode & SFEXCL_SPLIT_FIRST2TRN) == SFEXCL_SPLIT_FIRST2TRN;

	//pnts[0 .. N-C-1]	still unassigned points (in random order)
	//pnts_i[p]			position of point p inside pnts[], or N once p has been assigned
	//spnts[0 .. nspnts-1]	centres of the spheres processed so far
	SIGNED_4B_TYPE nspnts = 0;
	apvector<SIGNED_4B_TYPE> pnts_i(N), pnts(N - C), spnts(N), vecRneibs;
	for (Z1 = Z = 0; Z < SIGNED_4B_TYPE(N); Z++)
	{
		if (seed.IsInSet(Z)) continue;
		pnts[Z1++] = Z;
	}
	pnts.rand_shuffle();	//randomize datapoint positions
	for (Z = 0; Z < SIGNED_4B_TYPE(N - C); Z++)
		pnts_i[pnts[Z]] = Z;	//store randomized positions

	while (C < N)
	{//go on as long as there are points to exhaust

		if (seed.IsEmpty())
		{//no seeds left: choose the next sphere centre among the unassigned points
			if (nextRandom || (nspnts == 0))
				Z = 0;	//array has been randomized already, so the first entry is a random point
			else
			{//el would store -> pnts[]
				mnD = 0;
				if (step2Min) mnD = distMax * N;

				if (nextSpheres)
				{//step 1: aggregate each candidate's distances to all previous centres
				 //step 2: take the candidate with the min/max aggregated value
					for (el = Z = 0; Z + C < N; Z++)
					{
						rtD = 0;
						if (!step1Sum && step1Min) rtD = distMax * N;

						for (Z1 = 0; Z1 < nspnts; Z1++)
						{
							curD = dist(pnts[Z], spnts[Z1]);
							if (step1Sum)
								rtD += curD;
							else if (step1Min != (curD > rtD))
								rtD = curD;
						}

						if (step2Min != (mnD < rtD))
						{
							mnD = rtD;
							el = Z;
						}
					}
				}
				else
				{//step 1: for each previous centre find its nearest/farthest unassigned point
				 //step 2: take the min/max among those per-centre winners
					el = 0;	//fix: was left holding the previous centre's point id when no candidate won step 2
					for (Z = 0; Z < nspnts; Z++)
					{
						el1 = 0;
						for (Z1 = 1; Z1 + C < N; Z1++)
							if (step1Min != (dist(spnts[Z], pnts[Z1]) > dist(spnts[Z], pnts[el1])))
								el1 = Z1;

						curD = dist(spnts[Z], pnts[el1]);
						if (step2Min != (mnD < curD))
						{
							mnD = curD;
							el = el1;
						}
					}
				}
				Z = el;
			}

			//move the chosen point into train and close the gap in pnts[] with the last live entry
			el = pnts[Z];
			train.PutInSet(el);
			C++;
			pnts[Z] = pnts[N - C];
			pnts_i[el] = N;
			pnts_i[pnts[Z]] = Z;
		}
		else
		{//seeds are already in train (and counted in C), just take the next one as centre
			seed.GetElement(el);
			seed.RemoveFromSet(el);
		}

		spnts[nspnts++] = el;

		//now let's get the points within R-distance from el and distribute them between train and test
		Rneibs[el].GetList(vecRneibs);
		vecRneibs.rand_shuffle();	//to randomize the order of points!
		for (Z1 = el1 = 0; el1 < vecRneibs.length(); el1++)
		{
			Z = vecRneibs[el1];
			if (test.IsInSet(Z) || train.IsInSet(Z)) continue;	//already assigned

			//remove Z from pnts[] by moving the last live entry into its slot
			C++;
			pnts[pnts_i[Z]] = pnts[N - C];
			pnts_i[pnts[N - C]] = pnts_i[Z];
			pnts_i[Z] = N;

			//Z1 is the position inside the current (toTrain + toTest) cycle
			if (firstToTrain)
			{
				if (Z1++ < toTrain)
					train.PutInSet(Z);
				else
					test.PutInSet(Z);
			}
			else
			{
				if (Z1++ < toTest)
					test.PutInSet(Z);
				else
					train.PutInSet(Z);
			}

			if (Z1 == toTrain + toTest) Z1 = 0;
		}
	}//global-loop while (C < N)

	return (R);
}