/*
 * Builds a new R-near neighbor data structure (PRNearNeighborStructT)
 * for the data set <dataSet>, tuned for the radius <thresholdR> and the
 * success probability <successProbability>.
 *
 *   nPoints          - number of points in <dataSet>.
 *   dimension        - dimension of the points.
 *   sampleQueries    - <nSampleQueries> sample query points; the
 *                      parameters of the structure are optimized for
 *                      queries drawn from this set. It may be a sample
 *                      of the actual query set or of the data set.
 *   memoryUpperBound - passed through to computeOptimalParameters.
 *
 * When the number of collisions of a sample query point <q> with the
 * data set is estimated, a data set point that has the same pointer as
 * <q> (i.e. <q> is itself a data set point) is left out of that
 * estimation.
 *
 * Returns the structure with all <nPoints> points inserted.
 */
PRNearNeighborStructT initSelfTunedRNearNeighborWithDataSet(RealT thresholdR,
                                                            RealT successProbability,
                                                            Int32T nPoints,
                                                            IntT dimension,
                                                            PPointT *dataSet,
                                                            IntT nSampleQueries,
                                                            PPointT *sampleQueries,
                                                            MemVarT memoryUpperBound){
  initializeLSHGlobal();

  // Choose the parameters of the structure from the data and the sample queries.
  RNNParametersT tunedParams = computeOptimalParameters(thresholdR,
                                                        successProbability,
                                                        nPoints,
                                                        dimension,
                                                        dataSet,
                                                        nSampleQueries,
                                                        sampleQueries,
                                                        memoryUpperBound);

  if (tunedParams.useUfunctions) {
    DPRINTF("Used m = %d\n", tunedParams.parameterM);
    DPRINTF("Used L = %d\n", tunedParams.parameterL);
  } else {
    DPRINTF("Used L=%d\n", tunedParams.parameterL);
  }

  // Create the R-NN data structure. For hybrid chains the data set is
  // handed to the initializer; otherwise the points are added below.
  PRNearNeighborStructT nnStruct = NULL;
  TimeVarT initElapsed = 0;
  TIMEV_START(initElapsed);
  if (tunedParams.typeHT == HT_HYBRID_CHAINS) {
    printRNNParameters(DEBUG_OUTPUT, tunedParams);
    nnStruct = initLSH_WithDataSet(tunedParams, nPoints, dataSet);
  } else {
    nnStruct = initLSH(tunedParams, nPoints);
  }
  TIMEV_END(initElapsed);
  DPRINTF("Time for initializing: %0.6lf\n", initElapsed);
  DPRINTF("Allocated memory: %lld\n", totalAllocatedMemory);

  TimeVarT addElapsed = 0;
  if (tunedParams.typeHT != HT_HYBRID_CHAINS) {
    // Insert every data set point into the LSH buckets, one by one.
    TIMEV_START(addElapsed);
    for (IntT pointIdx = 0; pointIdx < nPoints; pointIdx++) {
      addNewPointToPRNearNeighborStruct(nnStruct, dataSet[pointIdx]);
    }
    TIMEV_END(addElapsed);
    printf("Time for adding points: %0.6lf\n", addElapsed);
    DPRINTF("Allocated memory: %lld\n", totalAllocatedMemory);
  }

  DPRINTF("Time for creating buckets: %0.6lf\n", timeBucketCreation);
  DPRINTF("Time for putting buckets into UH: %0.6lf\n", timeBucketIntoUH);
  DPRINTF("Time for computing GLSH: %0.6lf\n", timeComputeULSH);
  DPRINTF("NGBuckets: %d\n", nGBuckets);

  return nnStruct;
}
// Determines the run-time coefficients of the different parts of the //确定查询算法不同部分的运行时间 // query algorithm. Values that are computed and returned are // <lshPrecomp>, <uhashOver>, <distComp>. <lshPrecomp> is the time for // pre-computing one function from the LSH family. <uhashOver> is the // time for getting a bucket from a hash table (of buckets).<distComp> // is the time to compute one distance between two points. These times // are computed by constructing a R-NN DS on a sample data set and // running a sample query set on it. void determineRTCoefficients(RealT thresholdR, RealT successProbability, BooleanT useUfunctions, IntT typeHT, //建立hash表的类型 IntT dimension, Int32T nPoints, PPointT *realData, RealT &lshPrecomp, RealT &uhashOver, RealT &distComp){ // use a subset of the original data set. 使用原始数据集的一个子集 // there is not much theory behind the formula below. //减小运算规模 IntT n = nPoints / 50; //最多生成n各点,缩小50倍 if (n < 100) { //如果生成的点的个数小于100,则使桶的数量与数据集点的数量一样多 n = nPoints; } if (n > 10000) { n = 10000; } // Initialize the data set to use. PPointT *dataSet; FAILIF(NULL == (dataSet = (PPointT*)MALLOC(n * sizeof(PPointT)))); for(IntT i = 0; i < n; i++){ //从真实数据集中随机取n个点 (最多10000个) dataSet[i] = realData[genRandomInt(0, nPoints - 1)]; } IntT hashTableSize = n; //哈希表大小也初始化为n,是指hashTableSize放的点的个数,还是放的桶的个数? RNNParametersT algParameters; algParameters.parameterR = thresholdR; //半径 algParameters.successProbability = successProbability; algParameters.dimension = dimension; #ifdef USE_L1_DISTANCE algParameters.parameterR2 = thresholdR; //使用L1距离,R2=R #else algParameters.parameterR2 = SQR(thresholdR); //使用L2 R2=R^2 #endif algParameters.useUfunctions = useUfunctions; algParameters.parameterK = 16; //k 设定为16,只是测试,估算运算时间,可能是先随机设置一个时间,之后再在代码中改成16,因为16是bestK. 
algParameters.parameterW = PARAMETER_W_DEFAULT; //W=4,manuel中说经过多次测试,4是最好的值 algParameters.parameterT = n; //点的个数 algParameters.typeHT = typeHT; //桶的类型HT_HYBRID_CHAINS,在line405里面定义的。 if (algParameters.useUfunctions){ algParameters.parameterM = computeMForULSH(algParameters.parameterK, algParameters.successProbability); //经过改进的L和M algParameters.parameterL = algParameters.parameterM * (algParameters.parameterM - 1) / 2; }else{ algParameters.parameterM = computeLfromKP(algParameters.parameterK, algParameters.successProbability); //论文里面的M=L algParameters.parameterL = algParameters.parameterM; } // FAILIF(NULL == (dataSet = (PPointT*)MALLOC(n * sizeof(PPointT)))); // for(IntT i = 0; i < n; i++){ // FAILIF(NULL == (dataSet[i] = (PPointT)MALLOC(sizeof(PointT)))); // FAILIF(NULL == (dataSet[i]->coordinates = (RealT*)MALLOC(dimension * sizeof(RealT)))); // dataSet[i]->index = i; // sqrLength = 0; // for(IntT d = 0; d < dimension; d++){ // if (i == 0) { // dataSet[i]->coordinates[d] = genUniformRandom(-100, 100); // }else{ // dataSet[i]->coordinates[d] = dataSet[0]->coordinates[d]; // } // sqrLength += SQR(dataSet[i]->coordinates[d]); // } // dataSet[i]->sqrLength = sqrLength; // } // switch on timing BooleanT tempTimingOn = timingOn; //初始化为True timingOn = TRUE; // initialize result arrays PPointT *result = NULL; //结果集以及其初始化 IntT resultSize = 0; IntT nNNs; IntT nSucReps; do{ // create the test structure PRNearNeighborStructT nnStruct; switch(algParameters.typeHT){ case HT_LINKED_LIST: nnStruct = initLSH(algParameters, n); // add points to the test structure for(IntT i = 0; i < n; i++){ addNewPointToPRNearNeighborStruct(nnStruct, realData[i]); } break; case HT_HYBRID_CHAINS: nnStruct = initLSH_WithDataSet(algParameters, n, dataSet); //初始化数据结构,参数集,点的个数,数据集,对点进行映射转换,桶进行映射转换,点存入桶中 break; default: ASSERT(FALSE); } // query point PPointT queryPoint; // FAILIF(NULL == (queryPoint = (PPointT)MALLOC(sizeof(PointT)))); // FAILIF(NULL == (queryPoint->coordinates = 
(RealT*)MALLOC(dimension * sizeof(RealT)))); // RealT sqrLength = 0; // for(IntT i = 0; i < dimension; i++){ // queryPoint->coordinates[i] = dataSet[0]->coordinates[i]; // //queryPoint->coordinates[i] = 0.1; // sqrLength += SQR(queryPoint->coordinates[i]); // } //queryPoint->coordinates[0] = dataPoint->coordinates[0] + 0.0001; //queryPoint->sqrLength = sqrLength; // reset the R parameter so that there are no NN neighbors. setResultReporting(nnStruct, FALSE); //DPRINTF1("X\n"); lshPrecomp = 0; uhashOver = 0; distComp = 0; IntT nReps = 20; nSucReps = 0; for(IntT rep = 0; rep < nReps; rep++){ queryPoint = realData[genRandomInt(0, nPoints - 1)]; //查询点为数据集中随机抽取出来的一个点 timeComputeULSH = 0; timeGetBucket = 0; timeCycleBucket = 0; nOfDistComps = 0; //点与点比较的次数 //返回查找到的近邻点数,并将查询到的近邻点存入result中。 nNNs = getNearNeighborsFromPRNearNeighborStruct(nnStruct, queryPoint, result, resultSize); //DPRINTF("Time to compute LSH: %0.6lf\n", timeComputeULSH); //DPRINTF("Time to get bucket: %0.6lf\n", timeGetBucket); //DPRINTF("Time to cycle through buckets: %0.9lf\n", timeCycleBucket); //DPRINTF("N of dist comp: %d\n", nOfDistComps); ASSERT(nNNs == 0); //若一个点都没有找到,将发生中断。 if (nOfDistComps >= MIN(n / 10, 100)){ //与足够的点比较过,才将时间计入 nSucReps++; lshPrecomp += timeComputeULSH / algParameters.parameterK / algParameters.parameterM; //一个点对一个哈希函数的处理时间。共有k*L个哈希函数 uhashOver += timeGetBucket / algParameters.parameterL; //找到一个链表中桶的时间 distComp += timeCycleBucket / nOfDistComps; //遍历链表中桶,并与桶里面的点比较的时间 } } if (nSucReps >= 5){ lshPrecomp /= nSucReps; uhashOver /= nSucReps; distComp /= nSucReps; DPRINTF1("RT coeffs computed.\n"); }else{ algParameters.parameterR *= 2; // double the radius and repeat //比较的点数不够,将半径扩大,重复比较 DPRINTF1("Could not determine the RT coeffs. Repeating.\n"); } freePRNearNeighborStruct(nnStruct); }while(nSucReps < 5); //做一个有效值的判断,要获得5次有效值 FREE(dataSet); FREE(result); timingOn = tempTimingOn; }
// Determines the run-time coefficients of the different parts of the // query algorithm. Values that are computed and returned are // <lshPrecomp>, <uhashOver>, <distComp>. <lshPrecomp> is the time for // pre-computing one function from the LSH family. <uhashOver> is the // time for getting a bucket from a hash table (of buckets).<distComp> // is the time to compute one distance between two points. These times // are computed by constructing a R-NN DS on a sample data set and // running a sample query set on it. void determineRTCoefficients(RealT thresholdR, RealT successProbability, BooleanT useUfunctions, IntT typeHT, IntT dimension, Int32T nPoints, PPointT *realData, RealT &lshPrecomp, RealT &uhashOver, RealT &distComp){ // use a subset of the original data set. // there is not much theory behind the formula below. IntT n = nPoints / 50; if (n < 100) { n = nPoints; } if (n > 10000) { n = 10000; } // Initialize the data set to use. PPointT *dataSet; FAILIF(NULL == (dataSet = (PPointT*)MALLOC(n * sizeof(PPointT)))); for(IntT i = 0; i < n; i++){ dataSet[i] = realData[genRandomInt(0, nPoints - 1)]; } IntT hashTableSize = n; RNNParametersT algParameters; algParameters.parameterR = thresholdR; algParameters.successProbability = successProbability; algParameters.dimension = dimension; #ifdef USE_L1_DISTANCE algParameters.parameterR2 = thresholdR; #else algParameters.parameterR2 = SQR(thresholdR); #endif algParameters.useUfunctions = useUfunctions; algParameters.parameterK = 16; algParameters.parameterW = PARAMETER_W_DEFAULT; algParameters.parameterT = n; algParameters.typeHT = typeHT; if (algParameters.useUfunctions){ algParameters.parameterM = computeMForULSH(algParameters.parameterK, algParameters.successProbability); algParameters.parameterL = algParameters.parameterM * (algParameters.parameterM - 1) / 2; }else{ algParameters.parameterM = computeLfromKP(algParameters.parameterK, algParameters.successProbability); algParameters.parameterL = algParameters.parameterM; 
} // FAILIF(NULL == (dataSet = (PPointT*)MALLOC(n * sizeof(PPointT)))); // for(IntT i = 0; i < n; i++){ // FAILIF(NULL == (dataSet[i] = (PPointT)MALLOC(sizeof(PointT)))); // FAILIF(NULL == (dataSet[i]->coordinates = (RealT*)MALLOC(dimension * sizeof(RealT)))); // dataSet[i]->index = i; // sqrLength = 0; // for(IntT d = 0; d < dimension; d++){ // if (i == 0) { // dataSet[i]->coordinates[d] = genUniformRandom(-100, 100); // }else{ // dataSet[i]->coordinates[d] = dataSet[0]->coordinates[d]; // } // sqrLength += SQR(dataSet[i]->coordinates[d]); // } // dataSet[i]->sqrLength = sqrLength; // } // switch on timing BooleanT tempTimingOn = timingOn; timingOn = TRUE; // initialize result arrays PPointT *result = NULL; IntT resultSize = 0; IntT nNNs; IntT nSucReps; do{ // create the test structure PRNearNeighborStructT nnStruct; switch(algParameters.typeHT){ case HT_LINKED_LIST: nnStruct = initLSH(algParameters, n); // add points to the test structure for(IntT i = 0; i < n; i++){ addNewPointToPRNearNeighborStruct(nnStruct, realData[i]); } break; case HT_HYBRID_CHAINS: nnStruct = initLSH_WithDataSet(algParameters, n, dataSet); break; default: ASSERT(FALSE); } // query point PPointT queryPoint; // FAILIF(NULL == (queryPoint = (PPointT)MALLOC(sizeof(PointT)))); // FAILIF(NULL == (queryPoint->coordinates = (RealT*)MALLOC(dimension * sizeof(RealT)))); // RealT sqrLength = 0; // for(IntT i = 0; i < dimension; i++){ // queryPoint->coordinates[i] = dataSet[0]->coordinates[i]; // //queryPoint->coordinates[i] = 0.1; // sqrLength += SQR(queryPoint->coordinates[i]); // } //queryPoint->coordinates[0] = dataPoint->coordinates[0] + 0.0001; //queryPoint->sqrLength = sqrLength; // reset the R parameter so that there are no NN neighbors. 
setResultReporting(nnStruct, FALSE); //DPRINTF1("X\n"); lshPrecomp = 0; uhashOver = 0; distComp = 0; IntT nReps = 20; nSucReps = 0; for(IntT rep = 0; rep < nReps; rep++){ queryPoint = realData[genRandomInt(0, nPoints - 1)]; timeComputeULSH = 0; timeGetBucket = 0; timeCycleBucket = 0; nOfDistComps = 0; nNNs = getNearNeighborsFromPRNearNeighborStruct(nnStruct, queryPoint, result, resultSize); //DPRINTF("Time to compute LSH: %0.6lf\n", timeComputeULSH); //DPRINTF("Time to get bucket: %0.6lf\n", timeGetBucket); //DPRINTF("Time to cycle through buckets: %0.9lf\n", timeCycleBucket); //DPRINTF("N of dist comp: %d\n", nOfDistComps); ASSERT(nNNs == 0); if (nOfDistComps >= MIN(n / 10, 100)){ nSucReps++; lshPrecomp += timeComputeULSH / algParameters.parameterK / algParameters.parameterM; uhashOver += timeGetBucket / algParameters.parameterL; distComp += timeCycleBucket / nOfDistComps; } } if (nSucReps >= 5){ lshPrecomp /= nSucReps; uhashOver /= nSucReps; distComp /= nSucReps; DPRINTF1("RT coeffs computed.\n"); }else{ algParameters.parameterR *= 2; // double the radius and repeat DPRINTF1("Could not determine the RT coeffs. Repeating.\n"); } freePRNearNeighborStruct(nnStruct); }while(nSucReps < 5); FREE(dataSet); FREE(result); timingOn = tempTimingOn; }