// Profiled wrapper around getNearNeighborsFromPRNearNeighborStruct.
//
// Prints the estimated costs derived from the global coefficients
// <lshPrecomp> and <uhashOver>, clears every global timer/counter
// listed below, runs one query for <queryPoint> on <nnStruct> while
// timing it in <timeRNNQuery>, and finally prints the collected
// timings and counters through DPRINTF.
//
// <result> and <resultSize> are passed by reference straight through
// to getNearNeighborsFromPRNearNeighborStruct. The return value is
// whatever that call returns.
Int32T getRNearNeighbors(PRNearNeighborStructT nnStruct, PPointT queryPoint, PPointT *(&result), Int32T &resultSize){
  // Cost estimates based on the structure's LSH layout.
  DPRINTF("Estimated ULSH comp: %0.6lf\n", lshPrecomp * nnStruct->nHFTuples * nnStruct->hfTuplesLength);
  DPRINTF("Estimated UH overhead: %0.6lf\n", uhashOver * nnStruct->nHFTuples);

  // Start from a clean slate: timers first, then the counter.
  timeRNNQuery = 0;
  timePrecomputeHash = 0;
  timeComputeULSH = 0;
  timeGBHash = 0;
  timeGetBucket = 0;
  timeChainTraversal = 0;
  timeCycleBucket = 0;
  timeCycleProc = 0;
  timeTotalBuckets = 0;
  timeDistanceComputation = 0;
  timeResultStoring = 0;
  nOfDistComps = 0;

  // The query itself, bracketed by the overall timer.
  TIMEV_START(timeRNNQuery);
  noExpensiveTiming = !DEBUG_PROFILE_TIMING;
  Int32T foundCount = getNearNeighborsFromPRNearNeighborStruct(nnStruct, queryPoint, result, resultSize);
  TIMEV_END(timeRNNQuery);

  // Per-phase timings.
  DPRINTF("Time to compute LSH: %0.6lf\n", timeComputeULSH);
  DPRINTF("Time to get bucket: %0.6lf\n", timeGetBucket);
  DPRINTF("Time to cycle through buckets: %0.6lf\n", timeCycleBucket);
  DPRINTF("Time to for processing buckets (UH+examining points): %0.6lf\n", timeTotalBuckets);
  DPRINTF("Time for distance comps: %0.6lf\n", timeDistanceComputation);
  DPRINTF("Time to store result: %0.6lf\n", timeResultStoring);

  // Counters and the cumulative query time.
  DPRINTF("Number of dist comps: %d\n", nOfDistComps);
  DPRINTF("Number buckets in chains: %d\n", nBucketsInChains);
  DPRINTF("Number buckets in chains / L: %0.3lf\n", (double)nBucketsInChains / nnStruct->nHFTuples);
  DPRINTF("Cumulative time for R-NN query: %0.6lf\n", timeRNNQuery);

  return foundCount;
}
// Determines the run-time coefficients of the different parts of the //确定查询算法不同部分的运行时间 // query algorithm. Values that are computed and returned are // <lshPrecomp>, <uhashOver>, <distComp>. <lshPrecomp> is the time for // pre-computing one function from the LSH family. <uhashOver> is the // time for getting a bucket from a hash table (of buckets).<distComp> // is the time to compute one distance between two points. These times // are computed by constructing a R-NN DS on a sample data set and // running a sample query set on it. void determineRTCoefficients(RealT thresholdR, RealT successProbability, BooleanT useUfunctions, IntT typeHT, //建立hash表的类型 IntT dimension, Int32T nPoints, PPointT *realData, RealT &lshPrecomp, RealT &uhashOver, RealT &distComp){ // use a subset of the original data set. 使用原始数据集的一个子集 // there is not much theory behind the formula below. //减小运算规模 IntT n = nPoints / 50; //最多生成n各点,缩小50倍 if (n < 100) { //如果生成的点的个数小于100,则使桶的数量与数据集点的数量一样多 n = nPoints; } if (n > 10000) { n = 10000; } // Initialize the data set to use. PPointT *dataSet; FAILIF(NULL == (dataSet = (PPointT*)MALLOC(n * sizeof(PPointT)))); for(IntT i = 0; i < n; i++){ //从真实数据集中随机取n个点 (最多10000个) dataSet[i] = realData[genRandomInt(0, nPoints - 1)]; } IntT hashTableSize = n; //哈希表大小也初始化为n,是指hashTableSize放的点的个数,还是放的桶的个数? RNNParametersT algParameters; algParameters.parameterR = thresholdR; //半径 algParameters.successProbability = successProbability; algParameters.dimension = dimension; #ifdef USE_L1_DISTANCE algParameters.parameterR2 = thresholdR; //使用L1距离,R2=R #else algParameters.parameterR2 = SQR(thresholdR); //使用L2 R2=R^2 #endif algParameters.useUfunctions = useUfunctions; algParameters.parameterK = 16; //k 设定为16,只是测试,估算运算时间,可能是先随机设置一个时间,之后再在代码中改成16,因为16是bestK. 
algParameters.parameterW = PARAMETER_W_DEFAULT; //W=4,manuel中说经过多次测试,4是最好的值 algParameters.parameterT = n; //点的个数 algParameters.typeHT = typeHT; //桶的类型HT_HYBRID_CHAINS,在line405里面定义的。 if (algParameters.useUfunctions){ algParameters.parameterM = computeMForULSH(algParameters.parameterK, algParameters.successProbability); //经过改进的L和M algParameters.parameterL = algParameters.parameterM * (algParameters.parameterM - 1) / 2; }else{ algParameters.parameterM = computeLfromKP(algParameters.parameterK, algParameters.successProbability); //论文里面的M=L algParameters.parameterL = algParameters.parameterM; } // FAILIF(NULL == (dataSet = (PPointT*)MALLOC(n * sizeof(PPointT)))); // for(IntT i = 0; i < n; i++){ // FAILIF(NULL == (dataSet[i] = (PPointT)MALLOC(sizeof(PointT)))); // FAILIF(NULL == (dataSet[i]->coordinates = (RealT*)MALLOC(dimension * sizeof(RealT)))); // dataSet[i]->index = i; // sqrLength = 0; // for(IntT d = 0; d < dimension; d++){ // if (i == 0) { // dataSet[i]->coordinates[d] = genUniformRandom(-100, 100); // }else{ // dataSet[i]->coordinates[d] = dataSet[0]->coordinates[d]; // } // sqrLength += SQR(dataSet[i]->coordinates[d]); // } // dataSet[i]->sqrLength = sqrLength; // } // switch on timing BooleanT tempTimingOn = timingOn; //初始化为True timingOn = TRUE; // initialize result arrays PPointT *result = NULL; //结果集以及其初始化 IntT resultSize = 0; IntT nNNs; IntT nSucReps; do{ // create the test structure PRNearNeighborStructT nnStruct; switch(algParameters.typeHT){ case HT_LINKED_LIST: nnStruct = initLSH(algParameters, n); // add points to the test structure for(IntT i = 0; i < n; i++){ addNewPointToPRNearNeighborStruct(nnStruct, realData[i]); } break; case HT_HYBRID_CHAINS: nnStruct = initLSH_WithDataSet(algParameters, n, dataSet); //初始化数据结构,参数集,点的个数,数据集,对点进行映射转换,桶进行映射转换,点存入桶中 break; default: ASSERT(FALSE); } // query point PPointT queryPoint; // FAILIF(NULL == (queryPoint = (PPointT)MALLOC(sizeof(PointT)))); // FAILIF(NULL == (queryPoint->coordinates = 
(RealT*)MALLOC(dimension * sizeof(RealT)))); // RealT sqrLength = 0; // for(IntT i = 0; i < dimension; i++){ // queryPoint->coordinates[i] = dataSet[0]->coordinates[i]; // //queryPoint->coordinates[i] = 0.1; // sqrLength += SQR(queryPoint->coordinates[i]); // } //queryPoint->coordinates[0] = dataPoint->coordinates[0] + 0.0001; //queryPoint->sqrLength = sqrLength; // reset the R parameter so that there are no NN neighbors. setResultReporting(nnStruct, FALSE); //DPRINTF1("X\n"); lshPrecomp = 0; uhashOver = 0; distComp = 0; IntT nReps = 20; nSucReps = 0; for(IntT rep = 0; rep < nReps; rep++){ queryPoint = realData[genRandomInt(0, nPoints - 1)]; //查询点为数据集中随机抽取出来的一个点 timeComputeULSH = 0; timeGetBucket = 0; timeCycleBucket = 0; nOfDistComps = 0; //点与点比较的次数 //返回查找到的近邻点数,并将查询到的近邻点存入result中。 nNNs = getNearNeighborsFromPRNearNeighborStruct(nnStruct, queryPoint, result, resultSize); //DPRINTF("Time to compute LSH: %0.6lf\n", timeComputeULSH); //DPRINTF("Time to get bucket: %0.6lf\n", timeGetBucket); //DPRINTF("Time to cycle through buckets: %0.9lf\n", timeCycleBucket); //DPRINTF("N of dist comp: %d\n", nOfDistComps); ASSERT(nNNs == 0); //若一个点都没有找到,将发生中断。 if (nOfDistComps >= MIN(n / 10, 100)){ //与足够的点比较过,才将时间计入 nSucReps++; lshPrecomp += timeComputeULSH / algParameters.parameterK / algParameters.parameterM; //一个点对一个哈希函数的处理时间。共有k*L个哈希函数 uhashOver += timeGetBucket / algParameters.parameterL; //找到一个链表中桶的时间 distComp += timeCycleBucket / nOfDistComps; //遍历链表中桶,并与桶里面的点比较的时间 } } if (nSucReps >= 5){ lshPrecomp /= nSucReps; uhashOver /= nSucReps; distComp /= nSucReps; DPRINTF1("RT coeffs computed.\n"); }else{ algParameters.parameterR *= 2; // double the radius and repeat //比较的点数不够,将半径扩大,重复比较 DPRINTF1("Could not determine the RT coeffs. Repeating.\n"); } freePRNearNeighborStruct(nnStruct); }while(nSucReps < 5); //做一个有效值的判断,要获得5次有效值 FREE(dataSet); FREE(result); timingOn = tempTimingOn; }
// Determines the run-time coefficients of the different parts of the // query algorithm. Values that are computed and returned are // <lshPrecomp>, <uhashOver>, <distComp>. <lshPrecomp> is the time for // pre-computing one function from the LSH family. <uhashOver> is the // time for getting a bucket from a hash table (of buckets).<distComp> // is the time to compute one distance between two points. These times // are computed by constructing a R-NN DS on a sample data set and // running a sample query set on it. void determineRTCoefficients(RealT thresholdR, RealT successProbability, BooleanT useUfunctions, IntT typeHT, IntT dimension, Int32T nPoints, PPointT *realData, RealT &lshPrecomp, RealT &uhashOver, RealT &distComp){ // use a subset of the original data set. // there is not much theory behind the formula below. IntT n = nPoints / 50; if (n < 100) { n = nPoints; } if (n > 10000) { n = 10000; } // Initialize the data set to use. PPointT *dataSet; FAILIF(NULL == (dataSet = (PPointT*)MALLOC(n * sizeof(PPointT)))); for(IntT i = 0; i < n; i++){ dataSet[i] = realData[genRandomInt(0, nPoints - 1)]; } IntT hashTableSize = n; RNNParametersT algParameters; algParameters.parameterR = thresholdR; algParameters.successProbability = successProbability; algParameters.dimension = dimension; #ifdef USE_L1_DISTANCE algParameters.parameterR2 = thresholdR; #else algParameters.parameterR2 = SQR(thresholdR); #endif algParameters.useUfunctions = useUfunctions; algParameters.parameterK = 16; algParameters.parameterW = PARAMETER_W_DEFAULT; algParameters.parameterT = n; algParameters.typeHT = typeHT; if (algParameters.useUfunctions){ algParameters.parameterM = computeMForULSH(algParameters.parameterK, algParameters.successProbability); algParameters.parameterL = algParameters.parameterM * (algParameters.parameterM - 1) / 2; }else{ algParameters.parameterM = computeLfromKP(algParameters.parameterK, algParameters.successProbability); algParameters.parameterL = algParameters.parameterM; 
} // FAILIF(NULL == (dataSet = (PPointT*)MALLOC(n * sizeof(PPointT)))); // for(IntT i = 0; i < n; i++){ // FAILIF(NULL == (dataSet[i] = (PPointT)MALLOC(sizeof(PointT)))); // FAILIF(NULL == (dataSet[i]->coordinates = (RealT*)MALLOC(dimension * sizeof(RealT)))); // dataSet[i]->index = i; // sqrLength = 0; // for(IntT d = 0; d < dimension; d++){ // if (i == 0) { // dataSet[i]->coordinates[d] = genUniformRandom(-100, 100); // }else{ // dataSet[i]->coordinates[d] = dataSet[0]->coordinates[d]; // } // sqrLength += SQR(dataSet[i]->coordinates[d]); // } // dataSet[i]->sqrLength = sqrLength; // } // switch on timing BooleanT tempTimingOn = timingOn; timingOn = TRUE; // initialize result arrays PPointT *result = NULL; IntT resultSize = 0; IntT nNNs; IntT nSucReps; do{ // create the test structure PRNearNeighborStructT nnStruct; switch(algParameters.typeHT){ case HT_LINKED_LIST: nnStruct = initLSH(algParameters, n); // add points to the test structure for(IntT i = 0; i < n; i++){ addNewPointToPRNearNeighborStruct(nnStruct, realData[i]); } break; case HT_HYBRID_CHAINS: nnStruct = initLSH_WithDataSet(algParameters, n, dataSet); break; default: ASSERT(FALSE); } // query point PPointT queryPoint; // FAILIF(NULL == (queryPoint = (PPointT)MALLOC(sizeof(PointT)))); // FAILIF(NULL == (queryPoint->coordinates = (RealT*)MALLOC(dimension * sizeof(RealT)))); // RealT sqrLength = 0; // for(IntT i = 0; i < dimension; i++){ // queryPoint->coordinates[i] = dataSet[0]->coordinates[i]; // //queryPoint->coordinates[i] = 0.1; // sqrLength += SQR(queryPoint->coordinates[i]); // } //queryPoint->coordinates[0] = dataPoint->coordinates[0] + 0.0001; //queryPoint->sqrLength = sqrLength; // reset the R parameter so that there are no NN neighbors. 
setResultReporting(nnStruct, FALSE); //DPRINTF1("X\n"); lshPrecomp = 0; uhashOver = 0; distComp = 0; IntT nReps = 20; nSucReps = 0; for(IntT rep = 0; rep < nReps; rep++){ queryPoint = realData[genRandomInt(0, nPoints - 1)]; timeComputeULSH = 0; timeGetBucket = 0; timeCycleBucket = 0; nOfDistComps = 0; nNNs = getNearNeighborsFromPRNearNeighborStruct(nnStruct, queryPoint, result, resultSize); //DPRINTF("Time to compute LSH: %0.6lf\n", timeComputeULSH); //DPRINTF("Time to get bucket: %0.6lf\n", timeGetBucket); //DPRINTF("Time to cycle through buckets: %0.9lf\n", timeCycleBucket); //DPRINTF("N of dist comp: %d\n", nOfDistComps); ASSERT(nNNs == 0); if (nOfDistComps >= MIN(n / 10, 100)){ nSucReps++; lshPrecomp += timeComputeULSH / algParameters.parameterK / algParameters.parameterM; uhashOver += timeGetBucket / algParameters.parameterL; distComp += timeCycleBucket / nOfDistComps; } } if (nSucReps >= 5){ lshPrecomp /= nSucReps; uhashOver /= nSucReps; distComp /= nSucReps; DPRINTF1("RT coeffs computed.\n"); }else{ algParameters.parameterR *= 2; // double the radius and repeat DPRINTF1("Could not determine the RT coeffs. Repeating.\n"); } freePRNearNeighborStruct(nnStruct); }while(nSucReps < 5); FREE(dataSet); FREE(result); timingOn = tempTimingOn; }