/* The main entry to LSH package. Depending on the command line parameters, the function computes the R-NN data structure optimal parameters and/or construct the R-NN data structure and runs the queries on the data structure. */ int main(int argc, char *argv[]){ FAILIF(0 != regcomp(&preg[ENUM_PPROP_FILE], "FILE:([^,]+)", REG_EXTENDED)); FAILIF(0 != regcomp(&preg[ENUM_PPROP_LINE], "LINE:([0-9]+)", REG_EXTENDED)); FAILIF(0 != regcomp(&preg[ENUM_PPROP_OFFSET], "OFFSET:([0-9]+)", REG_EXTENDED)); FAILIF(0 != regcomp(&preg[ENUM_PPROP_NODE_KIND], "NODE_KIND:([0-9]+)", REG_EXTENDED)); FAILIF(0 != regcomp(&preg[ENUM_PPROP_NUM_NODE], "NUM_NODE:([0-9]+)", REG_EXTENDED)); FAILIF(0 != regcomp(&preg[ENUM_PPROP_NUM_DECL], "NUM_DECL:([0-9]+)", REG_EXTENDED)); FAILIF(0 != regcomp(&preg[ENUM_PPROP_NUM_STMT], "NUM_STMT:([0-9]+)", REG_EXTENDED)); FAILIF(0 != regcomp(&preg[ENUM_PPROP_NUM_EXPR], "NUM_EXPR:([0-9]+)", REG_EXTENDED)); FAILIF(0 != regcomp(&preg[ENUM_PPROP_TBID], "TBID:([-]?[0-9]+)", REG_EXTENDED)); FAILIF(0 != regcomp(&preg[ENUM_PPROP_TEID], "TEID:([-]?[0-9]+)", REG_EXTENDED)); FAILIF(0 != regcomp(&preg[ENUM_PPROP_nVARs], "VARs:\\{[^}]*\\}([0-9]+)", REG_EXTENDED)); FAILIF(0 != regcomp(&preg[ENUM_PPROP_CONTEXT_KIND], "CONTEXT_KIND:([0-9]+)", REG_EXTENDED)); FAILIF(0 != regcomp(&preg[ENUM_PPROP_NEIGHBOR_KIND], "NEIGHBOR_KIND:([0-9]+)", REG_EXTENDED)); FAILIF(0 != regcomp(&preg[ENUM_PPROP_OIDs], "OIDs:\\{[^}]*\\}([0-9]+)", REG_EXTENDED)); // TODO, pair-wise comparision of Vars. //initializeLSHGlobal(); availableTotalMemory = 800000000; // Parse part of the command-line parameters. bool computeParameters = false; char *paramsFile = NULL; // Parameters for filtering: bool no_filtering = false, bug_detecting = true; int upperBound = 0, lowerBound = 2; int minNumNodes = 0, min_nVars = 0; int max_num_diff_vars = 16; float max_num_diff_nVars_diff = 0.5, max_nVars_diff = 0.35; bool interfiles = false; int min_lines = 0; for (int opt; (opt = getopt(argc, argv, "ABl:v:V:e:E:a:m:N:d:p:P:R:M:cFf:b:t:")) != -1; ) { // Needed: -p -f -R switch (opt) { case 'A': fprintf(stderr, "Warning: output all clones. Takes more time...\n"); no_filtering = true; break; case 'B': fprintf(stderr, "Warning: no filtering for bugs now.\n"); bug_detecting = false; break; case 'l': min_lines = atoi(optarg); break; case 'v': min_nVars = atoi(optarg); break; case 'V': max_num_diff_vars = atoi(optarg); break; case 'e': max_num_diff_nVars_diff = atof(optarg); break; case 'E': max_nVars_diff = atof(optarg); break; case 'm': minNumNodes = atoi(optarg); break; case 'b': lowerBound = atoi(optarg); break; case 't': upperBound = atoi(optarg); break; case 'N': nPoints = atol(optarg); break; case 'd': pointsDimension = atol(optarg); break; case 'p': paramsFile = optarg; break; case 'P': successProbability = atof(optarg); break; case 'M': availableTotalMemory = atol(optarg); break; case 'a': prefetch = atol(optarg); break; case 'c': fprintf(stderr, "Warning: will compute parameters\n"); computeParameters = true; break; case 'F': fprintf(stderr, "Warning: inter-file clone detection. Takes more time...\n"); interfiles = true; break; case 'R': nRadii = 1; FAILIF(NULL == (listOfRadii = (RealT*)MALLOC(nRadii * sizeof(RealT)))); FAILIF(NULL == (memRatiosForNNStructs = (RealT*)MALLOC(nRadii * sizeof(RealT)))); listOfRadii[0] = strtod(optarg, NULL); memRatiosForNNStructs[0] = 1; break; case 'f': readDataSetFromFile2(optarg); DPRINTF("Allocated memory (after reading data set): %ld\n", totalAllocatedMemory); break; default: fprintf(stderr, "Unknown option: -%c\n", opt); usage(1, argv[0]); } } if (optind < argc) { fprintf(stderr, "There are unprocessed parameters left\n"); usage(1, argv[0]); } CHECK_INT(availableTotalMemory); CHECK_INT(nPoints); CHECK_INT(pointsDimension); CHECK_INT(nRadii); if (nPoints > MAX_N_POINTS) { printf("Error: the structure supports at most %ld points (%ld were specified).\n", MAX_N_POINTS, nPoints); fprintf(ERROR_OUTPUT, "Error: the structure supports at most %ld points (%ld were specified).\n", MAX_N_POINTS, nPoints); exit(1); } if (computeParameters == false) computeParameters = readParamsFile(paramsFile); if (computeParameters) { IntT nSampleQueries = N_SAMPLE_QUERY_POINTS; PPointT sampleQueries[nSampleQueries]; IntT sampleQBoundaryIndeces[nSampleQueries]; // Choose several data set points for the sample query points. for(IntT i = 0; i < nSampleQueries; i++){ sampleQueries[i] = dataSetPoints[genRandomInt(0, nPoints - 1)]; } // Compute the array sampleQBoundaryIndeces that specifies how to // segregate the sample query points according to their distance // to NN. sortQueryPointsByRadii(pointsDimension, nSampleQueries, sampleQueries, nPoints, dataSetPoints, nRadii, listOfRadii, sampleQBoundaryIndeces); // Compute the R-NN DS parameters // if a parameter file is given, output them to that file, and continue // otherwise, output them to stdout, and exit FILE *fd; if (paramsFile == NULL) { fd = stdout; } else { fd = fopen(paramsFile, "wt"); if (fd == NULL) { fprintf(stderr, "Unable to write to parameter file %s\n", paramsFile); exit(1); } } fprintf(fd, "%ld\n", nRadii); transformMemRatios(); for(IntT i = 0; i < nRadii; i++) { // which sample queries to use IntT segregatedQStart = (i == 0) ? 0 : sampleQBoundaryIndeces[i - 1]; IntT segregatedQNumber = nSampleQueries - segregatedQStart; if (segregatedQNumber == 0) { // XXX: not the right answer segregatedQNumber = nSampleQueries; segregatedQStart = 0; } ASSERT(segregatedQStart < nSampleQueries); ASSERT(segregatedQStart >= 0); ASSERT(segregatedQStart + segregatedQNumber <= nSampleQueries); ASSERT(segregatedQNumber >= 0); RNNParametersT optParameters = computeOptimalParameters(listOfRadii[i], successProbability, nPoints, pointsDimension, dataSetPoints, segregatedQNumber, sampleQueries + segregatedQStart, (UnsT)((availableTotalMemory - totalAllocatedMemory) * memRatiosForNNStructs[i])); printRNNParameters(fd, optParameters); } if (fd == stdout) { exit(0); } else { fclose(fd); ASSERT(readParamsFile(paramsFile) == false); } } // output vector clusters according to the filtering parameters. printf("========================= Structure built =========================\n"); printf("nPoints = %ld, Dimension = %ld\n", nPoints, pointsDimension); printf("no_filtering (0/1) = %d, inter-file (0/1) = %d, prefetch = %ld\n", no_filtering, interfiles, prefetch); printf("*** Filtering Parameters for individual vectors ***\n"); printf("minNumNodes = %d, min_nVars = %d, min_lines = %d\n", minNumNodes, min_nVars, min_lines); printf("*** Filtering Parameters for clusters ***\n"); printf("lowerBound = %d, upperBound = %d\n", lowerBound, upperBound); printf("Max num of different nVars = %d, Max diff among different nVars = %g, \nMax diff among the num of different nVars = %g\n", max_num_diff_vars, max_nVars_diff, max_num_diff_nVars_diff); IntT resultSize = nPoints; PPointT *result = (PPointT*)MALLOC(resultSize * sizeof(*result)); PPointT queryPoint; FAILIF(NULL == (queryPoint = (PPointT)MALLOC(sizeof(PointT)))); FAILIF(NULL == (queryPoint->coordinates = (RealT*)MALLOC(pointsDimension * sizeof(RealT)))); TimeVarT meanQueryTime = 0; IntT nQueries = 0; bool seen[nPoints]; IntT nBuckets = 0, nBucketedPoints = 0; memset(seen, 0, nPoints * sizeof(bool)); for(IntT i = 0; i < nPoints; nQueries++, i++) { // find the next unseen point while (i < nPoints && seen[i]) i++; if (i >= nPoints) break; queryPoint = dataSetPoints[i]; // get the near neighbors. IntT nNNs = 0; for(IntT r = 0; r < nRadii; r++) { // nRadii is always 1 so far. nNNs = getRNearNeighbors(nnStructs[r], queryPoint, result, resultSize); //printf("Total time for R-NN query at radius %0.6lf (radius no. %ld):\t%0.6lf\n", (double)(listOfRadii[r]), r, timeRNNQuery); meanQueryTime += timeRNNQuery; //printf("\nQuery point %ld: found %ld NNs at distance %0.6lf (radius no. %ld). NNs are:\n", // i, nNNs, (double)(listOfRadii[r]), r); // sort by filename, then number of variables, then line number qsort(result, nNNs, sizeof(*result), comparePoints); // The result array may contain the queryPoint, so do not output it in the following. PPointT *cur = result, *end = result + nNNs; if ( ! no_filtering ) { // Filter out certain vectors and clusters. while (cur < end) { // Shall we discard the rest results // and start over for a new point? Not // now for the sake of // performance...TODO ASSERT(*cur != NULL); // Look for the first un-filtered point for the next bucket. while ( cur < end ) { if ( pointIsNotFiltered(cur) ) { break; } seen[(*cur)->index] = true; cur++; } if ( cur >= end ) break; bool worthy = false; int sizeBucket = 1; // 1 means the first un-filtered point PPointT *begin = cur; seen[(*begin)->index] = true; cur++; while ( cur < end && // look for the next point outside the current file // if interfiles is false; that point is the end of // current bucket (assume vectors in a bucket are // sorted by their filenames already). ( interfiles || strcmp((*begin)->filename, (*cur)->filename)==0 ) ) { if ( pointIsNotFiltered(cur) ) { // prepare for filtering sizeBucket++; // the first heuristics for bugs AFTER filtering: worthy = worthy || (*begin)->prop[ENUM_PPROP_nVARs-1] != (*cur)->prop[ENUM_PPROP_nVARs-1]; // the second heuristics for bugs AFTER filtering: worthy = worthy || inconsistentIDchanges((*begin)->oids, (*cur)->oids); // TODO } seen[(*cur)->index] = true; cur++; } // output the bucket if: // - there are >= 2 different points // - there are <= upperBound (default 0) && >= lowerBound (default 2) points // - there are >= 2 different numbers of variables // and update nBuckets and nBucketedPoints consequently if (sizeBucket >= lowerBound && (upperBound < lowerBound || sizeBucket <= upperBound) && ( bug_detecting ? worthy : true ) ) { nBuckets++; printf("\n"); for (PPointT *p = begin; p < cur; p++) { ASSERT(*p != NULL); if ( pointIsNotFiltered(p) ) { nBucketedPoints++; // compute the distance to the query point (maybe useless) RealT distance = 0.; for (IntT i = 0; i < pointsDimension; i++) { RealT t = (*p)->coordinates[i] - queryPoint->coordinates[i]; // L1 distance // distance += (t >= 0) ? t : -t; // Pi--L2 distance, LSH uses L2 by default, we should output L2 distance here. distance += t*t; } // L1 distance // printf("%09d\tdist:%0.1lf", (*p)->index, distance); // L2 distance printf("%09d\tdist:%0.1lf", (*p)->index, sqrt(distance)); printf("\tFILE %s LINE:%d:%d NODE_KIND:%d nVARs:%d NUM_NODE:%d TBID:%d TEID:%d\n", (*p)->filename, (*p)->prop[ENUM_PPROP_LINE-1], (*p)->prop[ENUM_PPROP_OFFSET-1], (*p)->prop[ENUM_PPROP_NODE_KIND-1], (*p)->prop[ENUM_PPROP_nVARs-1], (*p)->prop[ENUM_PPROP_NUM_NODE-1], (*p)->prop[ENUM_PPROP_TBID-1], (*p)->prop[ENUM_PPROP_TEID-1]); //CR_ASSERT(distance(pointsDimension, queryPoint, *p) <= listOfRadii[r]); //DPRINTF("Distance: %lf\n", distance(pointsDimension, queryPoint, result[j])); //printRealVector("NN: ", pointsDimension, result[j]->coordinates); } } } // end of enumeration of a bucket } // end of !no_filtering } else { if ( nNNs>=lowerBound ) { // filter out non-clones anyway nBuckets++; printf("\n"); for (PPointT *p = cur; p < end; p++) { ASSERT(*p != NULL); nBucketedPoints++; seen[(*p)->index] = true; // compute the distance to the query point (maybe useless) RealT distance = 0.; for (IntT i = 0; i < pointsDimension; i++) { RealT t = (*p)->coordinates[i] - queryPoint->coordinates[i]; // L1 distance // distance += (t >= 0) ? t : -t; // Pi--L2 distance, LSH uses L2 by default, we should output L2 distance here. distance += t*t; } // L1 distance // printf("%09d\tdist:%0.1lf", (*p)->index, distance); // L2 distance printf("%09d\tdist:%0.1lf", (*p)->index, sqrt(distance)); printf("\tFILE %s LINE:%d:%d NODE_KIND:%d nVARs:%d NUM_NODE:%d TBID:%d TEID:%d\n", (*p)->filename, (*p)->prop[ENUM_PPROP_LINE-1], (*p)->prop[ENUM_PPROP_OFFSET-1], (*p)->prop[ENUM_PPROP_NODE_KIND-1], (*p)->prop[ENUM_PPROP_nVARs-1], (*p)->prop[ENUM_PPROP_NUM_NODE-1], (*p)->prop[ENUM_PPROP_TBID-1], (*p)->prop[ENUM_PPROP_TEID-1]); //CR_ASSERT(distance(pointsDimension, queryPoint, *p) <= listOfRadii[r]); //DPRINTF("Distance: %lf\n", distance(pointsDimension, queryPoint, result[j])); //printRealVector("NN: ", pointsDimension, result[j]->coordinates); } // end of enumeration of a bucket } // end of nNNs>=lowerBound } // end of no_filtering and exploration of NNs } // for (...nRadii...) } // Simple statistics and finish if (nQueries > 0) { meanQueryTime = meanQueryTime / nQueries; printf("\n%ld queries, Mean query time: %0.6lf\n", nQueries, (double)meanQueryTime); printf("%ld buckets, %ld points (out of %ld, %.2f %%) in them\n", nBuckets, nBucketedPoints, nPoints, 100*(float)nBucketedPoints/(float)nPoints); } else { printf("No query\n"); } //freePRNearNeighborStruct(nnStruct); return 0; }
// Determines the run-time coefficients of the different parts of the //确定查询算法不同部分的运行时间 // query algorithm. Values that are computed and returned are // <lshPrecomp>, <uhashOver>, <distComp>. <lshPrecomp> is the time for // pre-computing one function from the LSH family. <uhashOver> is the // time for getting a bucket from a hash table (of buckets).<distComp> // is the time to compute one distance between two points. These times // are computed by constructing a R-NN DS on a sample data set and // running a sample query set on it. void determineRTCoefficients(RealT thresholdR, RealT successProbability, BooleanT useUfunctions, IntT typeHT, //建立hash表的类型 IntT dimension, Int32T nPoints, PPointT *realData, RealT &lshPrecomp, RealT &uhashOver, RealT &distComp){ // use a subset of the original data set. 使用原始数据集的一个子集 // there is not much theory behind the formula below. //减小运算规模 IntT n = nPoints / 50; //最多生成n各点,缩小50倍 if (n < 100) { //如果生成的点的个数小于100,则使桶的数量与数据集点的数量一样多 n = nPoints; } if (n > 10000) { n = 10000; } // Initialize the data set to use. PPointT *dataSet; FAILIF(NULL == (dataSet = (PPointT*)MALLOC(n * sizeof(PPointT)))); for(IntT i = 0; i < n; i++){ //从真实数据集中随机取n个点 (最多10000个) dataSet[i] = realData[genRandomInt(0, nPoints - 1)]; } IntT hashTableSize = n; //哈希表大小也初始化为n,是指hashTableSize放的点的个数,还是放的桶的个数? RNNParametersT algParameters; algParameters.parameterR = thresholdR; //半径 algParameters.successProbability = successProbability; algParameters.dimension = dimension; #ifdef USE_L1_DISTANCE algParameters.parameterR2 = thresholdR; //使用L1距离,R2=R #else algParameters.parameterR2 = SQR(thresholdR); //使用L2 R2=R^2 #endif algParameters.useUfunctions = useUfunctions; algParameters.parameterK = 16; //k 设定为16,只是测试,估算运算时间,可能是先随机设置一个时间,之后再在代码中改成16,因为16是bestK. algParameters.parameterW = PARAMETER_W_DEFAULT; //W=4,manuel中说经过多次测试,4是最好的值 algParameters.parameterT = n; //点的个数 algParameters.typeHT = typeHT; //桶的类型HT_HYBRID_CHAINS,在line405里面定义的。 if (algParameters.useUfunctions){ algParameters.parameterM = computeMForULSH(algParameters.parameterK, algParameters.successProbability); //经过改进的L和M algParameters.parameterL = algParameters.parameterM * (algParameters.parameterM - 1) / 2; }else{ algParameters.parameterM = computeLfromKP(algParameters.parameterK, algParameters.successProbability); //论文里面的M=L algParameters.parameterL = algParameters.parameterM; } // FAILIF(NULL == (dataSet = (PPointT*)MALLOC(n * sizeof(PPointT)))); // for(IntT i = 0; i < n; i++){ // FAILIF(NULL == (dataSet[i] = (PPointT)MALLOC(sizeof(PointT)))); // FAILIF(NULL == (dataSet[i]->coordinates = (RealT*)MALLOC(dimension * sizeof(RealT)))); // dataSet[i]->index = i; // sqrLength = 0; // for(IntT d = 0; d < dimension; d++){ // if (i == 0) { // dataSet[i]->coordinates[d] = genUniformRandom(-100, 100); // }else{ // dataSet[i]->coordinates[d] = dataSet[0]->coordinates[d]; // } // sqrLength += SQR(dataSet[i]->coordinates[d]); // } // dataSet[i]->sqrLength = sqrLength; // } // switch on timing BooleanT tempTimingOn = timingOn; //初始化为True timingOn = TRUE; // initialize result arrays PPointT *result = NULL; //结果集以及其初始化 IntT resultSize = 0; IntT nNNs; IntT nSucReps; do{ // create the test structure PRNearNeighborStructT nnStruct; switch(algParameters.typeHT){ case HT_LINKED_LIST: nnStruct = initLSH(algParameters, n); // add points to the test structure for(IntT i = 0; i < n; i++){ addNewPointToPRNearNeighborStruct(nnStruct, realData[i]); } break; case HT_HYBRID_CHAINS: nnStruct = initLSH_WithDataSet(algParameters, n, dataSet); //初始化数据结构,参数集,点的个数,数据集,对点进行映射转换,桶进行映射转换,点存入桶中 break; default: ASSERT(FALSE); } // query point PPointT queryPoint; // FAILIF(NULL == (queryPoint = (PPointT)MALLOC(sizeof(PointT)))); // FAILIF(NULL == (queryPoint->coordinates = (RealT*)MALLOC(dimension * sizeof(RealT)))); // RealT sqrLength = 0; // for(IntT i = 0; i < dimension; i++){ // queryPoint->coordinates[i] = dataSet[0]->coordinates[i]; // //queryPoint->coordinates[i] = 0.1; // sqrLength += SQR(queryPoint->coordinates[i]); // } //queryPoint->coordinates[0] = dataPoint->coordinates[0] + 0.0001; //queryPoint->sqrLength = sqrLength; // reset the R parameter so that there are no NN neighbors. setResultReporting(nnStruct, FALSE); //DPRINTF1("X\n"); lshPrecomp = 0; uhashOver = 0; distComp = 0; IntT nReps = 20; nSucReps = 0; for(IntT rep = 0; rep < nReps; rep++){ queryPoint = realData[genRandomInt(0, nPoints - 1)]; //查询点为数据集中随机抽取出来的一个点 timeComputeULSH = 0; timeGetBucket = 0; timeCycleBucket = 0; nOfDistComps = 0; //点与点比较的次数 //返回查找到的近邻点数,并将查询到的近邻点存入result中。 nNNs = getNearNeighborsFromPRNearNeighborStruct(nnStruct, queryPoint, result, resultSize); //DPRINTF("Time to compute LSH: %0.6lf\n", timeComputeULSH); //DPRINTF("Time to get bucket: %0.6lf\n", timeGetBucket); //DPRINTF("Time to cycle through buckets: %0.9lf\n", timeCycleBucket); //DPRINTF("N of dist comp: %d\n", nOfDistComps); ASSERT(nNNs == 0); //若一个点都没有找到,将发生中断。 if (nOfDistComps >= MIN(n / 10, 100)){ //与足够的点比较过,才将时间计入 nSucReps++; lshPrecomp += timeComputeULSH / algParameters.parameterK / algParameters.parameterM; //一个点对一个哈希函数的处理时间。共有k*L个哈希函数 uhashOver += timeGetBucket / algParameters.parameterL; //找到一个链表中桶的时间 distComp += timeCycleBucket / nOfDistComps; //遍历链表中桶,并与桶里面的点比较的时间 } } if (nSucReps >= 5){ lshPrecomp /= nSucReps; uhashOver /= nSucReps; distComp /= nSucReps; DPRINTF1("RT coeffs computed.\n"); }else{ algParameters.parameterR *= 2; // double the radius and repeat //比较的点数不够,将半径扩大,重复比较 DPRINTF1("Could not determine the RT coeffs. Repeating.\n"); } freePRNearNeighborStruct(nnStruct); }while(nSucReps < 5); //做一个有效值的判断,要获得5次有效值 FREE(dataSet); FREE(result); timingOn = tempTimingOn; }
/* The main entry to LSH package. Depending on the command line parameters, the function computes the R-NN data structure optimal parameters and/or construct the R-NN data structure and runs the queries on the data structure. */ int main_T(int nargs, char **args) { //先分析参数 /* 官方lsh文件:10个参数 1000 9 784 0.9 0.6 mnist1k.dts mnist1k.q bin/LSHMain $nDataSet $nQuerySet $dimension $successProbability "$1" "$2" "$3" $m -c*/ //算参数 bin/LSHMain 1000 9 784 0.9 "0.6" "mnist1k.dts" "mnist1k.q" 1002000000 -c //bin/LSHMain $nDataSet $nQuerySet $dimension $successProbability 1.0 "$1" "$2" $m -p "$3" //匹配 bin/LSHMain 1000 9 784 0.9 1.0 "mnist1k.dts" "mnist1k.q" 1002000000 -p "outputparma.txt" if(nargs < 9) { usage(args[0]); exit(1); } //initializeLSHGlobal(); // Parse part of the command-line parameters. nPoints = atoi(args[1]); IntT nQueries = atoi(args[2]); pointsDimension = atoi(args[3]); successProbability = atof(args[4]); char* endPtr[1]; RealT thresholdR = strtod(args[5], endPtr);//点相邻的距离阈值 //str-to -double 将字符串转换成浮点数的函数 //endPtr 接收数字结尾后非字符串字母 //这个r阈值是什么呢? if (thresholdR == 0 || endPtr[1] == args[5]) {//如果阈值为0,或者第一个字符就不是数字, //表示是用文件保存的 //这大概是用于测试哪个阈值好的 // The value for R is not specified, instead there is a file // specifying multiple R's. thresholdR = 0; // Read in the file FILE *radiiFile = fopen(args[5], "rt"); FAILIF(radiiFile == NULL); fscanf(radiiFile, "%d\n", &nRadii); ASSERT(nRadii > 0); FAILIF(NULL == (listOfRadii = (RealT*)MALLOC(nRadii * sizeof(RealT)))); FAILIF(NULL == (memRatiosForNNStructs = (RealT*)MALLOC(nRadii * sizeof(RealT)))); for(IntT i = 0; i < nRadii; i++) { FSCANF_REAL(radiiFile, &listOfRadii[i]); ASSERT(listOfRadii[i] > 0); FSCANF_REAL(radiiFile, &memRatiosForNNStructs[i]); ASSERT(memRatiosForNNStructs[i] > 0); } } else { nRadii = 1; FAILIF(NULL == (listOfRadii = (RealT*)MALLOC(nRadii * sizeof(RealT)))); FAILIF(NULL == (memRatiosForNNStructs = (RealT*)MALLOC(nRadii * sizeof(RealT)))); listOfRadii[0] = thresholdR; memRatiosForNNStructs[0] = 1; }//对阈值R 和Radiii的处理 DPRINTF("No. radii: %d\n", nRadii); //thresholdR = atof(args[5]); availableTotalMemory = atoll(args[8]);//$M表示的是内存空间大小 if (nPoints > MAX_N_POINTS) { printf("Error: the structure supports at most %d points (%d were specified).\n", MAX_N_POINTS, nPoints); fprintf(ERROR_OUTPUT, "Error: the structure supports at most %d points (%d were specified).\n", MAX_N_POINTS, nPoints); exit(1); } readDataSetFromFile(args[6]);//点读到dataSetPoints //这个totalAllocatedMemory初始化为0,但是 //#define MALLOC(amount) ((amount > 0) ? totalAllocatedMemory += amount, malloc(amount) : NULL) //这样,每次申请内存都会统计到了 DPRINTF("Allocated memory (after reading data set): %lld\n", totalAllocatedMemory); Int32T nSampleQueries = N_SAMPLE_QUERY_POINTS; PPointT sampleQueries[N_SAMPLE_QUERY_POINTS]; Int32T sampleQBoundaryIndeces[N_SAMPLE_QUERY_POINTS]; // PPointT sampleQueries[nSampleQueries]; // Int32T sampleQBoundaryIndeces[nSampleQueries]; if ((nargs <= 9) || (strcmp("-c", args[9]) == 0) ) { // In this cases, we need to generate a sample query set for // computing the optimal parameters. // Generate a sample query set. FILE *queryFile = fopen(args[7], "rt"); if (strcmp(args[7], ".") == 0 || queryFile == NULL || nQueries <= 0) {//没有查询文件,就用所有点产生随机点 // Choose several data set points for the sample query points. for(IntT i = 0; i < nSampleQueries; i++){ sampleQueries[i] = dataSetPoints[genRandomInt(0, nPoints - 1)]; } } else { //从查询文件中选取随机的点, // Choose several actual query points for the sample query points. nSampleQueries = MIN(nSampleQueries, nQueries); Int32T sampleIndeces[N_SAMPLE_QUERY_POINTS]; //Int32T sampleIndeces[nSampleQueries]; for(IntT i = 0; i < nSampleQueries; i++) { sampleIndeces[i] = genRandomInt(0, nQueries - 1); } qsort(sampleIndeces, nSampleQueries, sizeof(*sampleIndeces), compareInt32T); //printIntVector("sampleIndeces: ", nSampleQueries, sampleIndeces); Int32T j = 0; for(Int32T i = 0; i < nQueries; i++) { if (i == sampleIndeces[j]) { sampleQueries[j] = readPoint(queryFile); j++; while (i == sampleIndeces[j]) { sampleQueries[j] = sampleQueries[j - 1]; j++; } }else { fscanf(queryFile, "%[^\n]", sBuffer); fscanf(queryFile, "\n"); } } nSampleQueries = j; fclose(queryFile); } //前面那么多,好像就是在申请内存,读文件,读入参数 // Compute the array sampleQBoundaryIndeces that specifies how to // segregate the sample query points according to their distance // to NN. //采用遍历的方法,计算查询点的最近邻(并且距离小于listOfRadii【nRadii】) sortQueryPointsByRadii(pointsDimension, nSampleQueries, sampleQueries, nPoints, dataSetPoints, nRadii, listOfRadii, sampleQBoundaryIndeces); }//if ((nargs < 9) || (strcmp("-c", args[9]) == 0)) RNNParametersT *algParameters = NULL; PRNearNeighborStructT *nnStructs = NULL; if (nargs > 9) {/* 官方lsh文件:10个参数 bin/LSHMain $nDataSet $nQuerySet $dimension $successProbability "$1" "$2" "$3" $m -c */ // Additional command-line parameter is specified. if (strcmp("-c", args[9]) == 0) //-c表示参数优化 { // Only compute the R-NN DS parameters and output them to stdout. printf("%d\n", nRadii); transformMemRatios(); for(IntT i = 0; i < nRadii; i++) { // which sample queries to use Int32T segregatedQStart = (i == 0) ? 0 : sampleQBoundaryIndeces[i - 1]; Int32T segregatedQNumber = nSampleQueries - segregatedQStart; if (segregatedQNumber == 0) { // XXX: not the right answer segregatedQNumber = nSampleQueries; segregatedQStart = 0; } ASSERT(segregatedQStart < nSampleQueries); ASSERT(segregatedQStart >= 0); ASSERT(segregatedQStart + segregatedQNumber <= nSampleQueries); ASSERT(segregatedQNumber >= 0); //从文件读取点,然后计算优化后的参数 RNNParametersT optParameters = computeOptimalParameters(listOfRadii[i], successProbability, nPoints, pointsDimension, dataSetPoints, segregatedQNumber, sampleQueries + segregatedQStart, /*对内存的约束,就体现在这里, availableTotalMemory总共的内存(传入) - totalAllocatedMemory(使用mallloc分配的)*1=内存上限 然后(L * nPoints > memoryUpperBound / 12 来约束 */ (MemVarT)((availableTotalMemory - totalAllocatedMemory) * memRatiosForNNStructs[i])); printRNNParameters(stdout, optParameters); } exit(0); } else if (strcmp("-p", args[9]) == 0) {//-p表示从文件读入参数,然后建立结构体 // Read the R-NN DS parameters from the given file and run the // queries on the constructed data structure. if (nargs < 10) { usage(args[0]); exit(1); } FILE *pFile = fopen(args[10], "rt"); FAILIFWR(pFile == NULL, "Could not open the params file."); fscanf(pFile, "%d\n", &nRadii); DPRINTF1("Using the following R-NN DS parameters:\n"); DPRINTF("N radii = %d\n", nRadii); FAILIF(NULL == (nnStructs = (PRNearNeighborStructT*)MALLOC(nRadii * sizeof(PRNearNeighborStructT)))); FAILIF(NULL == (algParameters = (RNNParametersT*)MALLOC(nRadii * sizeof(RNNParametersT)))); for(IntT i = 0; i < nRadii; i++) {//默认i=1 algParameters[i] = readRNNParameters(pFile);//从文件读参数 printRNNParameters(stderr, algParameters[i]); nnStructs[i] = initLSH_WithDataSet(algParameters[i], nPoints, dataSetPoints); //核心 //初始化整个数据结构 包括整体+l个hash表 +点映射到桶 } pointsDimension = algParameters[0].dimension; FREE(listOfRadii); FAILIF(NULL == (listOfRadii = (RealT*)MALLOC(nRadii * sizeof(RealT)))); for(IntT i = 0; i < nRadii; i++) { listOfRadii[i] = algParameters[i].parameterR; } } else { // Wrong option. usage(args[0]); exit(1); } }//if (nargs > 9) else { FAILIF(NULL == (nnStructs = (PRNearNeighborStructT*)MALLOC(nRadii * sizeof(PRNearNeighborStructT)))); // Determine the R-NN DS parameters, construct the DS and run the queries. transformMemRatios(); for(IntT i = 0; i < nRadii; i++) { // XXX: segregate the sample queries... //建立查询结构,自动优化参数 nnStructs[i] = initSelfTunedRNearNeighborWithDataSet(listOfRadii[i], successProbability, nPoints, pointsDimension, dataSetPoints, nSampleQueries, sampleQueries, (MemVarT)((availableTotalMemory - totalAllocatedMemory) * memRatiosForNNStructs[i])); } } // if (nargs <= 9) //上面都是根据不同配置,对参数的优化,建立查询结构 DPRINTF1("X\n"); IntT resultSize = nPoints; PPointT *result = (PPointT*)MALLOC(resultSize * sizeof(*result)); PPointT queryPoint; FAILIF(NULL == (queryPoint = (PPointT)MALLOC(sizeof(PointT)))); FAILIF(NULL == (queryPoint->coordinates = (RealT*)MALLOC(pointsDimension * sizeof(RealT)))); //读取查询点的文件 FILE *queryFile = fopen(args[7], "rt"); FAILIF(queryFile == NULL); TimeVarT meanQueryTime = 0; PPointAndRealTStructT *distToNN = NULL; for(IntT i = 0; i < nQueries; i++) {//对于每一个要查询的点 RealT sqrLength = 0; // read in the query point. for(IntT d = 0; d < pointsDimension; d++) { FSCANF_REAL(queryFile, &(queryPoint->coordinates[d])); sqrLength += SQR(queryPoint->coordinates[d]); /*//test if (d >150 && d<160) { printf(" %lf ",queryPoint->coordinates[d]); } if ( d==160) { printf("原始的文件数据\n"); } */ } queryPoint->sqrLength = sqrLength; //printRealVector("Query: ", pointsDimension, queryPoint->coordinates); // get the near neighbors. IntT nNNs = 0; for(IntT r = 0; r < nRadii; r++) {//查询n个近邻点,并计算距离 //查询核心 nNNs = getRNearNeighbors(nnStructs[r], queryPoint, result, resultSize); printf("Total time for R-NN query at radius %0.6lf (radius no. %d):\t%0.6lf\n", (double)(listOfRadii[r]), r, timeRNNQuery); meanQueryTime += timeRNNQuery; if (nNNs > 0) { printf("Query point %d: found %d NNs at distance %0.6lf (%dth radius). First %d NNs are:\n", i, nNNs, (double)(listOfRadii[r]), r, MIN(nNNs, MAX_REPORTED_POINTS)); // compute the distances to the found NN, and sort according to the distance //计算近邻点和查询点的距离 FAILIF(NULL == (distToNN = (PPointAndRealTStructT*)REALLOC(distToNN, nNNs * sizeof(*distToNN)))); for(IntT p = 0; p < nNNs; p++) { distToNN[p].ppoint = result[p]; distToNN[p].real = distance(pointsDimension, queryPoint, result[p]); } qsort(distToNN, nNNs, sizeof(*distToNN), comparePPointAndRealTStructT); // Print the points for(IntT j = 0; j < MIN(nNNs, MAX_REPORTED_POINTS); j++) { ASSERT(distToNN[j].ppoint != NULL); printf("%09d\tDistance:%0.6lf\n", distToNN[j].ppoint->index, distToNN[j].real); CR_ASSERT(distToNN[j].real <= listOfRadii[r]); //DPRINTF("Distance: %lf\n", distance(pointsDimension, queryPoint, result[j])); //printRealVector("NN: ", pointsDimension, result[j]->coordinates); } break; } } if (nNNs == 0) { printf("Query point %d: no NNs found.\n", i); } }// for(IntT i = 0; i < nQueries; i++)每个点查询 // if (nQueries > 0) { meanQueryTime = meanQueryTime / nQueries; printf("Mean query time: %0.6lf\n", (double)meanQueryTime); } for(IntT i = 0; i < nRadii; i++) { freePRNearNeighborStruct(nnStructs[i]); } // XXX: should ideally free the other stuff as well. return 0; }
/* The main entry to LSH package. Depending on the command line parameters, the function computes the R-NN data structure optimal parameters and/or construct the R-NN data structure and runs the queries on the data structure. */ int main(int nargs, char **args){ if(nargs < 9){ usage(args[0]); exit(1); } //initializeLSHGlobal(); // Parse part of the command-line parameters. nPoints = atoi(args[1]); IntT nQueries = atoi(args[2]); pointsDimension = atoi(args[3]); successProbability = atof(args[4]); char* endPtr[1]; RealT thresholdR = strtod(args[5], endPtr); if (thresholdR == 0 || endPtr[1] == args[5]){ // The value for R is not specified, instead there is a file // specifying multiple R's. thresholdR = 0; // Read in the file FILE *radiiFile = fopen(args[5], "rt"); FAILIF(radiiFile == NULL); fscanf(radiiFile, "%d\n", &nRadii); ASSERT(nRadii > 0); FAILIF(NULL == (listOfRadii = (RealT*)MALLOC(nRadii * sizeof(RealT)))); FAILIF(NULL == (memRatiosForNNStructs = (RealT*)MALLOC(nRadii * sizeof(RealT)))); for(IntT i = 0; i < nRadii; i++){ FSCANF_REAL(radiiFile, &listOfRadii[i]); ASSERT(listOfRadii[i] > 0); FSCANF_REAL(radiiFile, &memRatiosForNNStructs[i]); ASSERT(memRatiosForNNStructs[i] > 0); } }else{ nRadii = 1; FAILIF(NULL == (listOfRadii = (RealT*)MALLOC(nRadii * sizeof(RealT)))); FAILIF(NULL == (memRatiosForNNStructs = (RealT*)MALLOC(nRadii * sizeof(RealT)))); listOfRadii[0] = thresholdR; memRatiosForNNStructs[0] = 1; } DPRINTF("No. radii: %d\n", nRadii); //thresholdR = atof(args[5]); availableTotalMemory = atoll(args[8]); if (nPoints > MAX_N_POINTS) { // 104w points printf("Error: the structure supports at most %d points (%d were specified).\n", MAX_N_POINTS, nPoints); fprintf(ERROR_OUTPUT, "Error: the structure supports at most %d points (%d were specified).\n", MAX_N_POINTS, nPoints); exit(1); } readDataSetFromFile(args[6]); // read points into data structure DPRINTF("Allocated memory (after reading data set): %lld\n", totalAllocatedMemory); Int32T nSampleQueries = N_SAMPLE_QUERY_POINTS; PPointT sampleQueries[nSampleQueries]; Int32T sampleQBoundaryIndeces[nSampleQueries]; if ((nargs < 9) || (strcmp("-c", args[9]) == 0)){ // In this cases, we need to generate a sample query set for // computing the optimal parameters. // Generate a sample query set. FILE *queryFile = fopen(args[7], "rt"); if (strcmp(args[7], ".") == 0 || queryFile == NULL || nQueries <= 0){ // Choose several data set points for the sample query points. for(IntT i = 0; i < nSampleQueries; i++){ sampleQueries[i] = dataSetPoints[genRandomInt(0, nPoints - 1)]; } }else{ // Choose several actual query points for the sample query points. nSampleQueries = MIN(nSampleQueries, nQueries); Int32T sampleIndeces[nSampleQueries]; for(IntT i = 0; i < nSampleQueries; i++){ sampleIndeces[i] = genRandomInt(0, nQueries - 1); } qsort(sampleIndeces, nSampleQueries, sizeof(*sampleIndeces), compareInt32T); //printIntVector("sampleIndeces: ", nSampleQueries, sampleIndeces); Int32T j = 0; for(Int32T i = 0; i < nQueries; i++){ if (i == sampleIndeces[j]){ sampleQueries[j] = readPoint(queryFile); j++; while (i == sampleIndeces[j]){ sampleQueries[j] = sampleQueries[j - 1]; j++; } }else{ fscanf(queryFile, "%[^\n]", sBuffer); fscanf(queryFile, "\n"); } } nSampleQueries = j; fclose(queryFile); } // Compute the array sampleQBoundaryIndeces that specifies how to // segregate the sample query points according to their distance // to NN. sortQueryPointsByRadii(pointsDimension, nSampleQueries, sampleQueries, nPoints, dataSetPoints, nRadii, listOfRadii, sampleQBoundaryIndeces); } RNNParametersT *algParameters = NULL; PRNearNeighborStructT *nnStructs = NULL; if (nargs > 9) { // Additional command-line parameter is specified. if (strcmp("-c", args[9]) == 0) { // Only compute the R-NN DS parameters and output them to stdout. printf("%d\n", nRadii); transformMemRatios(); for(IntT i = 0; i < nRadii; i++){ // which sample queries to use Int32T segregatedQStart = (i == 0) ? 0 : sampleQBoundaryIndeces[i - 1]; Int32T segregatedQNumber = nSampleQueries - segregatedQStart; if (segregatedQNumber == 0) { // XXX: not the right answer segregatedQNumber = nSampleQueries; segregatedQStart = 0; } ASSERT(segregatedQStart < nSampleQueries); ASSERT(segregatedQStart >= 0); ASSERT(segregatedQStart + segregatedQNumber <= nSampleQueries); ASSERT(segregatedQNumber >= 0); RNNParametersT optParameters = computeOptimalParameters(listOfRadii[i], successProbability, nPoints, pointsDimension, dataSetPoints, segregatedQNumber, sampleQueries + segregatedQStart, (MemVarT)((availableTotalMemory - totalAllocatedMemory) * memRatiosForNNStructs[i])); printRNNParameters(stdout, optParameters); } exit(0); } else if (strcmp("-p", args[9]) == 0) { // Read the R-NN DS parameters from the given file and run the // queries on the constructed data structure. if (nargs < 10){ usage(args[0]); exit(1); } FILE *pFile = fopen(args[10], "rt"); FAILIFWR(pFile == NULL, "Could not open the params file."); fscanf(pFile, "%d\n", &nRadii); DPRINTF1("Using the following R-NN DS parameters:\n"); DPRINTF("N radii = %d\n", nRadii); FAILIF(NULL == (nnStructs = (PRNearNeighborStructT*)MALLOC(nRadii * sizeof(PRNearNeighborStructT)))); FAILIF(NULL == (algParameters = (RNNParametersT*)MALLOC(nRadii * sizeof(RNNParametersT)))); for(IntT i = 0; i < nRadii; i++){ algParameters[i] = readRNNParameters(pFile); printRNNParameters(stderr, algParameters[i]); nnStructs[i] = initLSH_WithDataSet(algParameters[i], nPoints, dataSetPoints); } pointsDimension = algParameters[0].dimension; FREE(listOfRadii); FAILIF(NULL == (listOfRadii = (RealT*)MALLOC(nRadii * sizeof(RealT)))); for(IntT i = 0; i < nRadii; i++){ listOfRadii[i] = algParameters[i].parameterR; } } else{ // Wrong option. usage(args[0]); exit(1); } } else { FAILIF(NULL == (nnStructs = (PRNearNeighborStructT*)MALLOC(nRadii * sizeof(PRNearNeighborStructT)))); // Determine the R-NN DS parameters, construct the DS and run the queries. transformMemRatios(); for(IntT i = 0; i < nRadii; i++){ // XXX: segregate the sample queries... nnStructs[i] = initSelfTunedRNearNeighborWithDataSet(listOfRadii[i], successProbability, nPoints, pointsDimension, dataSetPoints, nSampleQueries, sampleQueries, (MemVarT)((availableTotalMemory - totalAllocatedMemory) * memRatiosForNNStructs[i])); } } DPRINTF1("X\n"); IntT resultSize = nPoints; PPointT *result = (PPointT*)MALLOC(resultSize * sizeof(*result)); PPointT queryPoint; FAILIF(NULL == (queryPoint = (PPointT)MALLOC(sizeof(PointT)))); FAILIF(NULL == (queryPoint->coordinates = (RealT*)MALLOC(pointsDimension * sizeof(RealT)))); FILE *queryFile = fopen(args[7], "rt"); FAILIF(queryFile == NULL); TimeVarT meanQueryTime = 0; PPointAndRealTStructT *distToNN = NULL; for(IntT i = 0; i < nQueries; i++){ RealT sqrLength = 0; // read in the query point. for(IntT d = 0; d < pointsDimension; d++){ FSCANF_REAL(queryFile, &(queryPoint->coordinates[d])); sqrLength += SQR(queryPoint->coordinates[d]); } queryPoint->sqrLength = sqrLength; //printRealVector("Query: ", pointsDimension, queryPoint->coordinates); // get the near neighbors. IntT nNNs = 0; for(IntT r = 0; r < nRadii; r++){ nNNs = getRNearNeighbors(nnStructs[r], queryPoint, result, resultSize); printf("Total time for R-NN query at radius %0.6lf (radius no. %d):\t%0.6lf\n", (double)(listOfRadii[r]), r, timeRNNQuery); meanQueryTime += timeRNNQuery; if (nNNs > 0){ printf("Query point %d: found %d NNs at distance %0.6lf (%dth radius). First %d NNs are:\n", i, nNNs, (double)(listOfRadii[r]), r, MIN(nNNs, MAX_REPORTED_POINTS)); // compute the distances to the found NN, and sort according to the distance FAILIF(NULL == (distToNN = (PPointAndRealTStructT*)REALLOC(distToNN, nNNs * sizeof(*distToNN)))); for(IntT p = 0; p < nNNs; p++){ distToNN[p].ppoint = result[p]; distToNN[p].real = distance(pointsDimension, queryPoint, result[p]); } qsort(distToNN, nNNs, sizeof(*distToNN), comparePPointAndRealTStructT); // Print the points for(IntT j = 0; j < MIN(nNNs, MAX_REPORTED_POINTS); j++){ ASSERT(distToNN[j].ppoint != NULL); printf("%09d\tDistance:%0.6lf\n", distToNN[j].ppoint->index, distToNN[j].real); CR_ASSERT(distToNN[j].real <= listOfRadii[r]); //DPRINTF("Distance: %lf\n", distance(pointsDimension, queryPoint, result[j])); //printRealVector("NN: ", pointsDimension, result[j]->coordinates); } break; } } if (nNNs == 0){ printf("Query point %d: no NNs found.\n", i); } } if (nQueries > 0){ meanQueryTime = meanQueryTime / nQueries; printf("Mean query time: %0.6lf\n", (double)meanQueryTime); } for(IntT i = 0; i < nRadii; i++){ freePRNearNeighborStruct(nnStructs[i]); } // XXX: should ideally free the other stuff as well. return 0; }
/* The main entry to LSH package. Depending on the command line parameters, the function computes the R-NN data structure optimal parameters and/or construct the R-NN data structure and runs the queries on the data structure. */ int main(int nargs, char **args){ if(nargs < 9){ usage(args[0]); exit(1); } //initializeLSHGlobal(); // Parse part of the command-line parameters. nPoints = atoi(args[1]); IntT nQueries = atoi(args[2]); pointsDimension = atoi(args[3]); successProbability = atof(args[4]); char* endPtr[1]; RealT thresholdR = strtod(args[5], endPtr); //strtod将字符串转换成浮点数 //r=0.6 //strtod()会扫描参数nptr字符串,跳过前面的空格字符,直到遇上数字或正负符号才开始做转换 //,到出现非数字或字符串结束时('')才结束转换, 并将结果返回。 //若endptr不为NULL,则会将遇到不合条件而终止的nptr中的字符指针由endptr传回。 if (thresholdR == 0 || endPtr[1] == args[5]){ //确保阈值合法 // The value for R is not specified, instead there is a file // specifying multiple R's. thresholdR = 0; // Read in the file FILE *radiiFile = fopen(args[5], "rt"); FAILIF(radiiFile == NULL); fscanf(radiiFile, "%d\n", &nRadii); ASSERT(nRadii > 0); FAILIF(NULL == (listOfRadii = (RealT*)MALLOC(nRadii * sizeof(RealT)))); FAILIF(NULL == (memRatiosForNNStructs = (RealT*)MALLOC(nRadii * sizeof(RealT)))); for(IntT i = 0; i < nRadii; i++){ FSCANF_REAL(radiiFile, &listOfRadii[i]); ASSERT(listOfRadii[i] > 0); FSCANF_REAL(radiiFile, &memRatiosForNNStructs[i]); ASSERT(memRatiosForNNStructs[i] > 0); } }else{ nRadii = 1; //半径的个数为1个 FAILIF(NULL == (listOfRadii = (RealT*)MALLOC(nRadii * sizeof(RealT)))); FAILIF(NULL == (memRatiosForNNStructs = (RealT*)MALLOC(nRadii * sizeof(RealT)))); listOfRadii[0] = thresholdR; memRatiosForNNStructs[0] = 1; } DPRINTF("No. radii: %d\n", nRadii); //thresholdR = atof(args[5]); availableTotalMemory = atoll(args[8]); if (nPoints > MAX_N_POINTS) { printf("Error: the structure supports at most %d points (%d were specified).\n", MAX_N_POINTS, nPoints); fprintf(ERROR_OUTPUT, "Error: the structure supports at most %d points (%d were specified).\n", MAX_N_POINTS, nPoints); exit(1); } readDataSetFromFile(args[6]); //数据集的文件名 DPRINTF("Allocated memory (after reading data set): %lld\n", totalAllocatedMemory); Int32T nSampleQueries = N_SAMPLE_QUERY_POINTS; //样本查询点的个数,100 PPointT sampleQueries[nSampleQueries]; //对查询点编号 Int32T sampleQBoundaryIndeces[nSampleQueries]; //第一个大于半径的点的编号,如果有多个半径的话,就会记录更多 if ((nargs < 9) || (strcmp("-c", args[9]) == 0)){ //计算最优参数 // In this cases, we need to generate a sample query set for // computing the optimal parameters. // Generate a sample query set. FILE *queryFile = fopen(args[7], "rt"); //打开查询集,以只读文本方式打开 if (strcmp(args[7], ".") == 0 || queryFile == NULL || nQueries <= 0){ // Choose several data set points for the sample query points. //如果没有查询点就随机选择几个数据集点作为查询点 for(IntT i = 0; i < nSampleQueries; i++){ sampleQueries[i] = dataSetPoints[genRandomInt(0, nPoints - 1)]; } }else{ // Choose several actual query points for the sample query points. nSampleQueries = MIN(nSampleQueries, nQueries); //MIN(100,9) Int32T sampleIndeces[nSampleQueries]; //定义了一个查询点样本索引数组 for(IntT i = 0; i < nSampleQueries; i++){ ////为什么要对查询点索引进行随机变化? 想把样本查询点控制在一定的范围内,如果查询点过多,则样本点最多取100个查询点。 sampleIndeces[i] = genRandomInt(0, nQueries - 1); //对查询点做了一下顺序的变化,对查询点的索引做随机处理。 } // 根据你给的比较条件进行快速排序,通过指针的移动实验排序,排序之后的结果仍然放在原数组中,必须自己写一个比较函数 //http://www.slyar.com/blog/stdlib-qsort.html qsort(数组起始地址,数组元素大小,每个元素的大小,函数指针指向比较函数) qsort(sampleIndeces, nSampleQueries, sizeof(*sampleIndeces), compareInt32T); //qsort,C语言标准库函数,对样本查询点的索引值进行排序 //printIntVector("sampleIndeces: ", nSampleQueries, sampleIndeces); Int32T j = 0; for(Int32T i = 0; i < nQueries; i++){ if (i == sampleIndeces[j]){ //如果样本查询点的索引值与实际查询点的索引值一致,读入点 sampleQueries[j] = readPoint(queryFile); j++; while (i == sampleIndeces[j]){ //如果样本查询点之后的索引值与实践查询点的索引值一致,则直接将此点的值赋给后面一点的值 sampleQueries[j] = sampleQueries[j - 1]; //覆盖之后索引点的值 j++; //取后面的点 } }else{ fscanf(queryFile, "%[^\n]", sBuffer); fscanf(queryFile, "\n"); } } nSampleQueries = j; fclose(queryFile); } // Compute the array sampleQBoundaryIndeces that specifies how to // segregate the sample query points according to their distance // to NN. //边界sampleQBoundaryIndeces只会存取一个点的索引,该点的大小为第一个大于半径点的值 sortQueryPointsByRadii(pointsDimension, nSampleQueries, //查询集的点的个数 sampleQueries, //查询点的集合,函数运行完成后,点的值将以距离数据集合的距离由小到大的顺序排序 nPoints, //数据集点的个数 dataSetPoints, //数据集集合 nRadii, //半径的个数 listOfRadii, //半径的值 sampleQBoundaryIndeces); } //之前的东西-c运行的,-p是不会运行的 RNNParametersT *algParameters = NULL; PRNearNeighborStructT *nnStructs = NULL; if (nargs > 9) { // Additional command-line parameter is specified. if (strcmp("-c", args[9]) == 0) { // Only compute the R-NN DS parameters and output them to stdout. // 如果是-c,就只计算数据集参数,然后输出 printf("%d\n", nRadii); //打印出半径的个数:1个。 将写入到参数文件中, transformMemRatios(); //memRatiosForNNstructs,转换内存使用率。假设每个结构为1,每个半径占用的总内存的比率,用于计算内存 for(IntT i = 0; i < nRadii; i++){ //看使用哪个样本查询点 // which sample queries to use Int32T segregatedQStart = (i == 0) ? 0 : sampleQBoundaryIndeces[i - 1]; //起始点的位置 Int32T segregatedQNumber = nSampleQueries - segregatedQStart; //查询点的个数 if (segregatedQNumber == 0) { //如果计算所得点的个数为0,就查询所有的点,从0到最后 // XXX: not the right answer segregatedQNumber = nSampleQueries; segregatedQStart = 0; } ASSERT(segregatedQStart < nSampleQueries); ASSERT(segregatedQStart >= 0); ASSERT(segregatedQStart + segregatedQNumber <= nSampleQueries); ASSERT(segregatedQNumber >= 0); RNNParametersT optParameters = computeOptimalParameters(listOfRadii[i], //计算最优的运行时间, successProbability, nPoints, pointsDimension, dataSetPoints, segregatedQNumber, sampleQueries + segregatedQStart, (MemVarT)((availableTotalMemory - totalAllocatedMemory) * memRatiosForNNStructs[i])); //比率 ////memRatioForNNStructs[i]:近邻结构体每个半径所占用的内存比率,计算能用多少内存 printRNNParameters(stdout, optParameters); //将参数打印出来 } exit(0); } else if (strcmp("-p", args[9]) == 0) { // Read the R-NN DS parameters from the given file and run the // queries on the constructed data structure. if (nargs < 10){ usage(args[0]); exit(1); } FILE *pFile = fopen(args[10], "rt"); //读取参数文件,由lsh_computeParas产生 FAILIFWR(pFile == NULL, "Could not open the params file."); fscanf(pFile, "%d\n", &nRadii); //这里只取了参数文件中的半径,那参数文件中的其他数据怎样被取用的?? DPRINTF1("Using the following R-NN DS parameters:\n"); //使用R-NN DS(DateSet)参数 DPRINTF("N radii = %d\n", nRadii); //不知道将数据输出到哪里了?? // printf("Using the following R-NN DS parameters:\n"); // printf("N radii=%d\n",nRadii); FAILIF(NULL == (nnStructs = (PRNearNeighborStructT*)MALLOC(nRadii * sizeof(PRNearNeighborStructT)))); FAILIF(NULL == (algParameters = (RNNParametersT*)MALLOC(nRadii * sizeof(RNNParametersT)))); for(IntT i = 0; i < nRadii; i++){ algParameters[i] = readRNNParameters(pFile); //将参数信息,输出到屏幕上 // printRNNParameters(stderr, algParameters[i]);@727 //printRNNParameters(stdout,algParameters[i]); nnStructs[i] = initLSH_WithDataSet(algParameters[i], nPoints, dataSetPoints); //根据用户输入的参数,初始化结构 } pointsDimension = algParameters[0].dimension; FREE(listOfRadii); FAILIF(NULL == (listOfRadii = (RealT*)MALLOC(nRadii * sizeof(RealT)))); for(IntT i = 0; i < nRadii; i++){ listOfRadii[i] = algParameters[i].parameterR; } } else{ // Wrong option. usage(args[0]); exit(1); } } else { FAILIF(NULL == (nnStructs = (PRNearNeighborStructT*)MALLOC(nRadii * sizeof(PRNearNeighborStructT)))); // Determine the R-NN DS parameters, construct the DS and run the queries. transformMemRatios(); for(IntT i = 0; i < nRadii; i++){ // XXX: segregate the sample queries... nnStructs[i] = initSelfTunedRNearNeighborWithDataSet(listOfRadii[i], successProbability, nPoints, pointsDimension, dataSetPoints, nSampleQueries, sampleQueries, (MemVarT)((availableTotalMemory - totalAllocatedMemory) * memRatiosForNNStructs[i])); } } // DPRINTF1("X\n");@ printf("X\n"); IntT resultSize = nPoints; PPointT *result = (PPointT*)MALLOC(resultSize * sizeof(*result)); PPointT queryPoint; FAILIF(NULL == (queryPoint = (PPointT)MALLOC(sizeof(PointT)))); FAILIF(NULL == (queryPoint->coordinates = (RealT*)MALLOC(pointsDimension * sizeof(RealT)))); FILE *queryFile = fopen(args[7], "rt"); FAILIF(queryFile == NULL); TimeVarT meanQueryTime = 0; PPointAndRealTStructT *distToNN = NULL; for(IntT i = 0; i < nQueries; i++){ RealT sqrLength = 0; // read in the query point. for(IntT d = 0; d < pointsDimension; d++){ FSCANF_REAL(queryFile, &(queryPoint->coordinates[d])); sqrLength += SQR(queryPoint->coordinates[d]); //向量到原点的距离 } queryPoint->sqrLength = sqrLength; //printRealVector("Query: ", pointsDimension, queryPoint->coordinates); // get the near neighbors. IntT nNNs = 0; for(IntT r = 0; r < nRadii; r++){ nNNs = getRNearNeighbors(nnStructs[r], queryPoint, result, resultSize); printf("Total time for R-NN query at radius %0.6lf (radius no. %d):\t%0.6lf\n", (double)(listOfRadii[r]), r, timeRNNQuery); meanQueryTime += timeRNNQuery; if (nNNs > 0){ printf("Query point %d: found %d NNs at distance %0.6lf (%dth radius). First %d NNs are:\n", i, nNNs, (double)(listOfRadii[r]), r, MIN(nNNs, MAX_REPORTED_POINTS)); // compute the distances to the found NN, and sort according to the distance FAILIF(NULL == (distToNN = (PPointAndRealTStructT*)REALLOC(distToNN, nNNs * sizeof(*distToNN)))); for(IntT p = 0; p < nNNs; p++){ distToNN[p].ppoint = result[p]; distToNN[p].real = distance(pointsDimension, queryPoint, result[p]); } qsort(distToNN, nNNs, sizeof(*distToNN), comparePPointAndRealTStructT); //C语言标准的函数 // Print the points for(IntT j = 0; j < MIN(nNNs, MAX_REPORTED_POINTS); j++){ ASSERT(distToNN[j].ppoint != NULL); printf("%09d\tDistance:%0.6lf\n", distToNN[j].ppoint->index, distToNN[j].real); //打印点的坐标 CR_ASSERT(distToNN[j].real <= listOfRadii[r]); //DPRINTF("Distance: %lf\n", distance(pointsDimension, queryPoint, result[j])); //printRealVector("NN: ", pointsDimension, result[j]->coordinates); } break; } } if (nNNs == 0){ printf("Query point %d: no NNs found.\n", i); } } if (nQueries > 0){ meanQueryTime = meanQueryTime / nQueries; printf("Mean query time: %0.6lf\n", (double)meanQueryTime); } for(IntT i = 0; i < nRadii; i++){ freePRNearNeighborStruct(nnStructs[i]); } // XXX: should ideally free the other stuff as well. return 0; }
// Determines the run-time coefficients of the different parts of the // query algorithm. Values that are computed and returned are // <lshPrecomp>, <uhashOver>, <distComp>. <lshPrecomp> is the time for // pre-computing one function from the LSH family. <uhashOver> is the // time for getting a bucket from a hash table (of buckets).<distComp> // is the time to compute one distance between two points. These times // are computed by constructing a R-NN DS on a sample data set and // running a sample query set on it. void determineRTCoefficients(RealT thresholdR, RealT successProbability, BooleanT useUfunctions, IntT typeHT, IntT dimension, Int32T nPoints, PPointT *realData, RealT &lshPrecomp, RealT &uhashOver, RealT &distComp){ // use a subset of the original data set. // there is not much theory behind the formula below. IntT n = nPoints / 50; if (n < 100) { n = nPoints; } if (n > 10000) { n = 10000; } // Initialize the data set to use. PPointT *dataSet; FAILIF(NULL == (dataSet = (PPointT*)MALLOC(n * sizeof(PPointT)))); for(IntT i = 0; i < n; i++){ dataSet[i] = realData[genRandomInt(0, nPoints - 1)]; } IntT hashTableSize = n; RNNParametersT algParameters; algParameters.parameterR = thresholdR; algParameters.successProbability = successProbability; algParameters.dimension = dimension; #ifdef USE_L1_DISTANCE algParameters.parameterR2 = thresholdR; #else algParameters.parameterR2 = SQR(thresholdR); #endif algParameters.useUfunctions = useUfunctions; algParameters.parameterK = 16; algParameters.parameterW = PARAMETER_W_DEFAULT; algParameters.parameterT = n; algParameters.typeHT = typeHT; if (algParameters.useUfunctions){ algParameters.parameterM = computeMForULSH(algParameters.parameterK, algParameters.successProbability); algParameters.parameterL = algParameters.parameterM * (algParameters.parameterM - 1) / 2; }else{ algParameters.parameterM = computeLfromKP(algParameters.parameterK, algParameters.successProbability); algParameters.parameterL = algParameters.parameterM; } // FAILIF(NULL == (dataSet = (PPointT*)MALLOC(n * sizeof(PPointT)))); // for(IntT i = 0; i < n; i++){ // FAILIF(NULL == (dataSet[i] = (PPointT)MALLOC(sizeof(PointT)))); // FAILIF(NULL == (dataSet[i]->coordinates = (RealT*)MALLOC(dimension * sizeof(RealT)))); // dataSet[i]->index = i; // sqrLength = 0; // for(IntT d = 0; d < dimension; d++){ // if (i == 0) { // dataSet[i]->coordinates[d] = genUniformRandom(-100, 100); // }else{ // dataSet[i]->coordinates[d] = dataSet[0]->coordinates[d]; // } // sqrLength += SQR(dataSet[i]->coordinates[d]); // } // dataSet[i]->sqrLength = sqrLength; // } // switch on timing BooleanT tempTimingOn = timingOn; timingOn = TRUE; // initialize result arrays PPointT *result = NULL; IntT resultSize = 0; IntT nNNs; IntT nSucReps; do{ // create the test structure PRNearNeighborStructT nnStruct; switch(algParameters.typeHT){ case HT_LINKED_LIST: nnStruct = initLSH(algParameters, n); // add points to the test structure for(IntT i = 0; i < n; i++){ addNewPointToPRNearNeighborStruct(nnStruct, realData[i]); } break; case HT_HYBRID_CHAINS: nnStruct = initLSH_WithDataSet(algParameters, n, dataSet); break; default: ASSERT(FALSE); } // query point PPointT queryPoint; // FAILIF(NULL == (queryPoint = (PPointT)MALLOC(sizeof(PointT)))); // FAILIF(NULL == (queryPoint->coordinates = (RealT*)MALLOC(dimension * sizeof(RealT)))); // RealT sqrLength = 0; // for(IntT i = 0; i < dimension; i++){ // queryPoint->coordinates[i] = dataSet[0]->coordinates[i]; // //queryPoint->coordinates[i] = 0.1; // sqrLength += SQR(queryPoint->coordinates[i]); // } //queryPoint->coordinates[0] = dataPoint->coordinates[0] + 0.0001; //queryPoint->sqrLength = sqrLength; // reset the R parameter so that there are no NN neighbors. setResultReporting(nnStruct, FALSE); //DPRINTF1("X\n"); lshPrecomp = 0; uhashOver = 0; distComp = 0; IntT nReps = 20; nSucReps = 0; for(IntT rep = 0; rep < nReps; rep++){ queryPoint = realData[genRandomInt(0, nPoints - 1)]; timeComputeULSH = 0; timeGetBucket = 0; timeCycleBucket = 0; nOfDistComps = 0; nNNs = getNearNeighborsFromPRNearNeighborStruct(nnStruct, queryPoint, result, resultSize); //DPRINTF("Time to compute LSH: %0.6lf\n", timeComputeULSH); //DPRINTF("Time to get bucket: %0.6lf\n", timeGetBucket); //DPRINTF("Time to cycle through buckets: %0.9lf\n", timeCycleBucket); //DPRINTF("N of dist comp: %d\n", nOfDistComps); ASSERT(nNNs == 0); if (nOfDistComps >= MIN(n / 10, 100)){ nSucReps++; lshPrecomp += timeComputeULSH / algParameters.parameterK / algParameters.parameterM; uhashOver += timeGetBucket / algParameters.parameterL; distComp += timeCycleBucket / nOfDistComps; } } if (nSucReps >= 5){ lshPrecomp /= nSucReps; uhashOver /= nSucReps; distComp /= nSucReps; DPRINTF1("RT coeffs computed.\n"); }else{ algParameters.parameterR *= 2; // double the radius and repeat DPRINTF1("Could not determine the RT coeffs. Repeating.\n"); } freePRNearNeighborStruct(nnStruct); }while(nSucReps < 5); FREE(dataSet); FREE(result); timingOn = tempTimingOn; }