예제 #1
0
파일: NearNeighbors.cpp 프로젝트: lvpei/v9
// Runs one R-near-neighbor query for <queryPoint> on <nnStruct> and
// prints profiling information through DPRINTF. <result> and
// <resultSize> are handed unchanged to
// getNearNeighborsFromPRNearNeighborStruct, whose return value is also
// the return value of this function.
Int32T getRNearNeighbors(PRNearNeighborStructT nnStruct, PPointT queryPoint, PPointT *(&result), Int32T &resultSize){
  // Cost estimates computed from lshPrecomp / uhashOver and the
  // structure's nHFTuples and hfTuplesLength fields.
  DPRINTF("Estimated ULSH comp: %0.6lf\n", lshPrecomp * nnStruct->nHFTuples * nnStruct->hfTuplesLength);
  DPRINTF("Estimated UH overhead: %0.6lf\n", uhashOver * nnStruct->nHFTuples);

  // Clear the distance-computation counter and every timer before the
  // query is issued.
  nOfDistComps = 0;

  timeRNNQuery = 0;
  timeTotalBuckets = 0;

  timeComputeULSH = 0;
  timePrecomputeHash = 0;

  timeGetBucket = 0;
  timeGBHash = 0;
  timeChainTraversal = 0;

  timeCycleBucket = 0;
  timeCycleProc = 0;
  timeDistanceComputation = 0;
  timeResultStoring = 0;

  // The query itself, bracketed by the overall query timer.
  TIMEV_START(timeRNNQuery);
  noExpensiveTiming = !DEBUG_PROFILE_TIMING;
  Int32T nFound = getNearNeighborsFromPRNearNeighborStruct(nnStruct, queryPoint, result, resultSize);
  TIMEV_END(timeRNNQuery);

  // Per-phase timings.
  DPRINTF("Time to compute LSH: %0.6lf\n", timeComputeULSH);
  DPRINTF("Time to get bucket: %0.6lf\n", timeGetBucket);
  DPRINTF("Time to cycle through buckets: %0.6lf\n", timeCycleBucket);
  DPRINTF("Time to for processing buckets (UH+examining points): %0.6lf\n", timeTotalBuckets);
  DPRINTF("Time for distance comps: %0.6lf\n", timeDistanceComputation);
  DPRINTF("Time to store result: %0.6lf\n", timeResultStoring);

  // Counters, followed by the total time of the query.
  DPRINTF("Number of dist comps: %d\n", nOfDistComps);
  DPRINTF("Number buckets in chains: %d\n", nBucketsInChains);
  DPRINTF("Number buckets in chains / L: %0.3lf\n", (double)nBucketsInChains / nnStruct->nHFTuples);
  DPRINTF("Cumulative time for R-NN query: %0.6lf\n", timeRNNQuery);

  return nFound;
}
예제 #2
0
// Determines the run-time coefficients of the different parts of the
// query algorithm. Values that are computed and returned are
// <lshPrecomp>, <uhashOver>, <distComp>. <lshPrecomp> is the time for
// pre-computing one function from the LSH family. <uhashOver> is the
// time for getting a bucket from a hash table (of buckets). <distComp>
// is the time to compute one distance between two points. These times
// are computed by constructing a R-NN DS on a sample data set and
// running a sample query set on it.
//
// Inputs:
//   thresholdR          initial radius R; doubled whenever a round of
//                       queries does not produce enough measurements.
//   successProbability  copied into the algorithm parameters and used
//                       to derive M and L.
//   useUfunctions       selects how M and L are derived (see below).
//   typeHT              hash table type; HT_LINKED_LIST and
//                       HT_HYBRID_CHAINS are handled here.
//   dimension           copied into the algorithm parameters.
//   nPoints, realData   the real data set; must hold at least one point.
// Outputs (by reference): lshPrecomp, uhashOver, distComp.
//
// The global <timingOn> is forced to TRUE for the duration of the call
// and restored before returning.
void determineRTCoefficients(RealT thresholdR,
                             RealT successProbability,
                             BooleanT useUfunctions,
                             IntT typeHT, // type of hash table to build
                             IntT dimension,
                             Int32T nPoints,
                             PPointT *realData,
                             RealT &lshPrecomp,
                             RealT &uhashOver,
                             RealT &distComp){

  // Points are picked below with genRandomInt(0, nPoints - 1), which
  // needs a non-empty data set.
  ASSERT(nPoints > 0);

  // use a subset of the original data set.
  // there is not much theory behind the formula below.
  IntT n = nPoints / 50;
  if (n < 100) {
    // too few points after shrinking by 50: use as many as the data set has.
    n = nPoints;
  }
  if (n > 10000) {
    n = 10000;
  }

  // Initialize the data set to use: n points picked at random from the
  // real data set (the same point may be picked more than once).
  PPointT *dataSet;
  FAILIF(NULL == (dataSet = (PPointT*)MALLOC(n * sizeof(PPointT))));
  for(IntT i = 0; i < n; i++){
    dataSet[i] = realData[genRandomInt(0, nPoints - 1)];
  }

  RNNParametersT algParameters;
  algParameters.parameterR = thresholdR;
  algParameters.successProbability = successProbability;
  algParameters.dimension = dimension;
#ifdef USE_L1_DISTANCE
  algParameters.parameterR2 = thresholdR;       // L1 distance: R2 = R
#else
  algParameters.parameterR2 = SQR(thresholdR);  // otherwise: R2 = R^2
#endif
  algParameters.useUfunctions = useUfunctions;
  algParameters.parameterK = 16;   // fixed K for this calibration run
  algParameters.parameterW = PARAMETER_W_DEFAULT;
  algParameters.parameterT = n;
  algParameters.typeHT = typeHT;

  if (algParameters.useUfunctions){
    algParameters.parameterM = computeMForULSH(algParameters.parameterK, algParameters.successProbability);
    algParameters.parameterL = algParameters.parameterM * (algParameters.parameterM - 1) / 2;
  }else{
    algParameters.parameterM = computeLfromKP(algParameters.parameterK, algParameters.successProbability);
    algParameters.parameterL = algParameters.parameterM;
  }

  // switch on timing; the previous setting is restored on exit.
  BooleanT tempTimingOn = timingOn;
  timingOn = TRUE;

  // initialize result arrays
  PPointT *result = NULL;
  IntT resultSize = 0;
  IntT nSucReps;

  do{
    // create the test structure
    // (NULL-initialized so that an unsupported typeHT never leaves an
    // indeterminate pointer behind when ASSERT is compiled out.)
    PRNearNeighborStructT nnStruct = NULL;
    switch(algParameters.typeHT){
    case HT_LINKED_LIST:
      nnStruct = initLSH(algParameters, n);
      // add points to the test structure
      // NOTE(review): this inserts the first n points of realData, not
      // the random sample in dataSet that the HT_HYBRID_CHAINS branch
      // uses -- confirm whether that is intended.
      for(IntT i = 0; i < n; i++){
        addNewPointToPRNearNeighborStruct(nnStruct, realData[i]);
      }
      break;
    case HT_HYBRID_CHAINS:
      nnStruct = initLSH_WithDataSet(algParameters, n, dataSet);
      break;
    default:
      ASSERT(FALSE);
    }

    // Switch result reporting off for the test structure; the queries
    // below are asserted to return no neighbors.
    setResultReporting(nnStruct, FALSE);

    lshPrecomp = 0;
    uhashOver = 0;
    distComp = 0;
    IntT nReps = 20;
    nSucReps = 0;
    for(IntT rep = 0; rep < nReps; rep++){
      // the query point is a random point of the real data set.
      PPointT queryPoint = realData[genRandomInt(0, nPoints - 1)];
      timeComputeULSH = 0;
      timeGetBucket = 0;
      timeCycleBucket = 0;
      nOfDistComps = 0;   // number of point-to-point distance computations
      IntT nNNs = getNearNeighborsFromPRNearNeighborStruct(nnStruct, queryPoint, result, resultSize);

      // no neighbor is expected to be reported.
      ASSERT(nNNs == 0);
      // A repetition counts only if enough distances were computed.
      // For n < 10 the threshold MIN(n / 10, 100) is 0, so additionally
      // require at least one distance computation: otherwise the
      // division by nOfDistComps below would divide by zero.
      if (nOfDistComps > 0 && nOfDistComps >= MIN(n / 10, 100)){
        nSucReps++;
        lshPrecomp += timeComputeULSH / algParameters.parameterK / algParameters.parameterM;
        uhashOver += timeGetBucket / algParameters.parameterL;
        distComp += timeCycleBucket / nOfDistComps;
      }
    }

    if (nSucReps >= 5){
      // enough successful repetitions: average the accumulated times.
      lshPrecomp /= nSucReps;
      uhashOver /= nSucReps;
      distComp /= nSucReps;
      DPRINTF1("RT coeffs computed.\n");
    }else{
      // NOTE(review): parameterR2 keeps the value derived from the
      // original thresholdR -- confirm that this is intended.
      algParameters.parameterR *= 2; // double the radius and repeat
      DPRINTF1("Could not determine the RT coeffs. Repeating.\n");
    }

    freePRNearNeighborStruct(nnStruct);

  }while(nSucReps < 5);   // repeat until 5 successful repetitions were obtained

  FREE(dataSet);
  FREE(result);

  timingOn = tempTimingOn;
}
예제 #3
0
// Determines the run-time coefficients of the different parts of the
// query algorithm. Values that are computed and returned are
// <lshPrecomp>, <uhashOver>, <distComp>. <lshPrecomp> is the time for
// pre-computing one function from the LSH family. <uhashOver> is the
// time for getting a bucket from a hash table (of buckets). <distComp>
// is the time to compute one distance between two points. These times
// are computed by constructing a R-NN DS on a sample data set and
// running a sample query set on it.
//
// Inputs:
//   thresholdR          initial radius R; doubled whenever a round of
//                       queries does not produce enough measurements.
//   successProbability  copied into the algorithm parameters and used
//                       to derive M and L.
//   useUfunctions       selects how M and L are derived (see below).
//   typeHT              hash table type; HT_LINKED_LIST and
//                       HT_HYBRID_CHAINS are handled here.
//   dimension           copied into the algorithm parameters.
//   nPoints, realData   the real data set; must hold at least one point.
// Outputs (by reference): lshPrecomp, uhashOver, distComp.
//
// The global <timingOn> is forced to TRUE for the duration of the call
// and restored before returning.
void determineRTCoefficients(RealT thresholdR,
                             RealT successProbability,
                             BooleanT useUfunctions,
                             IntT typeHT,
                             IntT dimension,
                             Int32T nPoints,
                             PPointT *realData,
                             RealT &lshPrecomp,
                             RealT &uhashOver,
                             RealT &distComp){

  // Points are picked below with genRandomInt(0, nPoints - 1), which
  // needs a non-empty data set.
  ASSERT(nPoints > 0);

  // use a subset of the original data set.
  // there is not much theory behind the formula below.
  IntT n = nPoints / 50;
  if (n < 100) {
    // too few points after shrinking by 50: use as many as the data set has.
    n = nPoints;
  }
  if (n > 10000) {
    n = 10000;
  }

  // Initialize the data set to use: n points picked at random from the
  // real data set (the same point may be picked more than once).
  PPointT *dataSet;
  FAILIF(NULL == (dataSet = (PPointT*)MALLOC(n * sizeof(PPointT))));
  for(IntT i = 0; i < n; i++){
    dataSet[i] = realData[genRandomInt(0, nPoints - 1)];
  }

  RNNParametersT algParameters;
  algParameters.parameterR = thresholdR;
  algParameters.successProbability = successProbability;
  algParameters.dimension = dimension;
#ifdef USE_L1_DISTANCE
  algParameters.parameterR2 = thresholdR;       // L1 distance: R2 = R
#else
  algParameters.parameterR2 = SQR(thresholdR);  // otherwise: R2 = R^2
#endif
  algParameters.useUfunctions = useUfunctions;
  algParameters.parameterK = 16;   // fixed K for this calibration run
  algParameters.parameterW = PARAMETER_W_DEFAULT;
  algParameters.parameterT = n;
  algParameters.typeHT = typeHT;

  if (algParameters.useUfunctions){
    algParameters.parameterM = computeMForULSH(algParameters.parameterK, algParameters.successProbability);
    algParameters.parameterL = algParameters.parameterM * (algParameters.parameterM - 1) / 2;
  }else{
    algParameters.parameterM = computeLfromKP(algParameters.parameterK, algParameters.successProbability);
    algParameters.parameterL = algParameters.parameterM;
  }

  // switch on timing; the previous setting is restored on exit.
  BooleanT tempTimingOn = timingOn;
  timingOn = TRUE;

  // initialize result arrays
  PPointT *result = NULL;
  IntT resultSize = 0;
  IntT nSucReps;

  do{
    // create the test structure
    // (NULL-initialized so that an unsupported typeHT never leaves an
    // indeterminate pointer behind when ASSERT is compiled out.)
    PRNearNeighborStructT nnStruct = NULL;
    switch(algParameters.typeHT){
    case HT_LINKED_LIST:
      nnStruct = initLSH(algParameters, n);
      // add points to the test structure
      // NOTE(review): this inserts the first n points of realData, not
      // the random sample in dataSet that the HT_HYBRID_CHAINS branch
      // uses -- confirm whether that is intended.
      for(IntT i = 0; i < n; i++){
        addNewPointToPRNearNeighborStruct(nnStruct, realData[i]);
      }
      break;
    case HT_HYBRID_CHAINS:
      nnStruct = initLSH_WithDataSet(algParameters, n, dataSet);
      break;
    default:
      ASSERT(FALSE);
    }

    // Switch result reporting off for the test structure; the queries
    // below are asserted to return no neighbors.
    setResultReporting(nnStruct, FALSE);

    lshPrecomp = 0;
    uhashOver = 0;
    distComp = 0;
    IntT nReps = 20;
    nSucReps = 0;
    for(IntT rep = 0; rep < nReps; rep++){
      // the query point is a random point of the real data set.
      PPointT queryPoint = realData[genRandomInt(0, nPoints - 1)];
      timeComputeULSH = 0;
      timeGetBucket = 0;
      timeCycleBucket = 0;
      nOfDistComps = 0;   // number of point-to-point distance computations
      IntT nNNs = getNearNeighborsFromPRNearNeighborStruct(nnStruct, queryPoint, result, resultSize);

      // no neighbor is expected to be reported.
      ASSERT(nNNs == 0);
      // A repetition counts only if enough distances were computed.
      // For n < 10 the threshold MIN(n / 10, 100) is 0, so additionally
      // require at least one distance computation: otherwise the
      // division by nOfDistComps below would divide by zero.
      if (nOfDistComps > 0 && nOfDistComps >= MIN(n / 10, 100)){
        nSucReps++;
        lshPrecomp += timeComputeULSH / algParameters.parameterK / algParameters.parameterM;
        uhashOver += timeGetBucket / algParameters.parameterL;
        distComp += timeCycleBucket / nOfDistComps;
      }
    }

    if (nSucReps >= 5){
      // enough successful repetitions: average the accumulated times.
      lshPrecomp /= nSucReps;
      uhashOver /= nSucReps;
      distComp /= nSucReps;
      DPRINTF1("RT coeffs computed.\n");
    }else{
      // NOTE(review): parameterR2 keeps the value derived from the
      // original thresholdR -- confirm that this is intended.
      algParameters.parameterR *= 2; // double the radius and repeat
      DPRINTF1("Could not determine the RT coeffs. Repeating.\n");
    }

    freePRNearNeighborStruct(nnStruct);

  }while(nSucReps < 5);   // repeat until 5 successful repetitions were obtained

  FREE(dataSet);
  FREE(result);

  timingOn = tempTimingOn;
}