Exemplo n.º 1
  The main entry point of the LSH package. Depending on the command-line
  parameters, the function computes the optimal parameters of the R-NN
  data structure and/or constructs the R-NN data structure and runs the
  queries on it.
// Entry point of the clone-detection driver, as far as this excerpt shows:
//   1. compiles one regular expression per point property,
//   2. parses the command-line options with getopt(),
//   3. optionally computes the R-NN parameters and writes them out,
//   4. queries the structure with every not-yet-seen data point and prints
//      the resulting clusters, followed by summary statistics.
// Returns 0 on the path that reaches the end of the function.
//
// NOTE(review): this listing appears to be an excerpt. Several closing
// braces and `break` statements are not visible, and neither is the code
// that loads the data set or builds nnStructs. Confirm control flow against
// the complete source before relying on the notes below.
int main(int argc, char *argv[]){

  // One extended regex per point property; each pattern has a single capture
  // group holding the property's value. For VARs/OIDs the captured number is
  // the one that follows the closing brace of the {...} list.
  FAILIF(0 != regcomp(&preg[ENUM_PPROP_FILE], "FILE:([^,]+)", REG_EXTENDED));
  FAILIF(0 != regcomp(&preg[ENUM_PPROP_LINE], "LINE:([0-9]+)", REG_EXTENDED));
  FAILIF(0 != regcomp(&preg[ENUM_PPROP_OFFSET], "OFFSET:([0-9]+)", REG_EXTENDED));
  FAILIF(0 != regcomp(&preg[ENUM_PPROP_NODE_KIND], "NODE_KIND:([0-9]+)", REG_EXTENDED));
  FAILIF(0 != regcomp(&preg[ENUM_PPROP_NUM_NODE], "NUM_NODE:([0-9]+)", REG_EXTENDED));
  FAILIF(0 != regcomp(&preg[ENUM_PPROP_NUM_DECL], "NUM_DECL:([0-9]+)", REG_EXTENDED));
  FAILIF(0 != regcomp(&preg[ENUM_PPROP_NUM_STMT], "NUM_STMT:([0-9]+)", REG_EXTENDED));
  FAILIF(0 != regcomp(&preg[ENUM_PPROP_NUM_EXPR], "NUM_EXPR:([0-9]+)", REG_EXTENDED));
  FAILIF(0 != regcomp(&preg[ENUM_PPROP_TBID], "TBID:([-]?[0-9]+)", REG_EXTENDED));
  FAILIF(0 != regcomp(&preg[ENUM_PPROP_TEID], "TEID:([-]?[0-9]+)", REG_EXTENDED));
  FAILIF(0 != regcomp(&preg[ENUM_PPROP_nVARs], "VARs:\\{[^}]*\\}([0-9]+)", REG_EXTENDED));
  FAILIF(0 != regcomp(&preg[ENUM_PPROP_OIDs], "OIDs:\\{[^}]*\\}([0-9]+)", REG_EXTENDED)); // TODO, pair-wise comparison of Vars.

  // Default memory budget; overridden by -M below.
  availableTotalMemory = 800000000;

  // Parse part of the command-line parameters.
  bool computeParameters = false;
  char *paramsFile = NULL;

  // Parameters for filtering:
  bool no_filtering = false, bug_detecting = true;
  int upperBound = 0, lowerBound = 2;
  int minNumNodes = 0, min_nVars = 0;
  int max_num_diff_vars = 16;
  float max_num_diff_nVars_diff = 0.5, max_nVars_diff = 0.35;
  bool interfiles = false;
  int min_lines = 0;

  for (int opt; (opt = getopt(argc, argv, "ABl:v:V:e:E:a:m:N:d:p:P:R:M:cFf:b:t:")) != -1; ) {
    // Needed: -p -f -R
    switch (opt) {
    case 'A': 
      fprintf(stderr, "Warning: output all clones. Takes more time...\n");
      no_filtering = true; break;
    case 'B':
      fprintf(stderr, "Warning: no filtering for bugs now.\n");
      bug_detecting = false; break;
    // Numeric options are converted with atoi/atol/atof, which do not report
    // malformed input.
    case 'l': min_lines = atoi(optarg); break;
    case 'v': min_nVars = atoi(optarg); break;
    case 'V': max_num_diff_vars = atoi(optarg); break;
    case 'e': max_num_diff_nVars_diff = atof(optarg); break;
    case 'E': max_nVars_diff = atof(optarg); break;
    case 'm': minNumNodes = atoi(optarg); break;
    case 'b': lowerBound = atoi(optarg); break;
    case 't': upperBound = atoi(optarg); break;
    case 'N': nPoints = atol(optarg); break;
    case 'd': pointsDimension = atol(optarg); break;
    case 'p': paramsFile = optarg; break;
    case 'P': successProbability = atof(optarg); break;
    case 'M': availableTotalMemory = atol(optarg); break;
    case 'a': prefetch = atol(optarg); break;
    case 'c':
      fprintf(stderr, "Warning: will compute parameters\n");
      computeParameters = true;
      // NOTE(review): no `break` is visible here, so as listed -c falls
      // through into -F and also enables inter-file mode. Confirm whether a
      // `break` exists in the full source.
    case 'F':
      fprintf(stderr, "Warning: inter-file clone detection. Takes more time...\n");
      interfiles = true; break;
    case 'R':
      // A single radius is supported: one-element arrays for the radius and
      // for its memory ratio (fixed to 1).
      nRadii = 1;
      FAILIF(NULL == (listOfRadii = (RealT*)MALLOC(nRadii * sizeof(RealT))));
      FAILIF(NULL == (memRatiosForNNStructs = (RealT*)MALLOC(nRadii * sizeof(RealT))));
      listOfRadii[0] = strtod(optarg, NULL);
      memRatiosForNNStructs[0] = 1;
      // NOTE(review): the end of case 'R', the body of case 'f', the default
      // label and the closing braces of the switch and the loop are not
      // visible in this excerpt.
    case 'f':
      DPRINTF("Allocated memory (after reading data set): %ld\n", totalAllocatedMemory);
      fprintf(stderr, "Unknown option: -%c\n", opt);
      usage(1, argv[0]);
  // Any non-option argument left after getopt() is treated as a usage error.
  if (optind < argc) {
    fprintf(stderr, "There are unprocessed parameters left\n");
    usage(1, argv[0]);


  // The same message is printed to stdout and to ERROR_OUTPUT.
  if (nPoints > MAX_N_POINTS) {
    printf("Error: the structure supports at most %ld points (%ld were specified).\n", MAX_N_POINTS, nPoints);
    fprintf(ERROR_OUTPUT, "Error: the structure supports at most %ld points (%ld were specified).\n", MAX_N_POINTS, nPoints);

  // Without -c, the return value of readParamsFile() decides whether the
  // parameters still have to be computed (presumably it returns false when
  // the file was read successfully -- compare the ASSERT further down).
  if (computeParameters == false)
    computeParameters = readParamsFile(paramsFile);

  if (computeParameters) {
    IntT nSampleQueries = N_SAMPLE_QUERY_POINTS;
    PPointT sampleQueries[nSampleQueries];
    IntT sampleQBoundaryIndeces[nSampleQueries];
    // Choose several data set points for the sample query points.
    for(IntT i = 0; i < nSampleQueries; i++){
      sampleQueries[i] = dataSetPoints[genRandomInt(0, nPoints - 1)];

    // Compute the array sampleQBoundaryIndeces that specifies how to
    // segregate the sample query points according to their distance
    // to NN.
    // NOTE(review): the code that fills sampleQBoundaryIndeces is not visible
    // in this excerpt, although the array is read below.

    // Compute the R-NN DS parameters
    // if a parameter file is given, output them to that file, and continue
    // otherwise, output them to stdout, and exit

    FILE *fd;
    if (paramsFile == NULL) {
      fd = stdout;
    } else {
      fd = fopen(paramsFile, "wt");
      if (fd == NULL) {
	fprintf(stderr, "Unable to write to parameter file %s\n", paramsFile);

    // Output format: the number of radii first, then one parameter set per
    // radius.
    fprintf(fd, "%ld\n", nRadii);
    for(IntT i = 0; i < nRadii; i++) {
      // which sample queries to use
      IntT segregatedQStart = (i == 0) ? 0 : sampleQBoundaryIndeces[i - 1];
      IntT segregatedQNumber = nSampleQueries - segregatedQStart;
      if (segregatedQNumber == 0) {
	// XXX: not the right answer
	segregatedQNumber = nSampleQueries;
	segregatedQStart = 0;
      ASSERT(segregatedQStart < nSampleQueries);
      ASSERT(segregatedQStart >= 0);
      ASSERT(segregatedQStart + segregatedQNumber <= nSampleQueries);
      ASSERT(segregatedQNumber >= 0);
      // The memory offered to this radius is the still-unallocated part of
      // the budget scaled by the radius' memory ratio.
      RNNParametersT optParameters = computeOptimalParameters(listOfRadii[i],
							      sampleQueries + segregatedQStart,
							      (UnsT)((availableTotalMemory - totalAllocatedMemory) * memRatiosForNNStructs[i]));
      printRNNParameters(fd, optParameters);
    if (fd == stdout) {
    } else {
      // Re-read the file that was just written; the read is expected to
      // return false.
      ASSERT(readParamsFile(paramsFile) == false);

  // output vector clusters according to the filtering parameters.
  printf("========================= Structure built =========================\n");
  printf("nPoints = %ld, Dimension = %ld\n", nPoints, pointsDimension);
  printf("no_filtering (0/1) = %d, inter-file (0/1) = %d, prefetch = %ld\n", no_filtering, interfiles, prefetch);
  printf("*** Filtering Parameters for individual vectors ***\n");
  printf("minNumNodes = %d, min_nVars = %d, min_lines = %d\n", minNumNodes, min_nVars, min_lines);
  printf("*** Filtering Parameters for clusters ***\n");
  printf("lowerBound = %d, upperBound = %d\n", lowerBound, upperBound);
  printf("Max num of different nVars = %d, Max diff among different nVars = %g, \nMax diff among the num of different nVars = %g\n", max_num_diff_vars, max_nVars_diff, max_num_diff_nVars_diff);

  // The result buffer can hold every point of the data set.
  // NOTE(review): unlike the allocations next to it, this MALLOC is not
  // wrapped in FAILIF.
  IntT resultSize = nPoints;
  PPointT *result = (PPointT*)MALLOC(resultSize * sizeof(*result));
  PPointT queryPoint;
  // NOTE(review): queryPoint is re-pointed at dataSetPoints[i] inside the
  // loop below, so the block allocated here is no longer referenced through
  // queryPoint afterwards.
  FAILIF(NULL == (queryPoint = (PPointT)MALLOC(sizeof(PointT))));
  FAILIF(NULL == (queryPoint->coordinates = (RealT*)MALLOC(pointsDimension * sizeof(RealT))));

  TimeVarT meanQueryTime = 0;
  IntT nQueries = 0;
  // seen[k] records that the point with index k has already been reported or
  // skipped. NOTE(review): this is a variable-length array sized by nPoints;
  // consider whether large inputs make that a problem.
  bool seen[nPoints];
  // NOTE(review): no update of these two counters is visible in this excerpt
  // before they are printed in the statistics at the end.
  IntT nBuckets = 0, nBucketedPoints = 0;

  memset(seen, 0, nPoints * sizeof(bool));
  // Each iteration issues one query, so nQueries advances together with i.
  for(IntT i = 0; i < nPoints; nQueries++, i++) {

    // find the next unseen point
    while (i < nPoints && seen[i]) i++;
    if (i >= nPoints) break;
    queryPoint = dataSetPoints[i];

    // get the near neighbors.
    IntT nNNs = 0;
    for(IntT r = 0; r < nRadii; r++) { // nRadii is always 1 so far.
      nNNs = getRNearNeighbors(nnStructs[r], queryPoint, result, resultSize);
      //printf("Total time for R-NN query at radius %0.6lf (radius no. %ld):\t%0.6lf\n", (double)(listOfRadii[r]), r, timeRNNQuery);
      // timeRNNQuery is presumably updated by the query call above -- confirm.
      meanQueryTime += timeRNNQuery;

      //printf("\nQuery point %ld: found %ld NNs at distance %0.6lf (radius no. %ld). NNs are:\n",
      //       i, nNNs, (double)(listOfRadii[r]), r);

      // sort by filename, then number of variables, then line number
      qsort(result, nNNs, sizeof(*result), comparePoints);

      // The result array may contain the queryPoint, so do not output it in the following.

      PPointT *cur = result, *end = result + nNNs;

      if ( ! no_filtering ) { // Filter out certain vectors and clusters.
	while (cur < end)  {	// Shall we discard the rest results
				// and start over for a new point? Not
				// now for the sake of
				// performance...TODO
	  ASSERT(*cur != NULL);
	  // Look for the first un-filtered point for the next bucket.
	  // NOTE(review): the statements that leave this loop and advance cur
	  // are not visible in this excerpt.
	  while ( cur < end ) {
	    if ( pointIsNotFiltered(cur) ) {
	    seen[(*cur)->index] = true;
	  if ( cur >= end )

	  // worthy becomes true as soon as one member differs from the first
	  // member of the bucket according to either heuristic below.
	  bool worthy = false;
	  int sizeBucket = 1; // 1 means the first un-filtered point
	  PPointT *begin = cur;
	  seen[(*begin)->index] = true;
	  while ( cur < end &&
		  // look for the next point outside the current file
		  // if interfiles is false; that point is the end of
		  // current bucket (assume vectors in a bucket are
		  // sorted by their filenames already).
		  ( interfiles || strcmp((*begin)->filename, (*cur)->filename)==0 ) ) {
	    if ( pointIsNotFiltered(cur) ) {
	      // prepare for filtering

	      // the first heuristics for bugs AFTER filtering:
	      // (different nVARs value than the first point of the bucket)
	      worthy = worthy || (*begin)->prop[ENUM_PPROP_nVARs-1] != (*cur)->prop[ENUM_PPROP_nVARs-1];

	      // the second heuristics for bugs AFTER filtering:
	      worthy = worthy || inconsistentIDchanges((*begin)->oids, (*cur)->oids); // TODO
	    seen[(*cur)->index] = true;
	  // output the bucket if:
	  //   - there are >= 2 different points
	  //   - there are <= upperBound (default 0) && >= lowerBound (default 2) points
	  //   - there are >= 2 different numbers of variables
	  // and update nBuckets and nBucketedPoints consequently
	  // An upperBound smaller than lowerBound (the defaults are 0 and 2)
	  // switches the upper limit off. With bug detecting disabled, the
	  // worthy flag is ignored.
	  if (sizeBucket >= lowerBound && (upperBound < lowerBound || sizeBucket <= upperBound) && ( bug_detecting ? worthy : true ) ) {
	    for (PPointT *p = begin; p < cur; p++)  {
	      ASSERT(*p != NULL);
	      if ( pointIsNotFiltered(p) ) {
		// compute the distance to the query point (maybe useless)
		RealT distance = 0.;
		// This inner `i` shadows the point index `i` of the outer loop.
		for (IntT i = 0; i < pointsDimension; i++) {
		  RealT t = (*p)->coordinates[i] - queryPoint->coordinates[i];
		  // L1 distance
// 		  distance += (t >= 0) ? t : -t;
		  // Pi--L2 distance, LSH uses L2 by default, we should output L2 distance here. 
		  distance += t*t;
		// L1 distance
// 		printf("%09d\tdist:%0.1lf", (*p)->index, distance);
		// L2 distance
		// (square root of the accumulated squared differences)
		printf("%09d\tdist:%0.1lf", (*p)->index, sqrt(distance));
		printf("\tFILE %s LINE:%d:%d NODE_KIND:%d nVARs:%d NUM_NODE:%d TBID:%d TEID:%d\n",
		       (*p)->filename, (*p)->prop[ENUM_PPROP_LINE-1], (*p)->prop[ENUM_PPROP_OFFSET-1],
		       (*p)->prop[ENUM_PPROP_NODE_KIND-1], (*p)->prop[ENUM_PPROP_nVARs-1],
		       (*p)->prop[ENUM_PPROP_NUM_NODE-1], (*p)->prop[ENUM_PPROP_TBID-1], (*p)->prop[ENUM_PPROP_TEID-1]);
		//CR_ASSERT(distance(pointsDimension, queryPoint, *p) <= listOfRadii[r]);
		//DPRINTF("Distance: %lf\n", distance(pointsDimension, queryPoint, result[j]));
		//printRealVector("NN: ", pointsDimension, result[j]->coordinates);
	  } // end of enumeration of a bucket
	}	// end of !no_filtering
      else {
	// Unfiltered mode: print every returned neighbour, provided the result
	// set has at least lowerBound members.
	if ( nNNs>=lowerBound ) { // filter out non-clones anyway
	  for (PPointT *p = cur; p < end; p++)  {
	    ASSERT(*p != NULL);
	    seen[(*p)->index] = true;
	    // compute the distance to the query point (maybe useless)
	    RealT distance = 0.;
	    // This inner `i` shadows the point index `i` of the outer loop.
	    for (IntT i = 0; i < pointsDimension; i++) {
	      RealT t = (*p)->coordinates[i] - queryPoint->coordinates[i];
	      // L1 distance
// 	      distance += (t >= 0) ? t : -t;
	      // Pi--L2 distance, LSH uses L2 by default, we should output L2 distance here. 
	      distance += t*t;

	    // L1 distance
// 	    printf("%09d\tdist:%0.1lf", (*p)->index, distance);
	    // L2 distance
	    printf("%09d\tdist:%0.1lf", (*p)->index, sqrt(distance));
	    printf("\tFILE %s LINE:%d:%d NODE_KIND:%d nVARs:%d NUM_NODE:%d TBID:%d TEID:%d\n",
		   (*p)->filename, (*p)->prop[ENUM_PPROP_LINE-1], (*p)->prop[ENUM_PPROP_OFFSET-1],
		   (*p)->prop[ENUM_PPROP_NODE_KIND-1], (*p)->prop[ENUM_PPROP_nVARs-1],
		   (*p)->prop[ENUM_PPROP_NUM_NODE-1], (*p)->prop[ENUM_PPROP_TBID-1], (*p)->prop[ENUM_PPROP_TEID-1]);
	    //CR_ASSERT(distance(pointsDimension, queryPoint, *p) <= listOfRadii[r]);
	    //DPRINTF("Distance: %lf\n", distance(pointsDimension, queryPoint, result[j]));
	    //printRealVector("NN: ", pointsDimension, result[j]->coordinates);
	  } // end of enumeration of a bucket
	} // end of nNNs>=lowerBound
      }	// end of no_filtering and  exploration of NNs
    } // for (...nRadii...)

  // Simple statistics and finish
  if (nQueries > 0) {
    // Turn the accumulated query time into a per-query mean.
    meanQueryTime = meanQueryTime / nQueries;
    printf("\n%ld queries, Mean query time: %0.6lf\n", nQueries, (double)meanQueryTime);
    printf("%ld buckets, %ld points (out of %ld, %.2f %%) in them\n",
	   nBuckets, nBucketedPoints, nPoints, 100*(float)nBucketedPoints/(float)nPoints);
  } else {
    printf("No query\n");


  return 0;
Exemplo n.º 2
// Determines the run-time coefficients of the different parts of the
// query algorithm. Values that are computed and returned are
// <lshPrecomp>, <uhashOver>, <distComp>. <lshPrecomp> is the time for
// pre-computing one function from the LSH family. <uhashOver> is the
// time for getting a bucket from a hash table (of buckets).<distComp>
// is the time to compute one distance between two points. These times
// are computed by constructing a R-NN DS on a sample data set and
// running a sample query set on it.
//
// Parameters (as used in the visible code):
//   thresholdR, successProbability, useUfunctions, typeHT, dimension
//       -- copied into the RNNParametersT of the test structure.
//   nPoints  -- number of entries of realData that may be sampled.
//   realData -- source of the sample data set and of the random query points.
//   lshPrecomp, uhashOver, distComp -- outputs, passed by reference.
//
// Side effect visible here: the global timingOn is forced to TRUE for the
// measurement and restored to its previous value at the end.
//
// NOTE(review): this listing appears to be an excerpt. The `do {` matching
// the final `while`, the `switch` around `case HT_LINKED_LIST`, several
// `else` branches and closing braces are not visible. Confirm control flow
// against the complete source.
void determineRTCoefficients(RealT thresholdR, 
			     RealT successProbability, 
			     BooleanT useUfunctions, 
			     IntT typeHT, // type of hash table to build
			     IntT dimension, 
			     Int32T nPoints, 
			     PPointT *realData, 
			     RealT &lshPrecomp, 
			     RealT &uhashOver, 
			     RealT &distComp){

  // use a subset of the original data set.
  // there is not much theory behind the formula below.    // (it only reduces the problem size)
  IntT n = nPoints / 50;    // sample size: 1/50 of the data set
  if (n < 100) {            // if fewer than 100 points would be sampled, use nPoints instead
    n = nPoints;
  // Cap the sample at 10000 points.
  if (n > 10000) {
    n = 10000;

  // Initialize the data set to use.
  PPointT *dataSet;
  FAILIF(NULL == (dataSet = (PPointT*)MALLOC(n * sizeof(PPointT))));
  for(IntT i = 0; i < n; i++){           // draw n points at random from the real data set (at most 10000)
    dataSet[i] = realData[genRandomInt(0, nPoints - 1)];

  // NOTE(review): hashTableSize is not used anywhere in the visible code.
  IntT hashTableSize = n;                // also initialized to n; open question from the annotator: is this a number of points or a number of buckets?
  RNNParametersT algParameters;
  algParameters.parameterR = thresholdR;   // radius
  algParameters.successProbability = successProbability;
  algParameters.dimension = dimension;
  // NOTE(review): as listed, both assignments to parameterR2 run and the
  // second one wins; presumably they were alternatives selected by
  // preprocessor lines that are not visible here -- confirm.
  algParameters.parameterR2 = thresholdR;       // with L1 distance: R2 = R
  algParameters.parameterR2 = SQR(thresholdR);   // with L2 distance: R2 = R^2
  algParameters.useUfunctions = useUfunctions;
  algParameters.parameterK = 16;       // k is fixed to 16 for this timing estimate only; the annotator guesses 16 was kept because it is the best k
  algParameters.parameterW = PARAMETER_W_DEFAULT;    // annotator: W = 4, which the manual reports as the best value after many tests
  algParameters.parameterT = n;                     // number of points
  algParameters.typeHT = typeHT;                      // hash table type (annotator: HT_HYBRID_CHAINS, said to be defined around line 405)

  // NOTE(review): as listed, parameterM and parameterL are each assigned
  // twice; presumably the second pair belongs to an `else` branch that is not
  // visible here -- confirm.
  if (algParameters.useUfunctions){
    algParameters.parameterM = computeMForULSH(algParameters.parameterK, algParameters.successProbability);     // improved L and M (U-functions)
    algParameters.parameterL = algParameters.parameterM * (algParameters.parameterM - 1) / 2;
    algParameters.parameterM = computeLfromKP(algParameters.parameterK, algParameters.successProbability);          // as in the paper: M = L
    algParameters.parameterL = algParameters.parameterM;

//   FAILIF(NULL == (dataSet = (PPointT*)MALLOC(n * sizeof(PPointT))));
//   for(IntT i = 0; i < n; i++){
//     FAILIF(NULL == (dataSet[i] = (PPointT)MALLOC(sizeof(PointT))));
//     FAILIF(NULL == (dataSet[i]->coordinates = (RealT*)MALLOC(dimension * sizeof(RealT))));

//     dataSet[i]->index = i;
//     sqrLength = 0;
//     for(IntT d = 0; d < dimension; d++){
//       if (i == 0) {
// 	dataSet[i]->coordinates[d] = genUniformRandom(-100, 100);
//       }else{
// 	dataSet[i]->coordinates[d] = dataSet[0]->coordinates[d];
//       }
//       sqrLength += SQR(dataSet[i]->coordinates[d]);
//     }
//     dataSet[i]->sqrLength = sqrLength;
//   }

  // switch on timing
  // The previous value is saved so that it can be restored at the end.
  BooleanT tempTimingOn = timingOn;    // annotator: initialized to TRUE
  timingOn = TRUE;

  // initialize result arrays
  PPointT *result = NULL;             // result set and its initialization
  IntT resultSize = 0;
  IntT nNNs;
  IntT nSucReps;

    // create the test structure
    PRNearNeighborStructT nnStruct;
    case HT_LINKED_LIST:
      nnStruct = initLSH(algParameters, n);
      // add points to the test structure
      // NOTE(review): this adds realData[i] (the first n real points), not
      // the randomly sampled dataSet[i] -- confirm that this is intended.
      for(IntT i = 0; i < n; i++){
	addNewPointToPRNearNeighborStruct(nnStruct, realData[i]);
      nnStruct = initLSH_WithDataSet(algParameters, n, dataSet);   // annotator: builds the structure from the parameters, point count and data set (hashes the points and stores them in buckets)

    // query point
    PPointT queryPoint;
//     FAILIF(NULL == (queryPoint = (PPointT)MALLOC(sizeof(PointT))));
//     FAILIF(NULL == (queryPoint->coordinates = (RealT*)MALLOC(dimension * sizeof(RealT))));
//     RealT sqrLength = 0;
//     for(IntT i = 0; i < dimension; i++){
//       queryPoint->coordinates[i] = dataSet[0]->coordinates[i];
//       //queryPoint->coordinates[i] = 0.1;
//       sqrLength += SQR(queryPoint->coordinates[i]);
//     }
    //queryPoint->coordinates[0] = dataPoint->coordinates[0] + 0.0001;
    //queryPoint->sqrLength = sqrLength;

    // reset the R parameter so that there are no NN neighbors.
    setResultReporting(nnStruct, FALSE);

    // The three outputs are accumulated over the repetitions below.
    lshPrecomp = 0;
    uhashOver = 0;
    distComp = 0;
    IntT nReps = 20;
    // NOTE(review): no increment of nSucReps is visible in this excerpt,
    // although it is later used as a divisor and as the loop condition.
    nSucReps = 0;
    for(IntT rep = 0; rep < nReps; rep++){
      queryPoint = realData[genRandomInt(0, nPoints - 1)];   // the query point is drawn at random from the data set
      // Clear the global timers/counters before each measured query.
      timeComputeULSH = 0;
      timeGetBucket = 0;
      timeCycleBucket = 0;
      nOfDistComps = 0;                // number of point-to-point distance computations
      nNNs = getNearNeighborsFromPRNearNeighborStruct(nnStruct, queryPoint, result, resultSize);   
      //DPRINTF("Time to compute LSH: %0.6lf\n", timeComputeULSH);
      //DPRINTF("Time to get bucket: %0.6lf\n", timeGetBucket);
      //DPRINTF("Time to cycle through buckets: %0.9lf\n", timeCycleBucket);
      //DPRINTF("N of dist comp: %d\n", nOfDistComps);

      ASSERT(nNNs == 0);    // the query must return no neighbours (presumably because result reporting was switched off above); the assertion fails otherwise
      if (nOfDistComps >= MIN(n / 10, 100)){    // only count the timings when enough distance computations happened
	lshPrecomp += timeComputeULSH / algParameters.parameterK / algParameters.parameterM;  // hashing time per hash function for one point (total divided by K and by M)
	uhashOver += timeGetBucket / algParameters.parameterL;     // time to fetch one bucket (total divided by L)
	distComp += timeCycleBucket / nOfDistComps;   // bucket-walking time per distance computation

    // With at least 5 usable repetitions the sums are turned into means.
    if (nSucReps >= 5){
      lshPrecomp /= nSucReps;
      uhashOver /= nSucReps;
      distComp /= nSucReps;
      DPRINTF1("RT coeffs computed.\n");
      algParameters.parameterR *= 2; // double the radius and repeat  // (too few distance computations happened)
      DPRINTF1("Could not determine the RT coeffs. Repeating.\n");


  }while(nSucReps < 5);       // repeat until at least 5 valid measurements were obtained


  // Restore the timing flag saved above.
  timingOn = tempTimingOn;
Exemplo n.º 3
// Command-line driver, as far as this excerpt shows. Positional arguments
// read by the visible code:
//   args[1] number of data points      args[2] number of queries
//   args[3] dimension                  args[4] success probability
//   args[5] radius R, or a file listing several radii with memory ratios
//   args[7] query file                 args[8] available memory
//   args[9] optional "-c" (print computed parameters) or "-p" (read
//           parameters from the file named in args[10])
// It then reads every query point, runs it against each radius' structure
// and prints the neighbours sorted by distance. Returns 0 at the end.
//
// NOTE(review): this listing appears to be an excerpt. Opening and closing
// braces, `else` lines, usage/exit calls and the code that reads the data
// set (presumably from args[6]) are not visible. Confirm control flow
// against the complete source.
int main_T(int nargs, char **args)

	/* Official LSH script: 10 arguments
	1000 9 784 0.9  0.6  mnist1k.dts  mnist1k.q
	bin/LSHMain $nDataSet $nQuerySet $dimension $successProbability "$1" "$2" "$3" $m -c*/

	// compute parameters:	bin/LSHMain 1000 9 784 0.9  "0.6" "mnist1k.dts" "mnist1k.q" 1002000000  -c

	//bin/LSHMain $nDataSet $nQuerySet $dimension $successProbability 1.0 "$1" "$2" $m -p "$3"

	// matching (query):	bin/LSHMain 1000 9 784 0.9 1.0  "mnist1k.dts" "mnist1k.q" 1002000000  -p  "outputparma.txt"
  // NOTE(review): the body of this argument-count check is not visible.
  if(nargs < 9)


  // Parse part of the command-line parameters.
  nPoints = atoi(args[1]);
  IntT nQueries = atoi(args[2]);
  pointsDimension = atoi(args[3]);
  successProbability = atof(args[4]);
  char* endPtr[1];
  RealT thresholdR = strtod(args[5], endPtr);// distance threshold for near neighbours
  // strtod: string-to-double conversion function
  // endPtr receives the position just past the parsed number

  // NOTE(review): endPtr has a single element and strtod stores its end
  // pointer in endPtr[0]; the test below reads endPtr[1], which is outside
  // the array. endPtr[0] looks like the intended operand -- confirm and fix.
  if (thresholdR == 0 || endPtr[1] == args[5])
    // The value for R is not specified, instead there is a file
    // specifying multiple R's.
    thresholdR = 0;

    // Read in the file
    // File layout as read here: the number of radii, then one
    // (radius, memory ratio) pair per radius.
    // NOTE(review): the return value of fscanf is not checked.
    FILE *radiiFile = fopen(args[5], "rt");
    FAILIF(radiiFile == NULL);
    fscanf(radiiFile, "%d\n", &nRadii);
    ASSERT(nRadii > 0);
    FAILIF(NULL == (listOfRadii = (RealT*)MALLOC(nRadii * sizeof(RealT))));
    FAILIF(NULL == (memRatiosForNNStructs = (RealT*)MALLOC(nRadii * sizeof(RealT))));
    for(IntT i = 0; i < nRadii; i++)
      FSCANF_REAL(radiiFile, &listOfRadii[i]);
      ASSERT(listOfRadii[i] > 0);
      FSCANF_REAL(radiiFile, &memRatiosForNNStructs[i]);
      ASSERT(memRatiosForNNStructs[i] > 0);
    // Single-radius case (presumably the `else` branch, whose keyword is not
    // visible here): one radius with memory ratio 1.
    nRadii = 1;
    FAILIF(NULL == (listOfRadii = (RealT*)MALLOC(nRadii * sizeof(RealT))));
    FAILIF(NULL == (memRatiosForNNStructs = (RealT*)MALLOC(nRadii * sizeof(RealT))));
    listOfRadii[0] = thresholdR;
    memRatiosForNNStructs[0] = 1;
  }// end of handling the threshold R and the radii

  DPRINTF("No. radii: %d\n", nRadii);
  //thresholdR = atof(args[5]);
  availableTotalMemory = atoll(args[8]);// $M is the amount of memory available

  // The same message is printed to stdout and to ERROR_OUTPUT.
  if (nPoints > MAX_N_POINTS)
    printf("Error: the structure supports at most %d points (%d were specified).\n", MAX_N_POINTS, nPoints);
    fprintf(ERROR_OUTPUT, "Error: the structure supports at most %d points (%d were specified).\n", MAX_N_POINTS, nPoints);


  //#define MALLOC(amount) ((amount > 0) ? totalAllocatedMemory += amount, malloc(amount) : NULL)

  DPRINTF("Allocated memory (after reading data set): %lld\n", totalAllocatedMemory);

  // Fixed-size arrays (N_SAMPLE_QUERY_POINTS entries) replace the
  // variable-length versions kept in the comments below.
  Int32T nSampleQueries = N_SAMPLE_QUERY_POINTS;
  PPointT sampleQueries[N_SAMPLE_QUERY_POINTS];
  Int32T sampleQBoundaryIndeces[N_SAMPLE_QUERY_POINTS];
//  PPointT sampleQueries[nSampleQueries];
 // Int32T sampleQBoundaryIndeces[nSampleQueries];
  if ((nargs <= 9)   ||  (strcmp("-c", args[9]) == 0)    )
    // In this cases, we need to generate a sample query set for
    // computing the optimal parameters.

    // Generate a sample query set.
    FILE *queryFile = fopen(args[7], "rt");
    if (strcmp(args[7], ".") == 0 || queryFile == NULL || nQueries <= 0)
      // Choose several data set points for the sample query points.
		for(IntT i = 0; i < nSampleQueries; i++){
			sampleQueries[i] = dataSetPoints[genRandomInt(0, nPoints - 1)];

      // Choose several actual query points for the sample query points.
      // Random query indices are drawn and sorted, then the query file is
      // walked once: lines whose index was drawn are parsed with readPoint,
      // the other lines are skipped with the two fscanf calls.
      // NOTE(review): the statements that advance j, and the `else` that
      // separates reading from skipping, are not visible in this excerpt.
		  nSampleQueries = MIN(nSampleQueries, nQueries);
		   Int32T sampleIndeces[N_SAMPLE_QUERY_POINTS];
		  //Int32T sampleIndeces[nSampleQueries];
		  for(IntT i = 0; i < nSampleQueries; i++)
			sampleIndeces[i] = genRandomInt(0, nQueries - 1);
		  qsort(sampleIndeces, nSampleQueries, sizeof(*sampleIndeces), compareInt32T);
		  //printIntVector("sampleIndeces: ", nSampleQueries, sampleIndeces);
		  Int32T j = 0;
		  for(Int32T i = 0; i < nQueries; i++)
			  if (i == sampleIndeces[j])
				  sampleQueries[j] = readPoint(queryFile);
				  while (i == sampleIndeces[j])
					  sampleQueries[j] = sampleQueries[j - 1];
				  fscanf(queryFile, "%[^\n]", sBuffer);
				  fscanf(queryFile, "\n");
		  nSampleQueries = j;


    // Compute the array sampleQBoundaryIndeces that specifies how to
    // segregate the sample query points according to their distance
    // to NN.
  }//if ((nargs < 9) || (strcmp("-c", args[9]) == 0))

  RNNParametersT *algParameters = NULL;
  PRNearNeighborStructT *nnStructs = NULL;
  // NOTE(review): the block comment opened on the line after the next one is
  // never visibly closed in this excerpt; its closing delimiter is presumably
  // among the missing lines -- confirm against the complete source.
  if (nargs > 9) 
  {/* Official LSH script: 10 arguments
bin/LSHMain $nDataSet $nQuerySet $dimension $successProbability "$1" "$2" "$3" $m -c

    // Additional command-line parameter is specified.
    if (strcmp("-c", args[9]) == 0) 	// -c means: compute the optimal parameters
 // Only compute the R-NN DS parameters and output them to stdout.
      printf("%d\n", nRadii);
      for(IntT i = 0; i < nRadii; i++)
		// which sample queries to use
		Int32T segregatedQStart = (i == 0) ? 0 : sampleQBoundaryIndeces[i - 1];
		Int32T segregatedQNumber = nSampleQueries - segregatedQStart;
		if (segregatedQNumber == 0) 
		  // XXX: not the right answer
		  segregatedQNumber = nSampleQueries;
		  segregatedQStart = 0;
		ASSERT(segregatedQStart < nSampleQueries);
		ASSERT(segregatedQStart >= 0);
		ASSERT(segregatedQStart + segregatedQNumber <= nSampleQueries);
		ASSERT(segregatedQNumber >= 0);

		RNNParametersT optParameters = computeOptimalParameters(listOfRadii[i],
									sampleQueries + segregatedQStart,
// NOTE(review): the two untranslated lines below are stray annotation text,
// not code. In English: availableTotalMemory (total memory, passed in) minus
// totalAllocatedMemory (allocated through MALLOC), times 1, is the memory
// upper bound; then the test L * nPoints > memoryUpperBound / 12 is used as
// the constraint.
availableTotalMemory总共的内存(传入) - totalAllocatedMemory(使用mallloc分配的)*1=内存上限

然后(L * nPoints > memoryUpperBound / 12 来约束
									(MemVarT)((availableTotalMemory - totalAllocatedMemory) * memRatiosForNNStructs[i]));
		printRNNParameters(stdout, optParameters);
	else if (strcmp("-p", args[9]) == 0) 
      // Read the R-NN DS parameters from the given file and run the
      // queries on the constructed data structure.
      // NOTE(review): args[10] is read below, which needs nargs >= 11, while
      // this check only tests nargs < 10 (its body is not visible) -- confirm
      // the bound.
      if (nargs < 10)
      FILE *pFile = fopen(args[10], "rt");
      FAILIFWR(pFile == NULL, "Could not open the params file.");
      fscanf(pFile, "%d\n", &nRadii);
      DPRINTF1("Using the following R-NN DS parameters:\n");
      DPRINTF("N radii = %d\n", nRadii);
      FAILIF(NULL == (nnStructs = (PRNearNeighborStructT*)MALLOC(nRadii * sizeof(PRNearNeighborStructT))));
      FAILIF(NULL == (algParameters = (RNNParametersT*)MALLOC(nRadii * sizeof(RNNParametersT))));
      for(IntT i = 0; i < nRadii; i++)
		  algParameters[i] = readRNNParameters(pFile);// read the parameters from the file

		  printRNNParameters(stderr, algParameters[i]);
		  nnStructs[i] = initLSH_WithDataSet(algParameters[i], nPoints, dataSetPoints);
		  // annotator: initializes the whole structure -- the top-level struct, the L hash tables, and the mapping of points into buckets

      // The dimension and the radii are taken from the parameters just read.
      pointsDimension = algParameters[0].dimension;
      FAILIF(NULL == (listOfRadii = (RealT*)MALLOC(nRadii * sizeof(RealT))));
      for(IntT i = 0; i < nRadii; i++)
		  listOfRadii[i] = algParameters[i].parameterR;
      // Wrong option.
  }//if (nargs > 9) 
    // No extra option: build one self-tuned structure per radius.
    FAILIF(NULL == (nnStructs = (PRNearNeighborStructT*)MALLOC(nRadii * sizeof(PRNearNeighborStructT))));
    // Determine the R-NN DS parameters, construct the DS and run the queries.
    for(IntT i = 0; i < nRadii; i++)
      // XXX: segregate the sample queries...
      nnStructs[i] = initSelfTunedRNearNeighborWithDataSet(listOfRadii[i], 
							   (MemVarT)((availableTotalMemory - totalAllocatedMemory) * memRatiosForNNStructs[i]));
  } // if (nargs <= 9) 



  // The result buffer can hold every point of the data set.
  // NOTE(review): unlike the allocations next to it, this MALLOC is not
  // wrapped in FAILIF.
  IntT resultSize = nPoints;
  PPointT *result = (PPointT*)MALLOC(resultSize * sizeof(*result));
  PPointT queryPoint;
  FAILIF(NULL == (queryPoint = (PPointT)MALLOC(sizeof(PointT))));
  FAILIF(NULL == (queryPoint->coordinates = (RealT*)MALLOC(pointsDimension * sizeof(RealT))));

  // The query file is opened (again) here for the actual queries; no fclose
  // is visible in this excerpt.
  FILE *queryFile = fopen(args[7], "rt");
  FAILIF(queryFile == NULL);
  TimeVarT meanQueryTime = 0;
  PPointAndRealTStructT *distToNN = NULL;
  for(IntT i = 0; i < nQueries; i++)

    RealT sqrLength = 0;
    // read in the query point.
    for(IntT d = 0; d < pointsDimension; d++)

      FSCANF_REAL(queryFile, &(queryPoint->coordinates[d]));
      sqrLength += SQR(queryPoint->coordinates[d]);

	  // Debug output: prints coordinates 151..159 of every query point.
	  if (d >150 &&  d<160)
		  printf(" %lf ",queryPoint->coordinates[d]);
	  // NOTE(review): the body of the next `if` is not visible in this excerpt.
	  if ( d==160)
    queryPoint->sqrLength = sqrLength;
    //printRealVector("Query: ", pointsDimension, queryPoint->coordinates);

    // get the near neighbors.
    IntT nNNs = 0;
    for(IntT r = 0; r < nRadii; r++)

      nNNs = getRNearNeighbors(nnStructs[r], queryPoint, result, resultSize);

      // timeRNNQuery is presumably updated by the query call above -- confirm.
      printf("Total time for R-NN query at radius %0.6lf (radius no. %d):\t%0.6lf\n", (double)(listOfRadii[r]), r, timeRNNQuery);
      meanQueryTime += timeRNNQuery;

      if (nNNs > 0)
		printf("Query point %d: found %d NNs at distance %0.6lf (%dth radius). First %d NNs are:\n", 
			i, nNNs, (double)(listOfRadii[r]), r, MIN(nNNs, MAX_REPORTED_POINTS));
		// compute the distances to the found NN, and sort according to the distance
		// distToNN is grown with REALLOC and reused across queries.
		FAILIF(NULL == (distToNN = (PPointAndRealTStructT*)REALLOC(distToNN, nNNs * sizeof(*distToNN))));
		for(IntT p = 0; p < nNNs; p++)
		  distToNN[p].ppoint = result[p];
		  distToNN[p].real = distance(pointsDimension, queryPoint, result[p]);
		qsort(distToNN, nNNs, sizeof(*distToNN), comparePPointAndRealTStructT);

		// Print the points
		// (at most MAX_REPORTED_POINTS of them, in sorted order)
		for(IntT j = 0; j < MIN(nNNs, MAX_REPORTED_POINTS); j++)
		  ASSERT(distToNN[j].ppoint != NULL);
		  printf("%09d\tDistance:%0.6lf\n", distToNN[j].ppoint->index, distToNN[j].real);
		  CR_ASSERT(distToNN[j].real <= listOfRadii[r]);
		  //DPRINTF("Distance: %lf\n", distance(pointsDimension, queryPoint, result[j]));
		  //printRealVector("NN: ", pointsDimension, result[j]->coordinates);
    if (nNNs == 0)
      printf("Query point %d: no NNs found.\n", i);
  }// end for(IntT i = 0; i < nQueries; i++): one query per point

  if (nQueries > 0)
    // Turn the accumulated query time into a per-query mean.
    meanQueryTime = meanQueryTime / nQueries;
    printf("Mean query time: %0.6lf\n", (double)meanQueryTime);

  // NOTE(review): the body of this loop is not visible in this excerpt.
  for(IntT i = 0; i < nRadii; i++)
  // XXX: should ideally free the other stuff as well.

  return 0;
Exemplo n.º 4
/*
 * Entry point of the LSH driver: parses the command line, loads the data
 * set, obtains the R-NN data-structure parameters and runs the queries.
 *
 * Positional arguments, as read below:
 *   args[1]  number of data set points
 *   args[2]  number of queries
 *   args[3]  point dimension
 *   args[4]  success probability
 *   args[5]  radius R, or the name of a file listing several radii
 *   args[6]  data set file
 *   args[7]  query file ("." means: no query file for sampling)
 *   args[8]  available total memory
 *   args[9]  optional: "-c" (only compute and print the parameters) or
 *            "-p" (read the parameters from the file named by args[10])
 *
 * Returns 0 on completion.
 *
 * NOTE(review): this listing appears to have lost lines (closing braces,
 * the bodies of some guards, some call arguments). They are deliberately
 * not reconstructed here; only the defects documented inline are fixed.
 */
int main(int nargs, char **args){
  if(nargs < 9){


  // Parse part of the command-line parameters.
  nPoints = atoi(args[1]);
  IntT nQueries = atoi(args[2]);
  pointsDimension = atoi(args[3]);
  successProbability = atof(args[4]);
  char* endPtr[1];
  RealT thresholdR = strtod(args[5], endPtr);
  // strtod stores the end-of-parse pointer in endPtr[0]; the array has a
  // single element, so the previous read of endPtr[1] was out of bounds.
  // endPtr[0] == args[5] means no characters were converted.
  if (thresholdR == 0 || endPtr[0] == args[5]){
    // The value for R is not specified, instead there is a file
    // specifying multiple R's.
    thresholdR = 0;

    // Read in the file: first the number of radii, then for every radius
    // its value followed by its memory ratio.
    FILE *radiiFile = fopen(args[5], "rt");
    FAILIF(radiiFile == NULL);
    fscanf(radiiFile, "%d\n", &nRadii);
    ASSERT(nRadii > 0);
    FAILIF(NULL == (listOfRadii = (RealT*)MALLOC(nRadii * sizeof(RealT))));
    FAILIF(NULL == (memRatiosForNNStructs = (RealT*)MALLOC(nRadii * sizeof(RealT))));
    for(IntT i = 0; i < nRadii; i++){
      FSCANF_REAL(radiiFile, &listOfRadii[i]);
      ASSERT(listOfRadii[i] > 0);
      FSCANF_REAL(radiiFile, &memRatiosForNNStructs[i]);
      ASSERT(memRatiosForNNStructs[i] > 0);
    // Single radius given directly on the command line.
    nRadii = 1;
    FAILIF(NULL == (listOfRadii = (RealT*)MALLOC(nRadii * sizeof(RealT))));
    FAILIF(NULL == (memRatiosForNNStructs = (RealT*)MALLOC(nRadii * sizeof(RealT))));
    listOfRadii[0] = thresholdR;
    memRatiosForNNStructs[0] = 1;
  DPRINTF("No. radii: %d\n", nRadii);
  //thresholdR = atof(args[5]);
  availableTotalMemory = atoll(args[8]);

  // Refuse data sets larger than the structure supports; the message goes
  // to both stdout and ERROR_OUTPUT.
  if (nPoints > MAX_N_POINTS) {
    printf("Error: the structure supports at most %d points (%d were specified).\n", MAX_N_POINTS, nPoints);
    fprintf(ERROR_OUTPUT, "Error: the structure supports at most %d points (%d were specified).\n", MAX_N_POINTS, nPoints);

  readDataSetFromFile(args[6]); // read points into data structure
  DPRINTF("Allocated memory (after reading data set): %lld\n", totalAllocatedMemory);

  Int32T nSampleQueries = N_SAMPLE_QUERY_POINTS;
  PPointT sampleQueries[nSampleQueries];
  Int32T sampleQBoundaryIndeces[nSampleQueries];
  // args[9] is a null pointer when nargs == 9 (argv[argc] is always NULL),
  // so it must not reach strcmp. With no option given the sample set is
  // also needed (parameters are self-tuned further below), hence "<= 9".
  if ((nargs <= 9) || (strcmp("-c", args[9]) == 0)){
    // In this cases, we need to generate a sample query set for
    // computing the optimal parameters.

    // Generate a sample query set.
    FILE *queryFile = fopen(args[7], "rt");
    if (strcmp(args[7], ".") == 0 || queryFile == NULL || nQueries <= 0){
      // Choose several data set points for the sample query points.
      for(IntT i = 0; i < nSampleQueries; i++){
	sampleQueries[i] = dataSetPoints[genRandomInt(0, nPoints - 1)];
      // Choose several actual query points for the sample query points.
      nSampleQueries = MIN(nSampleQueries, nQueries);
      Int32T sampleIndeces[nSampleQueries];
      for(IntT i = 0; i < nSampleQueries; i++){
	sampleIndeces[i] = genRandomInt(0, nQueries - 1);
      // Sort the random indices so the query file can be read in one pass.
      qsort(sampleIndeces, nSampleQueries, sizeof(*sampleIndeces), compareInt32T);
      //printIntVector("sampleIndeces: ", nSampleQueries, sampleIndeces);
      Int32T j = 0;
      for(Int32T i = 0; i < nQueries; i++){
	if (i == sampleIndeces[j]){
	  sampleQueries[j] = readPoint(queryFile);
	  // Repeated indices reuse the point that was just read.
	  while (i == sampleIndeces[j]){
	    sampleQueries[j] = sampleQueries[j - 1];
	  // Skip the rest of a query line.
	  fscanf(queryFile, "%[^\n]", sBuffer);
	  fscanf(queryFile, "\n");
      nSampleQueries = j;

    // Compute the array sampleQBoundaryIndeces that specifies how to
    // segregate the sample query points according to their distance
    // to NN.

  RNNParametersT *algParameters = NULL;
  PRNearNeighborStructT *nnStructs = NULL;
  if (nargs > 9) {
    // Additional command-line parameter is specified.
    if (strcmp("-c", args[9]) == 0) {
      // Only compute the R-NN DS parameters and output them to stdout.
      printf("%d\n", nRadii);
      for(IntT i = 0; i < nRadii; i++){
	// which sample queries to use
	Int32T segregatedQStart = (i == 0) ? 0 : sampleQBoundaryIndeces[i - 1];
	Int32T segregatedQNumber = nSampleQueries - segregatedQStart;
	if (segregatedQNumber == 0) {
	  // XXX: not the right answer
	  segregatedQNumber = nSampleQueries;
	  segregatedQStart = 0;
	ASSERT(segregatedQStart < nSampleQueries);
	ASSERT(segregatedQStart >= 0);
	ASSERT(segregatedQStart + segregatedQNumber <= nSampleQueries);
	ASSERT(segregatedQNumber >= 0);
	// The memory budget is the still-unallocated memory scaled by this
	// radius's memory ratio.
	RNNParametersT optParameters = computeOptimalParameters(listOfRadii[i],
								sampleQueries + segregatedQStart,
								(MemVarT)((availableTotalMemory - totalAllocatedMemory) * memRatiosForNNStructs[i]));
	printRNNParameters(stdout, optParameters);
    } else if (strcmp("-p", args[9]) == 0) {
      // Read the R-NN DS parameters from the given file and run the
      // queries on the constructed data structure.
      // args[10] is only a valid argument when nargs >= 11; the previous
      // "nargs < 10" guard let nargs == 10 through to fopen(NULL, ...).
      if (nargs < 11){
      FILE *pFile = fopen(args[10], "rt");
      FAILIFWR(pFile == NULL, "Could not open the params file.");
      fscanf(pFile, "%d\n", &nRadii);
      DPRINTF1("Using the following R-NN DS parameters:\n");
      DPRINTF("N radii = %d\n", nRadii);
      FAILIF(NULL == (nnStructs = (PRNearNeighborStructT*)MALLOC(nRadii * sizeof(PRNearNeighborStructT))));
      FAILIF(NULL == (algParameters = (RNNParametersT*)MALLOC(nRadii * sizeof(RNNParametersT))));
      // One parameter record and one structure per radius.
      for(IntT i = 0; i < nRadii; i++){
	algParameters[i] = readRNNParameters(pFile);
	printRNNParameters(stderr, algParameters[i]);
	nnStructs[i] = initLSH_WithDataSet(algParameters[i], nPoints, dataSetPoints);

      // The parameters file overrides the dimension and the radii given on
      // the command line.
      pointsDimension = algParameters[0].dimension;
      FAILIF(NULL == (listOfRadii = (RealT*)MALLOC(nRadii * sizeof(RealT))));
      for(IntT i = 0; i < nRadii; i++){
	listOfRadii[i] = algParameters[i].parameterR;
    } else{
      // Wrong option.
  } else {
    FAILIF(NULL == (nnStructs = (PRNearNeighborStructT*)MALLOC(nRadii * sizeof(PRNearNeighborStructT))));
    // Determine the R-NN DS parameters, construct the DS and run the queries.
    for(IntT i = 0; i < nRadii; i++){
      // XXX: segregate the sample queries...
      nnStructs[i] = initSelfTunedRNearNeighborWithDataSet(listOfRadii[i], 
							   (MemVarT)((availableTotalMemory - totalAllocatedMemory) * memRatiosForNNStructs[i]));


  // The result buffer has room for every data set point.
  IntT resultSize = nPoints;
  PPointT *result = (PPointT*)MALLOC(resultSize * sizeof(*result));
  PPointT queryPoint;
  FAILIF(NULL == (queryPoint = (PPointT)MALLOC(sizeof(PointT))));
  FAILIF(NULL == (queryPoint->coordinates = (RealT*)MALLOC(pointsDimension * sizeof(RealT))));

  FILE *queryFile = fopen(args[7], "rt");
  FAILIF(queryFile == NULL);
  TimeVarT meanQueryTime = 0;
  PPointAndRealTStructT *distToNN = NULL;
  for(IntT i = 0; i < nQueries; i++){

    RealT sqrLength = 0;
    // read in the query point.
    for(IntT d = 0; d < pointsDimension; d++){
      FSCANF_REAL(queryFile, &(queryPoint->coordinates[d]));
      sqrLength += SQR(queryPoint->coordinates[d]);
    queryPoint->sqrLength = sqrLength;
    //printRealVector("Query: ", pointsDimension, queryPoint->coordinates);

    // get the near neighbors.
    IntT nNNs = 0;
    for(IntT r = 0; r < nRadii; r++){
      nNNs = getRNearNeighbors(nnStructs[r], queryPoint, result, resultSize);
      printf("Total time for R-NN query at radius %0.6lf (radius no. %d):\t%0.6lf\n", (double)(listOfRadii[r]), r, timeRNNQuery);
      meanQueryTime += timeRNNQuery;

      if (nNNs > 0){
	printf("Query point %d: found %d NNs at distance %0.6lf (%dth radius). First %d NNs are:\n", i, nNNs, (double)(listOfRadii[r]), r, MIN(nNNs, MAX_REPORTED_POINTS));
	// compute the distances to the found NN, and sort according to the distance
	FAILIF(NULL == (distToNN = (PPointAndRealTStructT*)REALLOC(distToNN, nNNs * sizeof(*distToNN))));
	for(IntT p = 0; p < nNNs; p++){
	  distToNN[p].ppoint = result[p];
	  distToNN[p].real = distance(pointsDimension, queryPoint, result[p]);
	qsort(distToNN, nNNs, sizeof(*distToNN), comparePPointAndRealTStructT);

	// Print the points
	for(IntT j = 0; j < MIN(nNNs, MAX_REPORTED_POINTS); j++){
	  ASSERT(distToNN[j].ppoint != NULL);
	  printf("%09d\tDistance:%0.6lf\n", distToNN[j].ppoint->index, distToNN[j].real);
	  // Every reported neighbour must lie within the queried radius.
	  CR_ASSERT(distToNN[j].real <= listOfRadii[r]);
	  //DPRINTF("Distance: %lf\n", distance(pointsDimension, queryPoint, result[j]));
	  //printRealVector("NN: ", pointsDimension, result[j]->coordinates);
    if (nNNs == 0){
      printf("Query point %d: no NNs found.\n", i);
  // meanQueryTime accumulated the time of every radius query; average it
  // over the number of query points.
  if (nQueries > 0){
    meanQueryTime = meanQueryTime / nQueries;
    printf("Mean query time: %0.6lf\n", (double)meanQueryTime);

  for(IntT i = 0; i < nRadii; i++){
  // XXX: should ideally free the other stuff as well.

  return 0;
Exemplo n.º 5
int main(int nargs, char **args){
  if(nargs < 9){


  // Parse part of the command-line parameters.
  nPoints = atoi(args[1]);
  IntT nQueries = atoi(args[2]);
  pointsDimension = atoi(args[3]);
  successProbability = atof(args[4]);
  char* endPtr[1];
  RealT thresholdR = strtod(args[5], endPtr);  //strtod将字符串转换成浮点数   //r=0.6
  //,到出现非数字或字符串结束时('')才结束转换, 并将结果返回。
  if (thresholdR == 0 || endPtr[1] == args[5]){   //确保阈值合法
    // The value for R is not specified, instead there is a file
    // specifying multiple R's.
    thresholdR = 0;

    // Read in the file
    FILE *radiiFile = fopen(args[5], "rt");
    FAILIF(radiiFile == NULL);
    fscanf(radiiFile, "%d\n", &nRadii);
    ASSERT(nRadii > 0);
    FAILIF(NULL == (listOfRadii = (RealT*)MALLOC(nRadii * sizeof(RealT))));
    FAILIF(NULL == (memRatiosForNNStructs = (RealT*)MALLOC(nRadii * sizeof(RealT))));
    for(IntT i = 0; i < nRadii; i++){
      FSCANF_REAL(radiiFile, &listOfRadii[i]);
      ASSERT(listOfRadii[i] > 0);
      FSCANF_REAL(radiiFile, &memRatiosForNNStructs[i]);
      ASSERT(memRatiosForNNStructs[i] > 0);
    nRadii = 1;     //半径的个数为1个
    FAILIF(NULL == (listOfRadii = (RealT*)MALLOC(nRadii * sizeof(RealT))));
    FAILIF(NULL == (memRatiosForNNStructs = (RealT*)MALLOC(nRadii * sizeof(RealT))));
    listOfRadii[0] = thresholdR;
    memRatiosForNNStructs[0] = 1;
  DPRINTF("No. radii: %d\n", nRadii);
  //thresholdR = atof(args[5]);
  availableTotalMemory = atoll(args[8]);

  if (nPoints > MAX_N_POINTS) {
    printf("Error: the structure supports at most %d points (%d were specified).\n", MAX_N_POINTS, nPoints);
    fprintf(ERROR_OUTPUT, "Error: the structure supports at most %d points (%d were specified).\n", MAX_N_POINTS, nPoints);

  readDataSetFromFile(args[6]);    //数据集的文件名
  DPRINTF("Allocated memory (after reading data set): %lld\n", totalAllocatedMemory);

  Int32T nSampleQueries = N_SAMPLE_QUERY_POINTS;   //样本查询点的个数,100
  PPointT sampleQueries[nSampleQueries];      //对查询点编号
  Int32T sampleQBoundaryIndeces[nSampleQueries];   //第一个大于半径的点的编号,如果有多个半径的话,就会记录更多
  if ((nargs < 9) || (strcmp("-c", args[9]) == 0)){       //计算最优参数
    // In this cases, we need to generate a sample query set for
    // computing the optimal parameters.

    // Generate a sample query set.
    FILE *queryFile = fopen(args[7], "rt");              //打开查询集,以只读文本方式打开
    if (strcmp(args[7], ".") == 0 || queryFile == NULL || nQueries <= 0){
      // Choose several data set points for the sample query points.  //如果没有查询点就随机选择几个数据集点作为查询点
      for(IntT i = 0; i < nSampleQueries; i++){
	sampleQueries[i] = dataSetPoints[genRandomInt(0, nPoints - 1)];
      // Choose several actual query points for the sample query points.
      nSampleQueries = MIN(nSampleQueries, nQueries);    //MIN(100,9)
      Int32T sampleIndeces[nSampleQueries];              //定义了一个查询点样本索引数组
      for(IntT i = 0; i < nSampleQueries; i++){          
	  ////为什么要对查询点索引进行随机变化? 想把样本查询点控制在一定的范围内,如果查询点过多,则样本点最多取100个查询点。
	      sampleIndeces[i] = genRandomInt(0, nQueries - 1);  //对查询点做了一下顺序的变化,对查询点的索引做随机处理。
	   // 根据你给的比较条件进行快速排序,通过指针的移动实验排序,排序之后的结果仍然放在原数组中,必须自己写一个比较函数
	  //http://www.slyar.com/blog/stdlib-qsort.html qsort(数组起始地址,数组元素大小,每个元素的大小,函数指针指向比较函数)
      qsort(sampleIndeces, nSampleQueries, sizeof(*sampleIndeces), compareInt32T); //qsort,C语言标准库函数,对样本查询点的索引值进行排序
      //printIntVector("sampleIndeces: ", nSampleQueries, sampleIndeces);
      Int32T j = 0;
      for(Int32T i = 0; i < nQueries; i++){
	if (i == sampleIndeces[j]){  //如果样本查询点的索引值与实际查询点的索引值一致,读入点
	  sampleQueries[j] = readPoint(queryFile);
	  while (i == sampleIndeces[j]){   //如果样本查询点之后的索引值与实践查询点的索引值一致,则直接将此点的值赋给后面一点的值
	    sampleQueries[j] = sampleQueries[j - 1];   //覆盖之后索引点的值
	    j++;          //取后面的点
	  fscanf(queryFile, "%[^\n]", sBuffer);
	  fscanf(queryFile, "\n");
      nSampleQueries = j;

    // Compute the array sampleQBoundaryIndeces that specifies how to
    // segregate the sample query points according to their distance
    // to NN.
			   nSampleQueries,    //查询集的点的个数
			   sampleQueries,     //查询点的集合,函数运行完成后,点的值将以距离数据集合的距离由小到大的顺序排序
			   nPoints,           //数据集点的个数
			   dataSetPoints,     //数据集集合
			   nRadii,            //半径的个数
			   listOfRadii,        //半径的值
  RNNParametersT *algParameters = NULL;
  PRNearNeighborStructT *nnStructs = NULL;
  if (nargs > 9) {
    // Additional command-line parameter is specified.
    if (strcmp("-c", args[9]) == 0) {
      // Only compute the R-NN DS parameters and output them to stdout. // 如果是-c,就只计算数据集参数,然后输出
      printf("%d\n", nRadii);           //打印出半径的个数:1个。 将写入到参数文件中,
      transformMemRatios();        //memRatiosForNNstructs,转换内存使用率。假设每个结构为1,每个半径占用的总内存的比率,用于计算内存
      for(IntT i = 0; i < nRadii; i++){   //看使用哪个样本查询点
	// which sample queries to use
	Int32T segregatedQStart = (i == 0) ? 0 : sampleQBoundaryIndeces[i - 1];   //起始点的位置
	Int32T segregatedQNumber = nSampleQueries - segregatedQStart;              //查询点的个数
	if (segregatedQNumber == 0) {                        //如果计算所得点的个数为0,就查询所有的点,从0到最后
	  // XXX: not the right answer
	  segregatedQNumber = nSampleQueries;
	  segregatedQStart = 0;
	ASSERT(segregatedQStart < nSampleQueries);
	ASSERT(segregatedQStart >= 0);
	ASSERT(segregatedQStart + segregatedQNumber <= nSampleQueries);
	ASSERT(segregatedQNumber >= 0);
	RNNParametersT optParameters = computeOptimalParameters(listOfRadii[i],    //计算最优的运行时间,
								sampleQueries + segregatedQStart,
								(MemVarT)((availableTotalMemory - totalAllocatedMemory) * memRatiosForNNStructs[i])); //比率
	printRNNParameters(stdout, optParameters);  //将参数打印出来
    } else if (strcmp("-p", args[9]) == 0) {
      // Read the R-NN DS parameters from the given file and run the
      // queries on the constructed data structure.
      if (nargs < 10){
      FILE *pFile = fopen(args[10], "rt");    //读取参数文件,由lsh_computeParas产生
      FAILIFWR(pFile == NULL, "Could not open the params file.");
      fscanf(pFile, "%d\n", &nRadii);    //这里只取了参数文件中的半径,那参数文件中的其他数据怎样被取用的??
     DPRINTF1("Using the following R-NN DS parameters:\n");   //使用R-NN DS(DateSet)参数
      DPRINTF("N radii = %d\n", nRadii);     //不知道将数据输出到哪里了??
	 // printf("Using the following R-NN DS parameters:\n");
	 // printf("N radii=%d\n",nRadii);
      FAILIF(NULL == (nnStructs = (PRNearNeighborStructT*)MALLOC(nRadii * sizeof(PRNearNeighborStructT))));
      FAILIF(NULL == (algParameters = (RNNParametersT*)MALLOC(nRadii * sizeof(RNNParametersT))));
      for(IntT i = 0; i < nRadii; i++){
	        algParameters[i] = readRNNParameters(pFile);      //将参数信息,输出到屏幕上
  //	printRNNParameters(stderr, algParameters[i]);@727
	        nnStructs[i] = initLSH_WithDataSet(algParameters[i], nPoints, dataSetPoints);  //根据用户输入的参数,初始化结构

      pointsDimension = algParameters[0].dimension;
      FAILIF(NULL == (listOfRadii = (RealT*)MALLOC(nRadii * sizeof(RealT))));
      for(IntT i = 0; i < nRadii; i++){
	listOfRadii[i] = algParameters[i].parameterR;
    } else{
      // Wrong option.
  } else {
    FAILIF(NULL == (nnStructs = (PRNearNeighborStructT*)MALLOC(nRadii * sizeof(PRNearNeighborStructT))));
    // Determine the R-NN DS parameters, construct the DS and run the queries.
    for(IntT i = 0; i < nRadii; i++){
      // XXX: segregate the sample queries...
      nnStructs[i] = initSelfTunedRNearNeighborWithDataSet(listOfRadii[i], 
							   (MemVarT)((availableTotalMemory - totalAllocatedMemory) * memRatiosForNNStructs[i]));

 // DPRINTF1("X\n");@

  IntT resultSize = nPoints;
  PPointT *result = (PPointT*)MALLOC(resultSize * sizeof(*result));
  PPointT queryPoint;
  FAILIF(NULL == (queryPoint = (PPointT)MALLOC(sizeof(PointT))));
  FAILIF(NULL == (queryPoint->coordinates = (RealT*)MALLOC(pointsDimension * sizeof(RealT))));

  FILE *queryFile = fopen(args[7], "rt");
  FAILIF(queryFile == NULL);
  TimeVarT meanQueryTime = 0;
  PPointAndRealTStructT *distToNN = NULL;
  for(IntT i = 0; i < nQueries; i++){

    RealT sqrLength = 0;
    // read in the query point.
    for(IntT d = 0; d < pointsDimension; d++){
      FSCANF_REAL(queryFile, &(queryPoint->coordinates[d]));
      sqrLength += SQR(queryPoint->coordinates[d]);   //向量到原点的距离
    queryPoint->sqrLength = sqrLength;
    //printRealVector("Query: ", pointsDimension, queryPoint->coordinates);

    // get the near neighbors.
    IntT nNNs = 0;
    for(IntT r = 0; r < nRadii; r++){
      nNNs = getRNearNeighbors(nnStructs[r], queryPoint, result, resultSize);
      printf("Total time for R-NN query at radius %0.6lf (radius no. %d):\t%0.6lf\n", (double)(listOfRadii[r]), r, timeRNNQuery);
      meanQueryTime += timeRNNQuery;

      if (nNNs > 0){
	printf("Query point %d: found %d NNs at distance %0.6lf (%dth radius). First %d NNs are:\n", i, nNNs, (double)(listOfRadii[r]), r, MIN(nNNs, MAX_REPORTED_POINTS));
	// compute the distances to the found NN, and sort according to the distance
	FAILIF(NULL == (distToNN = (PPointAndRealTStructT*)REALLOC(distToNN, nNNs * sizeof(*distToNN))));
	for(IntT p = 0; p < nNNs; p++){
	  distToNN[p].ppoint = result[p];
	  distToNN[p].real = distance(pointsDimension, queryPoint, result[p]);
	qsort(distToNN, nNNs, sizeof(*distToNN), comparePPointAndRealTStructT);  //C语言标准的函数

	// Print the points
	for(IntT j = 0; j < MIN(nNNs, MAX_REPORTED_POINTS); j++){
	  ASSERT(distToNN[j].ppoint != NULL);
	  printf("%09d\tDistance:%0.6lf\n", distToNN[j].ppoint->index, distToNN[j].real);   //打印点的坐标
	  CR_ASSERT(distToNN[j].real <= listOfRadii[r]);
	  //DPRINTF("Distance: %lf\n", distance(pointsDimension, queryPoint, result[j]));
	  //printRealVector("NN: ", pointsDimension, result[j]->coordinates);
    if (nNNs == 0){
      printf("Query point %d: no NNs found.\n", i);
  if (nQueries > 0){
    meanQueryTime = meanQueryTime / nQueries;
    printf("Mean query time: %0.6lf\n", (double)meanQueryTime);

  for(IntT i = 0; i < nRadii; i++){
  // XXX: should ideally free the other stuff as well.

  return 0;
Exemplo n.º 6
