Пример #1
0
// START FUNC DECL
int 
approx_quantile(
		int * x,
		char * cfld,
		long long siz,
		long long num_quantiles,
		double eps,
		int *y,
		long long y_siz,
		int *ptr_estimate_is_good
		)
// STOP FUNC DECL
//----------------------------------------------------------------------------
/* README:

status = approx_quantile(x,cfld, siz,num_quantiles,eps,y,y_siz,ptr_estimate_is_good): Calculates the approximate quantiles of an integer set using very little memory. 

For example: If you request 10 quantiles with an eps of 0.001, the {10%, 20%, ..., 90%, 100%} quantiles will be answered with an error of +/- 0.1%, i.e., the 10% quantile will definitely be between 9.9% and 10.1%, and the 90% quantile will definitely be between 89.9% and 90.1%.

Author: Kishore Jaganathan

Algorithm: Munro-Patterson Algorithm by G.S.Manku ("Approximate Medians and other Quantiles in One Pass with Limited Memory")

INPUTS: 

x: Array containing the input data to be processed.

cfld: two options - (1) NULL: All elements of x are processed.
(2) non-NULL: Array of same size as x. Acts as a select vector (only those elements with non-zero values in cfld are processed). ex: If x has 10 elements and cfld is {0,0,1,0,0,0,1,0,1,0}, then only the 3rd, 7th and 9th element are chosen for processing.

siz: Number of elements in the input array x. 

num_quantiles: Number of quantiles that have to be calculated (1 <= num_quantiles <= siz).  For ex: num_quantiles = 100 implies you need quantile queries every 1% from 1% to 100%, num_quantiles = 200 implies you need quantile queries every 0.5% from 0.5% to 100% and so on (try 100 if you don't know what to use).

eps: Acceptable error in the calculated quantiles (0 <= eps <= 1). For example, eps = 0.001 would imply +/- 0.1%, eps = 0.0001 would imply +/- 0.01% (try 0.001 if you don't know what to use).

y_siz: Number of integers that can be written in y (preallocated memory). Has to be at least num_quantiles (i.e., y must occupy at least num_quantiles*sizeof(int) bytes).

OUTPUTS: 

y: Array where the quantile summary is going to be stored. The caller must allocate it beforehand with room for y_siz integers (i.e., at least num_quantiles*sizeof(int) bytes).

ptr_estimate_is_good: Pointer to the location with values 1, -1 or -2, which stand for the following
1: For the given inputs siz and eps, approximate quantile calculations are possible. The computations are done and results are stored in y.
-1: Something wrong with the inputs.
-2: For the given inputs siz and eps, approximate quantile calculations are not possible within the memory constraints. Retry with: 
(i) a higher value for MAX_SZ defined in this function if you know you have more RAM available 
(ii)a higher value for eps (i.e., more approximation) so that the computations can be done in memory 

status: takes values -1 or 0
0: The algorithm either computed the quantiles and set *ptr_estimate_is_good to 1 or the computations are not possible for the given eps and siz and set *ptr_estimate_is_good to -2.
-1: Something wrong with the inputs, *ptr_estimate_is_good is set to -1.
 
*/

//----------------------------------------------------------------------------
{

  int status = 0;
  
  int flag = 0; /* used to assist freeing of mallocs */

  *ptr_estimate_is_good = -1; /* default */

  /* Check inputs */
  if ( x == NULL ) { go_BYE(-1); }
  if ( y == NULL ) { go_BYE(-1); }
  if ( ( eps < 0 )  || ( eps > 1 ) ) { go_BYE(-1); }
  if ( num_quantiles < 1 ) { go_BYE(-1); }
  if ( siz <= 0 ) { go_BYE(-1); }
  if ( y_siz < num_quantiles ) { go_BYE(-1); } /* insufficient memory to write output */

  long long eff_siz = 0; /* number of entries to be considered */
  if ( cfld == NULL ) { eff_siz = siz; }
  else {

    for ( long long ii = 0; ii < siz; ii++ ) {
      if ( cfld[ii] == 0 ) { continue; }
      eff_siz++;
    }

    if ( eff_siz == 0 ) { go_BYE(-1); } /* cfld has all 0 entries */

  }

  //-------------------------------------------------------------------------

  /* "buffer" is a 2d array containing b buffers, each of size k. Each of these b buffers have a weight assigned to them, which will be stored in "weight" array of size b. Consider the following way of viewing the 2d buffer array: each element in a buffer "effectively" occurs as many times as it's corresponding weight in the weight array. The algorithm  compresses the whole input data into these buffers by using "approximate" entries instead of actual entries so that the total number of distinct entries comes down significantly (uses a total memory of ~ b*k, which is typically << eff_siz, the price paid being approximation). This approximation is done intelligently so that very good and useful theoretical quantile guarantees can be provided */

 
  int b;
  long long k; 
  status = determine_b_k(eps, eff_siz, &b, &k);  cBYE(status);
  /* estimates b and k for the given eps and eff_siz */
  
  int NUM_THREADS; 
  /* explained in the next section, mainly to allow parallelizable computations to be done in parallel */

  if ( b <= 0 || k <= 0 ) {
    *ptr_estimate_is_good = -1;
    go_BYE(-1); /* Something wrong with the inputs eps or siz */ 
  }
  else if ( (b+1+10)*k > MAX_SZ ) {

    /* (b+1+10)*k a good upper bound of the memory requirements */
    *ptr_estimate_is_good = -2; 
    go_BYE(0);
    /* Quitting if too much memory needed. Retry by doing one or more of the following: 
     (i) Increase MAX_SZ if you think you have more RAM 
     (ii) Increase eps (the approximation percentage) so that computations can be done within RAM
    */
  } 
  else {
    *ptr_estimate_is_good = 1;

    NUM_THREADS = 128;
    while ( (b+NUM_THREADS+10)*k > MAX_SZ ) { NUM_THREADS = NUM_THREADS/2; }
    /* adapting NUM_THREADS to meet memory requirements */
    
  }

  int **buffer = NULL;         
  int *weight = NULL;  

  flag = 1; /* buffer and weight defined */

  int no_of_empty_buffers = b; /* no of free buffers in the 2d buffer array*/

  buffer      = malloc( b * sizeof(int *) ); 
  return_if_malloc_failed(buffer);

  weight      = malloc( b * sizeof(int) ); 
  return_if_malloc_failed(weight);
#ifdef IPP
  ippsZero_32s((int *)weight,b);
#else
  assign_const_I4(weight,b,0);
#endif

  for ( int ii = 0; ii < b; ii++ ) {
    buffer[ii] = (int *) malloc( k * sizeof(int) );
  }

  flag = 2; /* buffer[ii] defined for ii = 0 to b-1 */
  
  for ( int ii = 0; ii < b; ii++ ) {
    return_if_malloc_failed(buffer[ii]);
#ifdef IPP
    ippsZero_32s((int *)buffer[ii], k);
#else
    assign_const_I4(buffer[ii], k, 0); 
#endif

  } 

  //--------------------------------------------------------------------------

  /* The Munro-Patterson algorithm assumes that the incoming data is in the form of packets of size k with sorted data. Since the sorting has to be done within each packet separately, we can parallelize this step as follows: we divide the incoming data into blocks of size  NUM_THREADS*k (so that NUM_THREADS threads can be generated for each block and sorted separately in parallel using cilkfor). */

  /* "inputPacket" is a 2d array of size NUM_THREADS * k: stores and sorts packets belonging to the same block in parallel using cilkfor. Since the last packet might be incomplete, it will be dealt with separately, using "lastPacket" if k does not divide eff_siz. Variable last_packet_incomplete will be used to keep track of this */

  int **inputPackets = NULL; 
  int *lastPacket = NULL;
  long long * inputPacketsUsedSiz = NULL;

  flag = 3; /* inputPackets, inputPacketsUsedSiz and lastPacket defined */

  inputPackets = malloc( NUM_THREADS * sizeof(int *) );
  return_if_malloc_failed(inputPackets); 
  
  inputPacketsUsedSiz = malloc( NUM_THREADS * sizeof(long long) );
  return_if_malloc_failed(inputPacketsUsedSiz);

  for ( long long ii = 0; ii < NUM_THREADS; ii++) {
    inputPacketsUsedSiz[ii] = 0;
  }

  lastPacket = (int *)malloc ( k * sizeof(int));
  return_if_malloc_failed(lastPacket);
#ifdef IPP
  ippsZero_32s((int *)lastPacket,k);
#else
  assign_const_I4(lastPacket,k,0);
#endif

  long long lastPacketUsedSiz = 0;
  
  for ( int ii = 0; ii < NUM_THREADS; ii++ ) {
    inputPackets[ii] =  (int *) malloc( k * sizeof(int) );
  }

  flag = 4; /* inputPackets[ii] defined for ii = 0 to NUM_THREADS-1 */

  for ( int ii = 0; ii < NUM_THREADS; ii++ ) {
    return_if_malloc_failed(inputPackets[ii]);
#ifdef IPP
    ippsZero_32s((int *)inputPackets[ii],k);
#else
    assign_const_I4(inputPackets[ii],k,0);
#endif
  }

  //---------------------------------------------------------------------------
  
  long long current_loc_in_x = 0; /* start of input data */
  int last_packet_incomplete = 0; 

  /* Do the following for each block of data */
  while ( current_loc_in_x < siz ) { 

    /* A block of data (containing NUM_THREADS packets, i.e., NUM_THREADS * k integers) is processed inside this loop. For each packet, the following operations are done:
     (1): Sort the packet (can be done in parallel using cilkfor)
     (2): Check if a buffer in the 2d buffer array is free (i.e., some buffer has weight = 0 )
     (3): If yes, copy the packet to a free buffer in the buffer array using New() function. Else, use Collapse() function to merge two buffers in the buffer array which have the same weight and free up one buffer in the process (and copy the packet to that buffer)
    */

    if ( cfld == NULL || eff_siz == siz ) {
      //---------------------------------------------------------------------
      /* considering all input data */

      for ( long long ii = 0; ii < NUM_THREADS; ii++) {
	inputPacketsUsedSiz[ii] = 0;
      }
      
      cilkfor ( int tid = 0; tid < NUM_THREADS; tid++ ) {

	long long lb = current_loc_in_x + tid *k;
	long long ub = lb + k;
	if ( lb >= siz ) { continue; }
	if ( ub >= siz ) { 

	  ub = siz;  
	  if ( (ub-lb) != k ) {
	    /* this happens when last packet is incomplete */
	    memcpy(lastPacket, x+lb, (ub-lb)*sizeof(int));
	    lastPacketUsedSiz = (ub-lb);
	    last_packet_incomplete = 1;
	  }
	  else {
	    /* last packet is also complete: eff_siz is multiple of k */
	    memcpy(inputPackets[tid], x+lb, (ub-lb)*sizeof(int));
	    inputPacketsUsedSiz[tid] = (ub-lb);
	  }
	  continue;
	 
	}

	memcpy(inputPackets[tid], x+lb, (ub-lb)*sizeof(int));
	inputPacketsUsedSiz[tid] = (ub-lb);

      }

      for ( int tid = 0; tid < NUM_THREADS; tid++ ) {
	current_loc_in_x += inputPacketsUsedSiz[tid];
      }
      current_loc_in_x += lastPacketUsedSiz;
      //---------------------------------------------------------------------
    }

    else {
      //--------------------------------------------------------------------
      /* NOTE: if cfld input is non-null, it means we are not interested in all the elements. In every iteration, we keep filling inputPackets buffer with only those data we are interested in using the helper variable "current_loc_in_x". */
      
      int tid = 0;
      for ( long long ii = 0; ii < NUM_THREADS; ii++ ) {
	inputPacketsUsedSiz[ii] = 0;
      }
      
      while ( current_loc_in_x < siz  && tid < NUM_THREADS ) {

	if ( cfld[current_loc_in_x] == 0 ) { current_loc_in_x++; }
	else {
	  inputPackets[tid][inputPacketsUsedSiz[tid]]=x[current_loc_in_x];
	  current_loc_in_x++; inputPacketsUsedSiz[tid]++;
	  if ( inputPacketsUsedSiz[tid] == k ) { tid++; }
	}

      }

      if ( current_loc_in_x == siz ) {
	
	for ( int ii = 0; ii <= NUM_THREADS; ii++ ) {
	  if ( inputPacketsUsedSiz[tid]!=0 && inputPacketsUsedSiz[tid]!=k ) { 
	    last_packet_incomplete = 1; 
	    memcpy(lastPacket, inputPackets[tid], inputPacketsUsedSiz[tid]*sizeof(int));
	    lastPacketUsedSiz = inputPacketsUsedSiz[tid];
	    inputPacketsUsedSiz[tid] = 0;
	    break; 
	  }
	}

      }

      //--------------------------------------------------------------------
    }

    /* Step (1) done here in parallel using cilkfor */
    cilkfor ( int tid = 0; tid < NUM_THREADS; tid++ ) { 

      if ( inputPacketsUsedSiz[tid] != k ) { continue; }

#ifdef IPP
      ippsSortAscend_32s_I(inputPackets[tid], inputPacketsUsedSiz[tid]);
#else
      qsort_asc_I4(inputPackets[tid], inputPacketsUsedSiz[tid], sizeof(int), NULL);
#endif 

    }
  
    /* Steps (2) and (3) of the algorithm done here */
    for ( int tid = 0; tid < NUM_THREADS; tid++ ) {

      if ( inputPacketsUsedSiz[tid] != k ) { continue; }

      if ( no_of_empty_buffers == 0 ) {

	/* if no free buffer available in the 2d buffer array , merge data in 2 buffers having same weight into one of them using Collapse() and free up other */
	bool found = false;
	int bufidx1 = -1, bufidx2 = -1; 
	for ( int ii = 0; ii < b-1; ii++ ) { 
	  for ( int jj = ii+1; jj < b; jj++ ) {
	    if ( weight[ii] == weight[jj] && weight[ii] > 0 ) { 
	      bufidx1 = ii; bufidx2 = jj;
	      found = true;
	      break;
	    }
	  }
	  if ( found == true ) { break; }
	} /* find 2 buffers with same corresponding weight in the weight array */
	status = Collapse(buffer[bufidx1],buffer[bufidx2], weight, bufidx1, bufidx2, b, k);  /* Merge buffer numbers [bufidx1] and [bufidx2] */
	if ( status == -1 ) { 
	  *ptr_estimate_is_good = -1;
	  go_BYE(-1);  /* something fundamentally wrong */
	}
	no_of_empty_buffers++;

      }

      bool found = false;
      int bufidx1 = -1;
      for ( int ii = 0; ii < b; ii++ ) {
	if ( weight[ii] == 0 ) {
	  no_of_empty_buffers--;
	  found = true;
	  bufidx1 = ii;
	  break;
	}
      } /* find a free buffer (corresponding weight = 0 in the weight array) */
      status = New(inputPackets[tid],buffer[bufidx1],weight,1,bufidx1,b,k);
      if ( status == -1 ) {
	*ptr_estimate_is_good = -1;
	go_BYE(-1);  /* something fundamentally wrong */
      } 
      /* Copy current input packet into a free buffer in the 2d buffer array*/


    }

  }
/*
 * Maps one bufDiff value to a compensation magnitude:
 * |ILOG2 * log(diff + EPS)| + 0.5 truncated to an integer, capped at MAX_COMP.
 */
static Ipp32s
sbrencCompFromDiff(Ipp32f diff)
{
  Ipp32s comp = (Ipp32s) (fabs(ILOG2*log(diff+EPS)) + 0.5f);

  return IPP_MIN( comp, MAX_COMP );
}

/*
 * Computes the compensation vector bufComp[0..nBand-1].
 *
 * bufComp is cleared first. Then, for every band flagged in bs_add_harmonic,
 * the largest bufT value over estimates 0..totNoEst-1 and bins
 * pFreqTab[band]..pFreqTab[band+1]-1 is located, and the neighbouring bands
 * (band-1 gets a negative value, band+1 a positive one) may receive a
 * compensation derived from bufDiff at the estimate where the peak was found.
 *
 * bufT            - indexed [estimate][bin]
 * bufDiff         - indexed [estimate][band]
 * pFreqTab        - band borders; entry band+1 is read for every flagged band
 * nBand           - number of bands
 * bs_add_harmonic - per-band flags; only non-zero bands are processed
 * bufComp         - output, nBand entries
 * totNoEst        - number of estimates scanned
 *
 * Always returns 0.
 */
static Ipp32s
sbrencCalcCompensation(Ipp32f bufT[][64],
                       Ipp32f bufDiff[][64],
                       Ipp32s* pFreqTab,
                       Ipp32s nBand,
                       Ipp32s* bs_add_harmonic,
                       Ipp32s* bufComp,
                       Ipp32s totNoEst)

{
  Ipp32s band;

  ippsZero_32s(bufComp, nBand);

  for (band = 0; band < nBand; band++) {
    Ipp32s lo, hi;            /* first bin of the band / first bin past it */
    Ipp32s est, bin;
    Ipp32s peakBin = 0;
    Ipp32s peakEst = 0;
    Ipp32f peakVal = 0;
    Ipp32f thres;
    Ipp32s comp;

    if (bs_add_harmonic[band] == 0) {
      continue;
    }

    /* a missing sine has been flagged for this band */
    lo = pFreqTab[band];
    hi = pFreqTab[band+1];

    /* locate the largest bufT value inside the band over all estimates */
    for (est = 0; est < totNoEst; est++) {
      for (bin = lo; bin < hi; bin++) {
        if (bufT[est][bin] > peakVal) {
          peakVal = bufT[est][bin];
          peakBin = bin;
          peakEst = est;
        }
      }
    }

    /* peak sits on the lower border: look at the band below */
    if (band > 0 && peakBin == lo) {

      comp = sbrencCompFromDiff(bufDiff[peakEst][band - 1]);

      if (!bs_add_harmonic[band-1] &&
          peakBin >= 1 &&
          bufT[peakEst][peakBin - 1] > TONALITY_QUOTA*bufT[peakEst][peakBin]) {
        bufComp[band-1] = -comp;
      }
    }

    /* peak sits on the upper border: look at the band above */
    if (peakBin == hi - 1 && band + 1 < nBand) {

      comp = sbrencCompFromDiff(bufDiff[peakEst][band + 1]);

      if (!bs_add_harmonic[band+1] &&
          bufT[peakEst][peakBin + 1] > TONALITY_QUOTA*bufT[peakEst][peakBin]) {
        bufComp[band+1] = comp;
      }
    }

    /* interior band: both neighbours exist */
    if (band > 0 && band < nBand - 1) {

      /* neighbour below */
      comp = sbrencCompFromDiff(bufDiff[peakEst][band - 1]);

      /* kept as two steps so the product is stored before scaling */
      thres = bufDiff[peakEst][band]*bufDiff[peakEst][band-1];
      thres *= DIFF_QUOTA;

      if (thres < 1.0f) {
        bufComp[band-1] = -comp;
      }

      /* neighbour above */
      comp = sbrencCompFromDiff(bufDiff[peakEst][band + 1]);

      thres = bufDiff[peakEst][band]*bufDiff[peakEst][band+1];
      thres *= DIFF_QUOTA;

      if (thres < 1.0f) {
        bufComp[band+1] = comp;
      }
    }
  }

  return 0; /* OK */
}
/*
 * Runs the sinusoid detector over a range of estimates and merges the
 * per-estimate results into bs_add_harmonic[0..nBand-1].
 *
 * Steps, in order:
 *  1. bs_add_harmonic is cleared.
 *  2. When newDetectionAllowed is non-zero, the Diff/Orig guide buffers of
 *     pGuideState[0] are copied into pGuideState[noEstPerFrame], the Detect
 *     guide buffer of pGuideState[noEstPerFrame-1] is cleared, and
 *     processing starts at estimate noEstPerFrame instead of 0.
 *  3. For each estimate, the previous detection row is copied into this
 *     estimate's guide, the current detection row and the next guide state
 *     are cleared, and sbrencDetectionSinEst() is called.
 *  4. When newDetectionAllowed is non-zero, sbrencTransientCorrection() is
 *     applied.
 *  5. The detection rows of all processed estimates are OR-ed per band.
 *  6. When newDetectionAllowed is zero, bands set now but not set in
 *     prev_bs_add_harmonic are cleared again.
 *
 * Always returns 0.
 */
static Ipp32s
sbrencTotalDetectionSinEst(Ipp32f bufT[][64],
                           Ipp32f bufDiff[][64],
                           Ipp32s nBand,
                           Ipp32s* pFreqTab,
                           Ipp32f bufSfmOrig[][64],
                           Ipp32f bufSfmSBR[][64],
                           Ipp32s bufDetection[][64],

                           Ipp32s* prev_bs_add_harmonic,
                           sSBRGuideData* pGuideState,

                           Ipp32s noEstPerFrame,
                           Ipp32s totNoEst,
                           Ipp32s newDetectionAllowed,
                           Ipp32s* bs_add_harmonic)
{
  Ipp32s firstEst = 0;   /* first estimate to process */
  Ipp32s idx;            /* estimate index */
  Ipp32s sfb;            /* band index */
  Ipp32s nextGuide;      /* guide state handed over to the next estimate */

  ippsZero_32s(bs_add_harmonic, nBand);

  /* refresh the guide buffers when a new detection may start */
  if (newDetectionAllowed) {

    firstEst = noEstPerFrame;

    ippsCopy_32f(pGuideState[0].bufGuideDiff, pGuideState[noEstPerFrame].bufGuideDiff, nBand);
    ippsCopy_32f(pGuideState[0].bufGuideOrig, pGuideState[noEstPerFrame].bufGuideOrig, nBand);

    ippsZero_32s(pGuideState[noEstPerFrame-1].bufGuideDetect, nBand);
  }

  for (idx = firstEst; idx < totNoEst; idx++) {

    if (idx > 0) {
      ippsCopy_32s(bufDetection[idx-1], pGuideState[idx].bufGuideDetect, nBand);
    }

    ippsZero_32s(bufDetection[idx], nBand);

    /* the last estimate has no successor, so it reuses its own slot */
    nextGuide = idx;
    if (idx < totNoEst-1) {
      nextGuide = idx + 1;
    }

    ippsZero_32f(pGuideState[nextGuide].bufGuideDiff,   nBand);
    ippsZero_32f(pGuideState[nextGuide].bufGuideOrig,   nBand);
    ippsZero_32s(pGuideState[nextGuide].bufGuideDetect, nBand);

    /* main detection algorithm */
    sbrencDetectionSinEst(bufT[idx],
                          bufDiff[idx],
                          bufDetection[idx],
                          pFreqTab,
                          nBand,
                          bufSfmOrig[idx],
                          bufSfmSBR[idx],
                          pGuideState[idx],
                          pGuideState[nextGuide]);
  }

  /* additional step, performed only when a new detection is allowed */
  if (newDetectionAllowed) {

    sbrencTransientCorrection(bufT,
                              bufDetection,
                              pFreqTab,
                              nBand,
                              pGuideState[noEstPerFrame],
                              firstEst,
                              totNoEst);
  }

  /* final decision: a band is set if any processed estimate detected it */
  for (sfb = 0; sfb < nBand; sfb++) {
    for (idx = firstEst; idx < totNoEst; idx++) {
      bs_add_harmonic[sfb] = (bs_add_harmonic[sfb] || bufDetection[idx][sfb]);
    }
  }

  /* without permission for new detections, drop bands not present before */
  if (!newDetectionAllowed) {
    for (sfb = 0; sfb < nBand; sfb++) {
      if ((bs_add_harmonic[sfb] - prev_bs_add_harmonic[sfb]) > 0) {
        bs_add_harmonic[sfb] = 0;
      }
    }
  }

  return 0; /* OK */
}
/*
 * Transient correction: when two adjacent bands are both flagged as
 * containing a detected sine and their peaks lie less than two bins apart,
 * only the band with the larger peak keeps its detection.
 *
 * bufT         - indexed [estimate][bin]
 * bufDetection - indexed [estimate][band]; the losing band is cleared for
 *                every estimate in start..stop-1
 * pFreqTab     - band borders; entries i, i+1 and i+2 are read for band i,
 *                so it presumably holds nBand+1 entries (confirm with caller)
 * nBand        - number of bands
 * state        - guide data whose entries for the losing band are zeroed
 * start, stop  - half-open range of estimates to examine
 *
 * Always returns 0.
 *
 * NOTE(review): `state` is received by value. If the bufGuide* members of
 * sSBRGuideData are arrays stored inside the struct (rather than pointers),
 * the writes to state.bufGuide* below only modify this local copy and never
 * reach the caller - confirm against the struct definition.
 */
static Ipp32s
sbrencTransientCorrection(Ipp32f bufT[][64],
                          Ipp32s bufDetection[][64],
                          Ipp32s* pFreqTab,
                          Ipp32s nBand,
                          sSBRGuideData state,
                          Ipp32s start,
                          Ipp32s stop)
{
  Ipp32f maxVal1, maxVal2;
  Ipp32s maxPos1, maxPos2;

  /* per-estimate peak reported by ippsMaxIndx_32f; the index is relative
     to the first element of the sub-array that was searched */
  Ipp32f curVal = 0;
  Ipp32s curPos = 0;

  Ipp32s i, est;
  Ipp32s iStart, iEnd, iStart2, iEnd2;
  Ipp32s criterion;
#if !defined(ANDROID)
  Ipp32s bs_add_harmonic[MAX_NUM_FREQ_COEFFS];
#else
  static Ipp32s bs_add_harmonic[MAX_NUM_FREQ_COEFFS];
#endif

  ippsZero_32s(bs_add_harmonic, MAX_NUM_FREQ_COEFFS);

  /* merge the detections of all examined estimates (bands 0..nBand-2) */
  for(est = start; est < stop; est++){

    for(i=0;i<nBand-1;i++){

      bs_add_harmonic[i] = bs_add_harmonic[i] || bufDetection[est][i];
    }
  }

  for(i = 0; i < nBand-1; i++){

    /* only pairs of adjacent flagged bands are of interest */
    criterion = bs_add_harmonic[i] && bs_add_harmonic[i+1];
    if ( !criterion ) continue;

    /* band i: seed the running peak with its first bin at estimate start */
    iStart = pFreqTab[i];
    iEnd   = pFreqTab[i+1];

    maxPos1 = iStart;
    maxVal1 = bufT[start][iStart];

    /* band i+1: same seeding */
    iStart2 = pFreqTab[i+1];
    iEnd2   = pFreqTab[i+2];

    maxPos2 = iStart2;
    maxVal2 = bufT[start][iStart2];

    /*
     * Track the peak over ALL estimates, as an absolute bin index.
     * Each estimate's peak goes into temporaries and replaces the running
     * peak only when strictly larger; the relative index is turned into an
     * absolute bin by adding the band start. Writing the ippsMaxIndx_32f
     * result straight into maxVal and maxPos would keep only the last
     * estimate's peak and would mix relative indices with the absolute seeds.
     */
    for(est = start; est < stop; est++){

      if ( iEnd - iStart > 1 ) {
        ippsMaxIndx_32f(bufT[est] + iStart, iEnd - iStart, &curVal, &curPos);
        if ( curVal > maxVal1 ) {
          maxVal1 = curVal;
          maxPos1 = iStart + curPos;
        }
      }

      if ( iEnd2 - iStart2 > 1 ) {
        ippsMaxIndx_32f(bufT[est] + iStart2, iEnd2 - iStart2, &curVal, &curPos);
        if ( curVal > maxVal2 ) {
          maxVal2 = curVal;
          maxPos2 = iStart2 + curPos;
        }
      }
    }

    /* peaks less than two bins apart: keep only the stronger band */
    if(maxPos2 < 2 + maxPos1){

      if(maxVal1 - maxVal2 > 0){

        /* band i wins, drop band i+1 */
        state.bufGuideDetect[i+1] = 0;
        state.bufGuideOrig[i+1]     = 0;
        state.bufGuideDiff[i+1]     = 0;

        for(est = start; est < stop; est++){
          bufDetection[est][i+1] = 0;
        }
      } else {

        /* band i+1 wins (or ties), drop band i */
        state.bufGuideDetect[i] = 0;
        state.bufGuideOrig[i]     = 0;
        state.bufGuideDiff[i]     = 0;

        for(est = start; est < stop; est++){
          bufDetection[est][i] = 0;
        }
      }
    }
  }

  return 0;/* OK */
}
Пример #5
0
//---------------------------------------------------------------
// START FUNC DECL
int 
count(
      char *src_tbl,
      char *fk_dst,
      char *cfld,
      char *dst_tbl,
      char *cnt_fld
      )
// STOP FUNC DECL
{
  int status = 0;
  char *X = NULL;    size_t nX = 0;
  char *op_X = NULL; size_t n_op_X = 0;
  char *cfld_X = NULL; size_t cfld_nX = 0;


  TBL_REC_TYPE src_tbl_rec; int src_tbl_id; 
  TBL_REC_TYPE dst_tbl_rec; int dst_tbl_id; 

  FLD_REC_TYPE fk_dst_rec; int fk_dst_id; 
  FLD_REC_TYPE nn_fk_dst_rec; int nn_fk_dst_id; 

  FLD_REC_TYPE cfld_rec; int cfld_id;
  FLD_REC_TYPE nn_cfld_rec; int nn_cfld_id;

  FLD_REC_TYPE cnt_fld_rec; int cnt_fld_id; 

  char opfile[MAX_LEN_FILE_NAME+1];

  long long src_nR, dst_nR;
#define MAX_LEN 32
  char str_dst_nR[MAX_LEN];
  int **partial_counts = NULL; int nT = 0;

  //----------------------------------------------------------------
  if ( ( src_tbl == NULL ) || ( *src_tbl == '\0' ) ) { go_BYE(-1); }
  if ( ( fk_dst == NULL ) || ( *fk_dst == '\0' ) ) { go_BYE(-1); }
  if ( ( dst_tbl == NULL ) || ( *dst_tbl == '\0' ) ) { go_BYE(-1); }
  if ( ( cnt_fld == NULL ) || ( *cnt_fld == '\0' ) ) { go_BYE(-1); }
  zero_string(str_dst_nR, MAX_LEN);
  zero_string(opfile, (MAX_LEN_FILE_NAME+1));
  //--------------------------------------------------------
  status = is_tbl(dst_tbl, &dst_tbl_id, &dst_tbl_rec); cBYE(status);
  chk_range(dst_tbl_id, 0, g_n_tbl);
  dst_nR = g_tbls[dst_tbl_id].nR;
  if ( dst_nR >= INT_MAX ) { go_BYE(-1); }

  status = is_tbl(src_tbl, &src_tbl_id, &src_tbl_rec); cBYE(status);
  chk_range(src_tbl_id, 0, g_n_tbl);
  src_nR = g_tbls[src_tbl_id].nR;
  if ( src_nR >= INT_MAX ) { go_BYE(-1); }

  status = is_fld(NULL, src_tbl_id, fk_dst, &fk_dst_id, &fk_dst_rec, 
		  &nn_fk_dst_id, &nn_fk_dst_rec); 
  cBYE(status);
  chk_range(fk_dst_id, 0, g_n_fld);

  status = get_data(fk_dst_rec, &X, &nX, 0); cBYE(status);
  if ( nn_fk_dst_id >= 0 ) { 
    fprintf(stderr, "NOT IMPLEMENTED\n"); go_BYE(-1); 
  }

  if ( ( cfld != NULL ) && ( *cfld != '\0' ) ) { 
    status = is_fld(NULL, src_tbl_id, cfld, &cfld_id, &cfld_rec, 
		    &nn_cfld_id, &nn_cfld_rec); 
    if ( cfld_id >= 0 ) { 
      if ( cfld_rec.fldtype != I1 ) { go_BYE(-1); }
      if ( nn_cfld_id >= 0 ) { go_BYE(-1); }
    }
    status = get_data(cfld_rec, &cfld_X, &cfld_nX, 0); cBYE(status);
  }
  //------------------------------------------------------
  int ddir_id = INT_MAX;
  status = mk_temp_file(opfile, (dst_nR * sizeof(int)), &ddir_id); cBYE(status);
  status = q_mmap(ddir_id, opfile, &op_X, &n_op_X, true); cBYE(status);
  int *cntI4 = (int *)op_X;
  for ( int i = 0; i < dst_nR ; i++ ) {
    cntI4[i] = 0;
  }
  //------------------------------------------------------
  if ( dst_nR > INT_MAX ) { go_BYE(-1); } /* required by count_In */

  bool is_sequential = true;
  if ( dst_nR > 1048576 ) {
    fprintf(stderr, "Count(%s)  = %lld > 1048576. Use another algorithm\n", 
	    dst_tbl, dst_nR); go_BYE(-1);
  }
  // TODO: Need to adjust parallelism better than current hack
  if ( ( dst_nR <= 32768 ) && ( src_nR > 1048576 ) ) {
    is_sequential = false;
  }
  // Initialize counters to 0 
#ifdef IPP
  ippsZero_32s(cntI4, dst_nR); 
#else
  for ( int i = 0; i < dst_nR; i++ ) { cntI4[i] = 0; }
#endif

  //  TODO: Parallelism does not seem to provide any speedup at all
//  is_sequential = true; /* TODO P1: parallel version taking longer !!! */
//  fprintf(stderr, "forcing sequential execution in count() \n");

  int max_num_chunks = g_num_cores; 
  long long min_block_size = 8192, block_size; 
  if ( is_sequential == false ) { 
    status = partition(src_nR, min_block_size, max_num_chunks, &block_size, &nT);
    cBYE(status);
    partial_counts = (int **)malloc(nT * sizeof(int *));
    return_if_malloc_failed(partial_counts);
    for ( int tid = 0; tid < nT; tid++ ) {
      partial_counts[tid] = (int *)malloc(dst_nR * sizeof(int));
      return_if_malloc_failed(partial_counts[tid]);
    }
  }
//  fprintf(stderr, "nT         = %d \n", nT);
//  fprintf(stderr, "block_size = %lld \n", block_size);
//  fprintf(stderr, "src_nR     = %lld \n", src_nR);
  
  if ( cfld_id >= 0 ) {
    if ( is_sequential ) { 
      switch ( fk_dst_rec.fldtype ) {
      case I1 : 
	status = count_nn_I1((char *)X, src_nR, cfld_X, cntI4, dst_nR); 
	cBYE(status); 
	break;
      case I2 : 
	status = count_nn_I2((short *)X, src_nR, cfld_X, cntI4, dst_nR); 
	cBYE(status); 
	break;
      case I4 : 
	status = count_nn_I4((int *)X, src_nR, cfld_X, cntI4, dst_nR); 
	cBYE(status); 
	break;
      case I8 : 
	status = count_nn_I8((long long *)X, src_nR, cfld_X, cntI4, dst_nR); 
	cBYE(status); 
	break;
      default : 
	go_BYE(-1);
	break;
      }
    }
    else {
      cilkfor ( int tid = 0; tid < nT; tid++ ) {
	// Initialize counts to 0 
	int *partial_counts_t = partial_counts[tid];
	// Initialize counters to 0 
#ifdef IPP
	ippsZero_32s(partial_counts_t, dst_nR); 
#else
	assign_const_I4(partial_counts_t, 0, dst_nR);
#endif
	long long lb = block_size * tid;
	long long ub = lb + block_size;
	if ( tid == (nT-1) ) { ub = src_nR; }
        char      *inI1 = (char      *)X; inI1 += lb;
        short     *inI2 = (short     *)X; inI2 += lb;
        int       *inI4 = (int       *)X; inI4 += lb;
        long long *inI8 = (long long *)X; inI8 += lb;
	long long t_src_nR = ub - lb;
	switch ( fk_dst_rec.fldtype ) {
	case I1 : 
	  status = count_nn_I1(inI1, t_src_nR, cfld_X, partial_counts_t, dst_nR); 
	  cBYE(status); 
	  break;
	case I2 : 
	  status = count_nn_I2(inI2, t_src_nR, cfld_X, partial_counts_t, dst_nR); 
	  cBYE(status); 
	  break;
	case I4 : 
	  status = count_nn_I4(inI4, t_src_nR, cfld_X, partial_counts_t, dst_nR); 
	  cBYE(status); 
	  break;
	case I8 : 
	  status = count_nn_I8(inI8, t_src_nR, cfld_X, partial_counts_t, dst_nR); 
	  cBYE(status); 
	  break;
	default : 
	  go_BYE(-1);
	  break;
	}
      }
    }
  }
  else {
    if ( is_sequential ) { 
Пример #6
0
// START FUNC DECL
int 
approx_frequent (
		 int * x, 
		 char * cfld,
		 long long siz, 
		 long long min_freq, 
		 long long err, 
		 int * y, 
		 int * f, 
		 long long out_siz,
		 long long * ptr_len,
		 int * ptr_estimate_is_good
		 )
// STOP FUNC DECL
//-----------------------------------------------------------------------------
/* README: 

status = approx_frequent(x,cfld,siz,min_freq,err,y,f,out_siz,ptr_len,ptr_estimate_is_good) : The algorithm takes as input an array of integers, and lists out the "frequent" elements in the set approximately, where "frequent" elements are defined as elements occuring greater than or equal to "min_freq" number of times in the input. The approximated output has the following properties: 

(1) all elements in x occuring greater than or equal to min_freq number of times  will definitely be listed in y (THESE ARE THE FREQUENT ELEMENTS (definition) )
(2) their corresponding frequency in f will be greater than or equal to (min_freq-err), i.e., the maximum error in estimating their frequencies is err.
(3) no elements in x occuring less than (min_freq-err) number of times will be listed in y

The approximation is two fold: 
(i) the estimated frequencies of the "frequent" elements can be off by a maximum of err.
(ii) elements occuring between (min_freq-err) and (min_freq) number of times can also be listed in y.


For example: say min_freq = 500 and err = 100.  y will contain the id of all the elements occuring >= 500 definitely, and their corresponding estimated frequency in f would definitely be >= (500-100) = 400. No element in x which occurs less than 400 times will occur in y. Note that elements with frequency between 400 and 500 "can" be listed in y.

Author: Kishore Jaganathan

Algorithm: FREQUENT algorithm (refer to Cormode's paper "Finding Frequent Items in Data Streams")

NOTE: This implementation is a slight variant of the algorithm mentioned in the paper, so that some steps can be parallelized. 

INPUTS: 

x: The input array 

cfld: two options - (1) NULL: All elements of x are processed.
(2) non-NULL: Array of same size as x. Acts as a select vector (only those elements with non-zero values in cfld are processed). ex: If x has 10 elements and cfld is {0,0,1,0,0,0,1,0,1,0}, then only the 3rd, 7th and 9th element are chosen for processing.

siz: Number of elements in the input array x

min_freq: elements occuring greater than or equal to min_freq times in x (among the ones selected for processing) are considered frequent elements. All of their id's will definitely be stored in y.

err: the measured frequencies of the "frequent" elements in x (i.e., occuring >= min_freq times in x, among the ones selected for processing) will definitely be greater than or equal to min_freq-err, and will be stored in f (corresponding to the id stored in y). Also, no element with frequency lesser than (min_freq-err) in x (among the ones selected for processing) will occur in y. Note: Lesser the error, more memory is needed for computation

out_siz: number of integers that can be written in y and f (prealloced memory). See y and f for how much to allocate.


OUTPUTS:

y: array containing the id's of the "frequent" elements. Need to malloc beforehand by atleast (number of elements to be processed)/(min_freq-err) * sizeof(int). If cfld is NULL, number of elements to be processed is siz, else it is equal to the number of non-zero entries in cfld.

f: array containing the corresponding frequencies of the "frequent" elements. Need to malloc beforehand by atleast (number of elements to be processed)/(min_freq-err) * sizeof(int). If cfld is NULL, number of elements to be processed is siz, else it is equal to the number of non-zero entries in cfld.

out_siz: number of integers that can be written in y and f (prealloced memory). See y and f for how much to allocate.

ptr_len: the size of y and f used by the algorithm to write the ids and frequencies of estimated approximate "frequent" elements

ptr_estimate_is_good: pointer to a location which stores 1, -1, -2 or -3
1: approximate calculations were successful, results stored in y,f and ptr_len
-1: something wrong with the input data. Check if sufficient malloc was done beforehand to y and f, in case you forgot.
-2: need too much memory, hence didn't do the calculations. Can retry with one of the following two things : (i) increase MAX_SZ if you are sure you have more RAM available (ii) increase err (the approximation parameter). Increasing err will result in more approximation (hence answer being less accurate) but memory requirements will be lesser.

status: will return 0 or -1
0: two cases - (i) calculations are successful, ptr_estimate_is_good will be set to 1 (ii) need too much memory and hence didn't do the calculations, ptr_estimate_is_good will be set to -2.
-1: Something wrong with inputs, ptr_estimate_is_good will also be set to -1

 */
//-----------------------------------------------------------------------------
{

  int status = 0;

  int flag = 0; /* used to assist freeing mallocs */ 

  *ptr_estimate_is_good = -1; /* default */

  /* Check inputs */
  if ( x == NULL ) { go_BYE(-1); }
  if ( siz <= 0 ) { go_BYE(-1); }
  if ( err <= 0 ) { go_BYE(-1); } 
  if ( min_freq <= 0 ) { go_BYE(-1); }
  if ( min_freq - err <= 0 ) { go_BYE(-1); }
  if ( y == NULL ) { go_BYE(-1); }
  if ( f == NULL ) { go_BYE(-1); }
  if ( ptr_len == NULL ) { go_BYE(-1); }


  long long eff_siz = 0; /* number of entries to be considered */
  if ( cfld == NULL ) { eff_siz = siz; }
  else {

    for ( long long ii = 0; ii < siz; ii++ ) {
      if ( cfld[ii] == 0 ) { continue; }
      eff_siz++;
    }
    if ( eff_siz == 0 ) { go_BYE(-1); } /* cfld has all 0 entries */

  }

  double eps = (double) err/eff_siz; 
  /* parameter of FREQUENT algorithm, decides the error in approximation */
  if ( eps < pow(2,-50) ) { 
    *ptr_estimate_is_good = -2; 
    go_BYE(0); /* need too much memory */
  }

  if ( out_siz < eff_siz/(min_freq - err) ) { 
    *ptr_estimate_is_good = -1;
    go_BYE(-1);
    /* insufficient memory allocated to the outputs y and f */
  }

  //-------------------------------------------------------------------------

  /* The algorithm will be using (long long)(1/eps)+1 counters: stored in (cntr_id, cntr_freq) */

  int * cntr_id = NULL;
  int * cntr_freq = NULL;
  long long cntr_siz = (long long) (1/eps)+1;

  if ( cntr_siz < 10000 ) { cntr_siz = 10000; } /* can be removed */

  flag = 1; /* defined cntr_id and cntr_freq */

  if ( ( cntr_siz*(1+2+6) ) > MAX_SZ ) {
    *ptr_estimate_is_good = -2;
    go_BYE(0);
    /* Quitting if too much memory needed. Retry by doing one of the following:
       (i) Increase MAX_SZ if you think you have more RAM
       (ii) Increase eps (the approximation percentage) so that computations can be done within RAM
     */
  }
  
  int NUM_THREADS = 128;
  while ( (cntr_siz*(NUM_THREADS+2+6)) > MAX_SZ ) { NUM_THREADS = NUM_THREADS/2; } /* to promote parallel computing when possible, adapting NUM_THREADS to meet memory requirements */
 
  cntr_id = (int *)malloc( cntr_siz * sizeof(int) );
  return_if_malloc_failed(cntr_id);
  cntr_freq = (int *)malloc( cntr_siz * sizeof(int) );
  return_if_malloc_failed(cntr_freq);

  long long active_cntr_siz = 0; /* no of counters with non-zero frequencies */

#ifdef IPP
  ippsZero_32s((int *)cntr_id, cntr_siz);
  ippsZero_32s((int *)cntr_freq, cntr_siz);
#else
  assign_const_I4(cntr_id,cntr_siz,0);
  assign_const_I4(cntr_freq,cntr_siz,0);
#endif

  //-------------------------------------------------------------------------

  /* We will look at the incoming data as packets of size cntr_siz with sorted data (this helps speed up the update process a lot; this step is not mentioned in the paper - it's my improvisation). Since the sorting has to be done within each packet separately, we can parallelize this step as follows: we divide the incoming data into blocks of size NUM_THREADS*cntr_siz (so that NUM_THREADS threads can be generated for each block and the packets sorted separately in parallel using cilkfor) */

  /* "inputPackets" is a 2D array of size NUM_THREADS * cntr_siz: it stores the packets belonging to the same block so that they can be sorted in parallel using cilkfor. */

  int ** inputPackets = NULL;
  long long * inputPacketsUsedSiz = NULL;

  flag = 2;  /* inputPackets and inputPacketsUsedSiz are defined */

  inputPackets = malloc ( NUM_THREADS * sizeof(int*) );
  return_if_malloc_failed(inputPackets); 

  inputPacketsUsedSiz = malloc ( NUM_THREADS * sizeof(long long) );
  return_if_malloc_failed(inputPacketsUsedSiz);

  for ( long long ii = 0; ii < NUM_THREADS; ii++) {
    inputPacketsUsedSiz[ii] = 0;
  }

  for ( int ii = 0; ii < NUM_THREADS; ii++ ) {
    inputPackets[ii] =  (int *) malloc( cntr_siz * sizeof(int) );
  }

  flag = 3; /* inputPackets[ii] defined for ii = 0 to NUM_THREADS-1 */

  for ( int ii = 0; ii < NUM_THREADS; ii++ ) {
    return_if_malloc_failed(inputPackets[ii]);
#ifdef IPP
    ippsZero_32s((int *)inputPackets[ii],cntr_siz);
#else
    assign_const_I4(inputPackets[ii],cntr_siz,0);
#endif
  }

  //------------------------------------------------------------------------
  
  int * bf_id = NULL;
  int * bf_freq = NULL; /* temporary counters for processing */

  flag = 4;  /* bf_id and bf_freq are defined */

  bf_id = (int *)malloc( cntr_siz * sizeof(int) );
  return_if_malloc_failed(bf_id);

  bf_freq = (int *)malloc( cntr_siz * sizeof(int) );
  return_if_malloc_failed(bf_freq);

  long long current_loc_in_x = 0; /* start of input data */

  /* Do the following for each block, till you reach the end of input */
  while ( current_loc_in_x < siz ) { 

    /* A block of data ( containing NUM_THREADS packets, i.e NUM_THREADS * cntr_siz integers ) is processed inside this loop. For each packet, the following operations are done: 
     (1): Sort the packet (can be done in parallel using cilkfor)
     (2): Convert each sorted packet into (id, freq) i.e (key, count) format using sorted_array_to_id_freq(). 
     (3): Update the counter array using update_counter()
     
     Steps (1) and (2) can be done in parallel, but for some reason trying to do (2) in parallel is slowing down the code. So doing only (1) in parallel. */

    /* Copying input data into "inputPackets" buffers */

    if ( cfld == NULL || eff_siz == siz ) {

      //------------------------------------------------------------------
      for ( long long ii = 0; ii < NUM_THREADS; ii++) {
	inputPacketsUsedSiz[ii] = 0;
      }

      cilkfor ( int tid = 0; tid < NUM_THREADS; tid++ ) {

	long long lb = current_loc_in_x + tid * cntr_siz; 
	long long ub = lb + cntr_siz;
	if ( lb >= siz ) { continue; }
	if ( ub >= siz ) { ub = siz; }

	memcpy(inputPackets[tid], x+lb, (ub-lb)*sizeof(int));
	inputPacketsUsedSiz[tid] = (ub-lb);

      }

      for ( int tid = 0; tid < NUM_THREADS; tid++ ) {
	current_loc_in_x += inputPacketsUsedSiz[tid];
      }
      //------------------------------------------------------------------

    }
    else {

      //------------------------------------------------------------------
      /* NOTE: if cfld input is non-null, it means we are not interested in all the elements. In every iteration, we keep filling inputPackets buffer with only those data we are interested in using the helper variable "current_loc_in_x". */

      for ( long long ii = 0; ii < NUM_THREADS; ii++) {
	inputPacketsUsedSiz[ii] = 0;
      }
      int tid = 0;
      
      while ( current_loc_in_x < siz  && tid < NUM_THREADS ) {

	if ( cfld[current_loc_in_x] == 0 ) { current_loc_in_x++; }
	else {
	  inputPackets[tid][inputPacketsUsedSiz[tid]] = x[current_loc_in_x];
	  current_loc_in_x++; inputPacketsUsedSiz[tid]++;
	  if ( inputPacketsUsedSiz[tid] == cntr_siz ) { tid++; }
	}

      }
      //------------------------------------------------------------------

    }


    /* Step (1) can be done here in parallel using cilkfor */
    cilkfor ( int tid = 0; tid < NUM_THREADS; tid++ ) {
    
      if ( inputPacketsUsedSiz[tid] == 0 ) { continue; }

#ifdef IPP
      ippsSortAscend_32s_I(inputPackets[tid], inputPacketsUsedSiz[tid]);
#else
      qsort_asc_I4(inputPackets[tid], inputPacketsUsedSiz[tid], sizeof(int), NULL);
#endif

    }

    /* Steps (2) and (3) done here */

    for ( int tid = 0; tid < NUM_THREADS; tid++ ) {
    
      if ( inputPacketsUsedSiz[tid] == 0 ) { break; }
    
      long long bf_siz = 0;
      status = sorted_array_to_id_freq(inputPackets[tid],inputPacketsUsedSiz[tid],bf_id,bf_freq,&bf_siz); cBYE(status);

      status = update_counter(cntr_id,cntr_freq,cntr_siz,&active_cntr_siz,bf_id,bf_freq,bf_siz);
      cBYE(status);

    }


  }