Example #1
SEXP getDTthreads_R(SEXP verbose) {
  // verbose is also validated at R level; check defensively here too
  if (!isLogical(verbose) || LENGTH(verbose)!=1 || LOGICAL(verbose)[0]==NA_LOGICAL)
    error("'verbose' must be TRUE or FALSE");
  if (LOGICAL(verbose)[0]) {
    Rprintf("omp_get_max_threads() = %d\n", omp_get_max_threads());
    Rprintf("omp_get_thread_limit() = %d\n", omp_get_thread_limit()); // can be INT_MAX meaning unlimited
    Rprintf("DTthreads = %d\n", DTthreads);
    #ifndef _OPENMP
      Rprintf("This installation of data.table has not been compiled with OpenMP support.\n");
      // the omp_ functions used above are defined in myomp.h to be 1 in this case
    #endif
  }
  return ScalarInteger(getDTthreads());
}
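
The omp_ calls above still compile without OpenMP because of a fallback header. A minimal sketch of the kind of stubs myomp.h provides (exact contents assumed, not copied from data.table):

#ifdef _OPENMP
  #include <omp.h>
#else
  // assumed stubs: report a single thread when OpenMP is unavailable
  #define omp_get_max_threads()  1
  #define omp_get_thread_limit() 1
  #define omp_get_thread_num()   0
#endif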
Example #2
SEXP getDTthreads_R(void) {
    return ScalarInteger(getDTthreads());
}
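
For context, a sketch of how a .Call entry point like this is typically registered with R's dynamic loader; the package init name, symbol name, and registration table are placeholders, not data.table's actual init code:

#include <R.h>
#include <Rinternals.h>
#include <R_ext/Rdynload.h>

SEXP getDTthreads_R(void);                            // the zero-argument variant shown above

static const R_CallMethodDef callMethods[] = {
  {"CgetDTthreads_R", (DL_FUNC) &getDTthreads_R, 0},  // 0 = number of SEXP arguments
  {NULL, NULL, 0}
};

void R_init_mypkg(DllInfo *info) {                    // "mypkg" is a placeholder name
  R_registerRoutines(info, NULL, callMethods, NULL, NULL);
  R_useDynamicSymbols(info, FALSE);
}

From R the routine would then be reachable as .Call("CgetDTthreads_R").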
Example #3
void fadaptiverollmeanExact(double *x, uint_fast64_t nx, double_ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose) {
  if (verbose) Rprintf("%s: running for input length %llu, hasna %d, narm %d\n", __func__, (unsigned long long)nx, hasna, (int) narm);
  volatile bool truehasna = hasna>0;                            // flag to re-run if NAs detected
  
  if (!truehasna || !narm) {                                    // narm=FALSE handled here as NAs properly propagated in exact algo
    const int threads = MIN(getDTthreads(), nx);
    #pragma omp parallel num_threads(threads) shared(truehasna)
    {
      #pragma omp for schedule(static)
      for (uint_fast64_t i=0; i<nx; i++) {                      // loop on every observation to produce final answer
        if (narm && truehasna) continue;                        // if NAs detected no point to continue
        if (i+1 < k[i]) ans->ans[i] = fill;                     // position in a vector smaller than obs window width - partial window
        else {
          long double w = 0.0;
          for (int j=-k[i]+1; j<=0; j++) {                      // sub-loop on window width
            w += x[i+j];                                        // sum of window for particular observation
          }
          if (R_FINITE((double) w)) {                           // roundoff correction only makes sense for a finite sum; non-finite windows fall through to the else branch below
            long double res = w / k[i];                         // keep results as long double for intermediate processing
            long double err = 0.0;                              // roundoff corrector
            for (int j=-k[i]+1; j<=0; j++) {                    // sub-loop on window width
              err += x[i+j] - res;                              // measure difference of obs in sub-loop to calculated fun for obs
            }
            ans->ans[i] = (double) (res + (err / k[i]));        // adjust calculated fun with roundoff correction
          } else {
            if (!narm) ans->ans[i] = (double) (w / k[i]);       // NAs should be propagated
            truehasna = 1;                                      // NAs detected for this window, set flag so rest of windows will not be re-run
          }
        }
      }
    } // end of parallel region
    if (truehasna) {
      if (hasna==-1) {                                          // raise warning
        ans->status = 2;
        sprintf(ans->message[2], "%s: hasNA=FALSE used but NA (or other non-finite) value(s) are present in input, use default hasNA=NA to avoid this warning", __func__);
      }
      if (verbose) {
        if (narm) Rprintf("%s: NA (or other non-finite) value(s) are present in input, re-running with extra care for NAs\n", __func__);
        else Rprintf("%s: NA (or other non-finite) value(s) are present in input, na.rm was FALSE so in 'exact' implementation NAs were handled already, no need to re-run\n", __func__);
      }
    }
  }
  if (truehasna && narm) {
    const int threads = MIN(getDTthreads(), nx);
    #pragma omp parallel num_threads(threads)
    {
      #pragma omp for schedule(static)
      for (uint_fast64_t i=0; i<nx; i++) {                      // loop over observations to produce final answer
        if (i+1 < k[i]) ans->ans[i] = fill;                     // partial window
        else {
          long double w = 0.0;                                  // running sum of values in the current window
          long double err = 0.0;                                // accumulate roundoff error
          long double res;                                      // keep results as long double for intermediate processing
          int nc = 0;                                           // NA counter within current window
          for (int j=-k[i]+1; j<=0; j++) {                      // sub-loop on window width
            if (ISNAN(x[i+j])) nc++;                            // increment NA count in current window
            else w += x[i+j];                                   // add observation to current window
          }
          if (nc == 0) {                                        // no NAs in current window
            res = w / k[i];
            for (int j=-k[i]+1; j<=0; j++) {                    // sub-loop on window width to accumulate roundoff error
              err += x[i+j] - res;                              // measure roundoff for each obs in window
            }
            ans->ans[i] = (double) (res + (err / k[i]));        // adjust calculated fun with roundoff correction
          } else if (nc < k[i]) {
            res = w / (k[i]-nc);
            for (int j=-k[i]+1; j<=0; j++) {                    // sub-loop on window width to accumulate roundoff error
              if (!ISNAN(x[i+j])) err += x[i+j] - res;          // measure roundoff for each obs in window
            }
            ans->ans[i] = (double) (res + (err / (k[i] - nc))); // adjust calculated fun with roundoff correction
          } else {                                              // nc == k[i]
            ans->ans[i] = R_NaN;                                // this branch assumes narm, so always R_NaN here
          }
        }
      }
    } // end of parallel region
  } // end of truehasna
}
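
The roundoff correction above is a two-pass technique: compute a first-pass mean, then shift it by the mean of the residuals. A minimal standalone sketch of the same idea (helper name assumed, not data.table code):

#include <stdio.h>

static double corrected_mean(const double *x, int n) {
  long double w = 0.0;
  for (int j=0; j<n; j++) w += x[j];                  // first pass: plain sum
  long double res = w / n;                            // first-pass mean
  long double err = 0.0;
  for (int j=0; j<n; j++) err += x[j] - res;          // second pass: residuals against that mean
  return (double)(res + err / n);                     // mean adjusted by the mean residual
}

int main(void) {
  double x[] = {1e16, 1.0, -1e16, 2.0};
  printf("%.17g\n", corrected_mean(x, 4));            // prints 0.75
  return 0;
}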
Example #4
void fadaptiverollmeanFast(double *x, uint_fast64_t nx, double_ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose) {
  if (verbose) Rprintf("%s: running for input length %llu, hasna %d, narm %d\n", __func__, (unsigned long long)nx, hasna, (int) narm);
  bool truehasna = hasna>0;                                     // flag to re-run if NAs detected
  long double w = 0.0;
  // TODO measure speed of cs as long double
  double *cs = malloc(nx*sizeof(double));                       // cumsum vector; heap-allocated rather than a VLA (double cs[nx]) to avoid stack overflow for large nx
  if (!cs) {
    ans->status = 3;                                            // raise error
    sprintf(ans->message[3], "%s: Unable to allocate memory for cumsum", __func__);
    free(cs);
    return;
  }
  if (!truehasna) {
    for (uint_fast64_t i=0; i<nx; i++) {                        // loop on every observation to calculate cumsum only
      w += x[i];                                                // cumulate in long double
      cs[i] = (double) w;
    }
    if (R_FINITE((double) w)) {                                 // if any non-finite value was seen the cumsum is unusable; everything is re-computed below with truehasna set
      const int threads = MIN(getDTthreads(), nx);
      #pragma omp parallel num_threads(threads)
      {
        #pragma omp for schedule(static)
        for (uint_fast64_t i=0; i<nx; i++) {                    // loop over observations to calculate final answer
          if (i+1 == k[i]) ans->ans[i] = cs[i]/k[i];            // current obs window width exactly same as obs position in a vector
          else if (i+1 > k[i]) ans->ans[i] = (cs[i]-cs[i-k[i]])/k[i]; // window width smaller than position so use cumsum to calculate diff
          else ans->ans[i] = fill;                              // position in a vector smaller than obs window width - partial window
        }
      } // end of parallel region
    } else {                                                    // update truehasna flag if NAs detected
      if (hasna==-1) {                                          // raise warning
        ans->status = 2;
        sprintf(ans->message[2], "%s: hasNA=FALSE used but NA (or other non-finite) value(s) are present in input, use default hasNA=NA to avoid this warning", __func__);
      }
      if (verbose) Rprintf("%s: NA (or other non-finite) value(s) are present in input, re-running with extra care for NAs\n", __func__);
      w = 0.0;
      truehasna = 1;
    }
  }
  if (truehasna) {
    uint_fast64_t nc = 0;                                       // running NA counter
    uint_fast64_t *cn = malloc(nx*sizeof(uint_fast64_t));       // cumulative NA counter, used the same way as the cumsum; heap-allocated rather than a VLA to avoid stack overflow
    if (!cn) {
      ans->status = 3;                                          // raise error
      sprintf(ans->message[3], "%s: Unable to allocate memory for cum NA counter", __func__);
      free(cs);
      free(cn);
      return;
    }
    for (uint_fast64_t i=0; i<nx; i++) {                        // loop over observations to calculate cumsum and cum NA counter
      if (R_FINITE(x[i])) w += x[i];                            // add observation to running sum
      else nc++;                                                // increment non-finite counter
      cs[i] = (double) w;                                       // cumsum, na.rm=TRUE always, NAs handled using cum NA counter
      cn[i] = nc;                                               // cum NA counter
    }
    const int threads = MIN(getDTthreads(), nx);
    #pragma omp parallel num_threads(threads)
    {
      #pragma omp for schedule(static)
      for (uint_fast64_t i=0; i<nx; i++) {                      // loop over observations to calculate final answer
        if (i+1 < k[i]) {                                       // partial window
          ans->ans[i] = fill;
        } else if (!narm) {                                     // handling narm=FALSE separately here reduces branching in the narm=TRUE cases below
          if (i+1 == k[i]) {
            ans->ans[i] = cn[i]>0 ? NA_REAL : cs[i]/k[i];
          } else if (i+1 > k[i]) {
            ans->ans[i] = (cn[i] - cn[i-k[i]])>0 ? NA_REAL : (cs[i]-cs[i-k[i]])/k[i];
          }
        } else if (i+1 == k[i]) {                               // window width equal to observation position in vector
          int thisk = k[i] - ((int) cn[i]);                     // window width taking NAs into account, we assume single window width is int32, cum NA counter can be int64
          ans->ans[i] = thisk==0 ? R_NaN : cs[i]/thisk;         // handle all obs NAs and na.rm=TRUE
        } else if (i+1 > k[i]) {                                // window width smaller than observation position in vector
          int thisk = k[i] - ((int) (cn[i] - cn[i-k[i]]));      // window width taking NAs into account, we assume single window width is int32, cum NA counter can be int64
          ans->ans[i] = thisk==0 ? R_NaN : (cs[i]-cs[i-k[i]])/thisk; // handle all obs NAs and na.rm=TRUE
        }
      }
    } // end of parallel region
    free(cn);
  } // end of truehasna
  free(cs);
}
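
The cumulative-sum trick above turns every window mean into two array reads and one divide. A minimal fixed-width sketch of the same idea, without the NA handling or adaptive windows (helper name assumed):

#include <stdlib.h>

// returns a malloc'd array of length n (caller frees), or NULL on allocation failure
static double *roll_mean_fast(const double *x, int n, int k, double fill) {
  double *cs  = malloc(n * sizeof(double));
  double *out = malloc(n * sizeof(double));
  if (!cs || !out) { free(cs); free(out); return NULL; }
  long double w = 0.0;
  for (int i=0; i<n; i++) { w += x[i]; cs[i] = (double)w; }   // one cumsum pass
  for (int i=0; i<n; i++) {
    if (i+1 < k)       out[i] = fill;                         // partial window
    else if (i+1 == k) out[i] = cs[i]/k;                      // window starts at x[0]
    else               out[i] = (cs[i] - cs[i-k])/k;          // difference of two cumsums
  }
  free(cs);
  return out;
}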
Example #5
SEXP reorder(SEXP x, SEXP order)
{
    // For internal use only by setkey().
    // 'order' must strictly be a permutation of 1:n (i.e. no repeats, zeros or NAs)
    // If only a small subset in the middle is reordered the ends are moved in: [start,end].
    // x may be a vector, or a list of same-length vectors such as data.table
    
    R_len_t nrow, ncol;
    int maxSize = 0;
    if (isNewList(x)) {
      nrow = length(VECTOR_ELT(x,0));
      ncol = length(x);
      for (int i=0; i<ncol; i++) {
        SEXP v = VECTOR_ELT(x,i);
        if (SIZEOF(v)!=4 && SIZEOF(v)!=8)
          error("Item %d of list is type '%s' which isn't yet supported", i+1, type2char(TYPEOF(v)));
        if (length(v)!=nrow)
          error("Column %d is length %d which differs from length of column 1 (%d). Invalid data.table.", i+1, length(v), nrow);
        if (SIZEOF(v) > maxSize)
          maxSize=SIZEOF(v);
      }
    } else {
      if (SIZEOF(x)!=4 && SIZEOF(x)!=8)
        error("reorder accepts vectors but this non-VECSXP is type '%s' which isn't yet supported", type2char(TYPEOF(x)));
      maxSize = SIZEOF(x);
      nrow = length(x);
      ncol = 1;
    }
    if (!isInteger(order)) error("order must be an integer vector");
    if (length(order) != nrow) error("nrow(x)[%d]!=length(order)[%d]",nrow,length(order));
    
    R_len_t start = 0;
    while (start<nrow && INTEGER(order)[start] == start+1) start++;
    if (start==nrow) return(R_NilValue);  // input is 1:n, nothing to do
    R_len_t end = nrow-1;
    while (INTEGER(order)[end] == end+1) end--;
    for (R_len_t i=start; i<=end; i++) { 
      int itmp = INTEGER(order)[i]-1;
      if (itmp<start || itmp>end) error("order is not a permutation of 1:nrow[%d]", nrow);
    }
    // Creorder is for internal use (so we should get the input right!), but the check above seems sensible; otherwise
    // there would be a segfault below. The for loop above should run in negligible time (sequential) and will also catch NAs.
    // It won't catch duplicates in order, but that's ok. Checking that would be going too far given this is for internal use only.
    
    // Enough working ram for one column of the largest type, for every thread.
    // Up to a limit of 1GB total. It's system dependent how to find out the truly free RAM - TODO.
    // Without a limit it could easily start swapping and not work at all.
    int nth = MIN(getDTthreads(), ncol);
    size_t oneTmpSize = (end-start+1)*(size_t)maxSize;
    size_t totalLimit = 1024*1024*(size_t)1024;  // 1GB
    nth = MIN(totalLimit/oneTmpSize, nth);
    if (nth==0) nth=1;  // if one column's worth is very big, we'll just have to try
    char *tmp[nth];  // VLA ok because small; limited to max getDTthreads() not ncol which could be > 1e6
    int ok=0; for (; ok<nth; ok++) {
      tmp[ok] = malloc(oneTmpSize);
      if (tmp[ok] == NULL) break;
    }
    if (ok==0) error("unable to allocate %d * %d bytes of working memory for reordering data.table", end-start+1, maxSize);
    nth = ok;  // as many threads for which we have a successful malloc
    // So we can still reorder a 10GB table in 16GB of RAM, as long as we have at least one column's worth of tmp
    
    #pragma omp parallel for schedule(dynamic) num_threads(nth)
    for (int i=0; i<ncol; i++) {
      const SEXP v = isNewList(x) ? VECTOR_ELT(x,i) : x;
      const int size = SIZEOF(v);
      const int me = omp_get_thread_num();
      const int *vi = INTEGER(order)+start;
      if (size==4) {
        const int *vd = (const int *)DATAPTR(v);
        int *tmpp = (int *)tmp[me];
        for (int j=start; j<=end; j++) {
          *tmpp++ = vd[*vi++ -1];  // just copies 4 bytes, including pointers on 32bit
        }
      } else {
        const double *vd = (const double *)DATAPTR(v);
        double *tmpp = (double *)tmp[me];
        for (int j=start; j<=end; j++) {
          *tmpp++ = vd[*vi++ -1];  // just copies 8 bytes, pointers too including STRSXP and VECSXP
        }
      }
      // How is this possible to not only ignore the write barrier but in parallel too?
      // Only because this reorder() function accepts and checks a unique permutation of 1:nrow. It
      // performs an in-place shuffle. This operation in the end does not change gcgen, mark or
      // named/refcnt. They all stay the same even for STRSXP and VECSXP because it's just a data shuffle.
      //
      // Theory:
      // The write to tmp is contiguous and I/O efficient (so fewer threads should not help there).
      // The read from vd is only as I/O efficient as order is close to sorted (the more threads the better when it is
      // nearly sorted, but fewer threads may help when it is far from sorted).
      // TODO: large data benchmark to confirm this theory and auto-tune.
      // I/O probably limits throughput regardless, but this is our best (e.g. branchless) shot at finding out,
      // perhaps on other platforms with a faster bus.
      
      // copy the reordered data back into the original vector
      memcpy((char *)DATAPTR(v) + start*(size_t)size,
             tmp[me],
             (end-start+1)*(size_t)size);
      // size_t, otherwise #5305 (integer overflow in memcpy)
    }
    for (int i=0; i<nth; i++) free(tmp[i]);
    return(R_NilValue);
}
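
The core move in reorder() is gather-into-tmp then copy back. A minimal single-column sketch of that shuffle on a plain double array (helper name assumed; the parallelism and [start,end] trimming are omitted):

#include <stdlib.h>
#include <string.h>

static int apply_order(double *x, const int *order, int n) {
  double *tmp = malloc(n * sizeof(double));   // one column's worth of working memory
  if (!tmp) return -1;                        // caller decides how to report failure
  for (int i=0; i<n; i++)
    tmp[i] = x[order[i] - 1];                 // gather: order is 1-based, as in R
  memcpy(x, tmp, n * sizeof(double));         // copy the reordered data back over the original
  free(tmp);
  return 0;
}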