SEXP getDTthreads_R(SEXP verbose) {
  // verbose checked at R level
  if (!isLogical(verbose) || LENGTH(verbose)!=1 || INTEGER(verbose)[0]==NA_LOGICAL)
    error("'verbose' must be TRUE or FALSE");
  if (LOGICAL(verbose)[0]) {
    Rprintf("omp_get_max_threads() = %d\n", omp_get_max_threads());
    Rprintf("omp_get_thread_limit() = %d\n", omp_get_thread_limit());  // can be INT_MAX meaning unlimited
    Rprintf("DTthreads = %d\n", DTthreads);
    #ifndef _OPENMP
    Rprintf("This installation of data.table has not been compiled with OpenMP support.\n");
    // the omp_ functions used above are defined in myomp.h to be 1 in this case
    #endif
  }
  return ScalarInteger(getDTthreads());
}
void fadaptiverollmeanExact(double *x, uint_fast64_t nx, double_ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose) {
  if (verbose)
    Rprintf("%s: running for input length %lu, hasna %d, narm %d\n", __func__, nx, hasna, (int) narm);
  volatile bool truehasna = hasna>0;                            // flag to re-run if NAs detected
  if (!truehasna || !narm) {                                    // narm=FALSE handled here as NAs properly propagated in exact algo
    const int threads = MIN(getDTthreads(), nx);
    #pragma omp parallel num_threads(threads) shared(truehasna)
    {
      #pragma omp for schedule(static)
      for (uint_fast64_t i=0; i<nx; i++) {                      // loop on every observation to produce final answer
        if (narm && truehasna) continue;                        // if NAs detected no point to continue
        if (i+1 < k[i]) ans->ans[i] = fill;                     // position in a vector smaller than obs window width - partial window
        else {
          long double w = 0.0;
          for (int j=-k[i]+1; j<=0; j++) {                      // sub-loop on window width
            w += x[i+j];                                        // sum of window for particular observation
          }
          if (R_FINITE((double) w)) {                           // no need to calc roundoff correction if NAs detected as will re-call all below in truehasna==1
            long double res = w / k[i];                         // keep results as long double for intermediate processing
            long double err = 0.0;                              // roundoff corrector
            for (int j=-k[i]+1; j<=0; j++) {                    // sub-loop on window width
              err += x[i+j] - res;                              // measure difference of obs in sub-loop to calculated fun for obs
            }
            ans->ans[i] = (double) (res + (err / k[i]));        // adjust calculated fun with roundoff correction
          } else {
            if (!narm) ans->ans[i] = (double) (w / k[i]);       // NAs should be propagated
            truehasna = 1;                                      // NAs detected for this window, set flag so rest of windows will not be re-run
          }
        }
      }
    } // end of parallel region
    if (truehasna) {
      if (hasna==-1) {                                          // raise warning
        ans->status = 2;
        sprintf(ans->message[2], "%s: hasNA=FALSE used but NA (or other non-finite) value(s) are present in input, use default hasNA=NA to avoid this warning", __func__);
      }
      if (verbose) {
        if (narm)
          Rprintf("%s: NA (or other non-finite) value(s) are present in input, re-running with extra care for NAs\n", __func__);
        else
          Rprintf("%s: NA (or other non-finite) value(s) are present in input, na.rm was FALSE so in 'exact' implementation NAs were handled already, no need to re-run\n", __func__);
      }
    }
  }
  if (truehasna && narm) {
    const int threads = MIN(getDTthreads(), nx);
    #pragma omp parallel num_threads(threads)
    {
      #pragma omp for schedule(static)
      for (uint_fast64_t i=0; i<nx; i++) {                      // loop over observations to produce final answer
        if (i+1 < k[i]) ans->ans[i] = fill;                     // partial window
        else {
          long double w = 0.0;                                  // window to accumulate values in particular window
          long double err = 0.0;                                // accumulate roundoff error
          long double res;                                      // keep results as long double for intermediate processing
          int nc = 0;                                           // NA counter within current window
          for (int j=-k[i]+1; j<=0; j++) {                      // sub-loop on window width
            if (ISNAN(x[i+j])) nc++;                            // increment NA count in current window
            else w += x[i+j];                                   // add observation to current window
          }
          if (nc == 0) {                                        // no NAs in current window
            res = w / k[i];
            for (int j=-k[i]+1; j<=0; j++) {                    // sub-loop on window width to accumulate roundoff error
              err += x[i+j] - res;                              // measure roundoff for each obs in window
            }
            ans->ans[i] = (double) (res + (err / k[i]));        // adjust calculated fun with roundoff correction
          } else if (nc < k[i]) {
            res = w / (k[i]-nc);
            for (int j=-k[i]+1; j<=0; j++) {                    // sub-loop on window width to accumulate roundoff error
              if (!ISNAN(x[i+j])) err += x[i+j] - res;          // measure roundoff for each obs in window
            }
            ans->ans[i] = (double) (res + (err / (k[i] - nc))); // adjust calculated fun with roundoff correction
          } else {                                              // nc == k[i]
            ans->ans[i] = R_NaN;                                // this branch assumes narm so R_NaN always here
          }
        }
      }
    } // end of parallel region
  } // end of truehasna
}
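// Illustrative sketch (not part of data.table): the roundoff correction used above is a
// two-pass compensated mean. The first pass sums the window and divides to get a provisional
// mean; the second pass sums the residuals (x[j] - res), which would be exactly zero in
// infinite precision, and folds their mean back into the result. The helper name below is
// hypothetical and only for exposition.
static double demo_compensated_mean(const double *x, int n) {
  long double w = 0.0;
  for (int j=0; j<n; j++) w += x[j];           // provisional sum
  long double res = w / n;                     // provisional mean
  long double err = 0.0;
  for (int j=0; j<n; j++) err += x[j] - res;   // accumulated roundoff residual
  return (double)(res + err / n);              // corrected mean
}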
void fadaptiverollmeanFast(double *x, uint_fast64_t nx, double_ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose) {
  if (verbose)
    Rprintf("%s: running for input length %lu, hasna %d, narm %d\n", __func__, nx, hasna, (int) narm);
  bool truehasna = hasna>0;                                     // flag to re-run if NAs detected
  long double w = 0.0;                                          // TODO measure speed of cs as long double
  double *cs = malloc(nx*sizeof(double));                       // cumsum vector, same as double cs[nx] but no segfault
  if (!cs) {
    ans->status = 3;                                            // raise error
    sprintf(ans->message[3], "%s: Unable to allocate memory for cumsum", __func__);
    free(cs);
    return;
  }
  if (!truehasna) {
    for (uint_fast64_t i=0; i<nx; i++) {                        // loop on every observation to calculate cumsum only
      w += x[i];                                                // cumulate in long double
      cs[i] = (double) w;
    }
    if (R_FINITE((double) w)) {                                 // no need to calc this if NAs detected as will re-calc all below in truehasna==1
      const int threads = MIN(getDTthreads(), nx);
      #pragma omp parallel num_threads(threads)
      {
        #pragma omp for schedule(static)
        for (uint_fast64_t i=0; i<nx; i++) {                    // loop over observations to calculate final answer
          if (i+1 == k[i]) ans->ans[i] = cs[i]/k[i];            // current obs window width exactly same as obs position in a vector
          else if (i+1 > k[i]) ans->ans[i] = (cs[i]-cs[i-k[i]])/k[i]; // window width smaller than position so use cumsum to calculate diff
          else ans->ans[i] = fill;                              // position in a vector smaller than obs window width - partial window
        }
      } // end of parallel region
    } else {                                                    // update truehasna flag if NAs detected
      if (hasna==-1) {                                          // raise warning
        ans->status = 2;
        sprintf(ans->message[2], "%s: hasNA=FALSE used but NA (or other non-finite) value(s) are present in input, use default hasNA=NA to avoid this warning", __func__);
      }
      if (verbose)
        Rprintf("%s: NA (or other non-finite) value(s) are present in input, re-running with extra care for NAs\n", __func__);
      w = 0.0;
      truehasna = 1;
    }
  }
  if (truehasna) {
    uint_fast64_t nc = 0;                                       // running NA counter
    uint_fast64_t *cn = malloc(nx*sizeof(uint_fast64_t));       // cumulative NA counter, used the same way as cumsum, same as uint_fast64_t cn[nx] but no segfault
    if (!cn) {
      ans->status = 3;                                          // raise error
      sprintf(ans->message[3], "%s: Unable to allocate memory for cum NA counter", __func__);
      free(cs);
      free(cn);
      return;
    }
    for (uint_fast64_t i=0; i<nx; i++) {                        // loop over observations to calculate cumsum and cum NA counter
      if (R_FINITE(x[i])) w += x[i];                            // add observation to running sum
      else nc++;                                                // increment non-finite counter
      cs[i] = (double) w;                                       // cumsum, na.rm=TRUE always, NAs handled using cum NA counter
      cn[i] = nc;                                               // cum NA counter
    }
    const int threads = MIN(getDTthreads(), nx);
    #pragma omp parallel num_threads(threads)
    {
      #pragma omp for schedule(static)
      for (uint_fast64_t i=0; i<nx; i++) {                      // loop over observations to calculate final answer
        if (i+1 < k[i]) {                                       // partial window
          ans->ans[i] = fill;
        } else if (!narm) {                                     // this branch reduces the amount of branching in the narm=1 case below
          if (i+1 == k[i]) {
            ans->ans[i] = cn[i]>0 ? NA_REAL : cs[i]/k[i];
          } else if (i+1 > k[i]) {
            ans->ans[i] = (cn[i] - cn[i-k[i]])>0 ? NA_REAL : (cs[i]-cs[i-k[i]])/k[i];
          }
        } else if (i+1 == k[i]) {                               // window width equal to observation position in vector
          int thisk = k[i] - ((int) cn[i]);                     // window width taking NAs into account, we assume single window width is int32, cum NA counter can be int64
          ans->ans[i] = thisk==0 ? R_NaN : cs[i]/thisk;         // handle all obs NAs and na.rm=TRUE
        } else if (i+1 > k[i]) {                                // window width smaller than observation position in vector
          int thisk = k[i] - ((int) (cn[i] - cn[i-k[i]]));      // window width taking NAs into account, we assume single window width is int32, cum NA counter can be int64
          ans->ans[i] = thisk==0 ? R_NaN : (cs[i]-cs[i-k[i]])/thisk; // handle all obs NAs and na.rm=TRUE
        }
      }
    } // end of parallel region
    free(cn);
  } // end of truehasna
  free(cs);
}
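// Illustrative sketch (not part of data.table): the 'fast' variant relies on the prefix-sum
// identity sum(x[i-k+1..i]) = cs[i] - cs[i-k], so each adaptive window mean costs O(1) after a
// single O(n) cumsum pass, instead of O(k) per window as in the 'exact' variant. The helper
// below is hypothetical and ignores NAs and partial windows for brevity.
static double demo_window_mean_from_cumsum(const double *cs, int i, int k) {
  // cs[j] holds x[0] + ... + x[j]; assumes i+1 >= k so the window is complete
  double windowSum = (i+1 == k) ? cs[i] : cs[i] - cs[i-k];
  return windowSum / k;
}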
SEXP reorder(SEXP x, SEXP order) {
  // For internal use only by setkey().
  // 'order' must strictly be a permutation of 1:n (i.e. no repeats, zeros or NAs)
  // If only a small subset in the middle is reordered the ends are moved in: [start,end].
  // x may be a vector, or a list of same-length vectors such as data.table
  R_len_t nrow, ncol;
  int maxSize = 0;
  if (isNewList(x)) {
    nrow = length(VECTOR_ELT(x,0));
    ncol = length(x);
    for (int i=0; i<ncol; i++) {
      SEXP v = VECTOR_ELT(x,i);
      if (SIZEOF(v)!=4 && SIZEOF(v)!=8)
        error("Item %d of list is type '%s' which isn't yet supported", i+1, type2char(TYPEOF(v)));
      if (length(v)!=nrow)
        error("Column %d is length %d which differs from length of column 1 (%d). Invalid data.table.", i+1, length(v), nrow);
      if (SIZEOF(v) > maxSize) maxSize=SIZEOF(v);
    }
  } else {
    if (SIZEOF(x)!=4 && SIZEOF(x)!=8)
      error("reorder accepts vectors but this non-VECSXP is type '%s' which isn't yet supported", type2char(TYPEOF(x)));
    maxSize = SIZEOF(x);
    nrow = length(x);
    ncol = 1;
  }
  if (!isInteger(order)) error("order must be an integer vector");
  if (length(order) != nrow) error("nrow(x)[%d]!=length(order)[%d]", nrow, length(order));
  R_len_t start = 0;
  while (start<nrow && INTEGER(order)[start] == start+1) start++;
  if (start==nrow) return(R_NilValue);  // input is 1:n, nothing to do
  R_len_t end = nrow-1;
  while (INTEGER(order)[end] == end+1) end--;
  for (R_len_t i=start; i<=end; i++) {
    int itmp = INTEGER(order)[i]-1;
    if (itmp<start || itmp>end) error("order is not a permutation of 1:nrow[%d]", nrow);
  }
  // Creorder is for internal use (so we should get the input right!), but the check above seems sensible, otherwise
  // there would be a segfault below. The for loop above should run in negligible time (sequential) and will also catch NAs.
  // It won't catch duplicates in order, but that's ok. Checking that would be going too far given this is for internal use only.

  // Enough working ram for one column of the largest type, for every thread.
  // Up to a limit of 1GB total. It's system dependent how to find out the truly free RAM - TODO.
  // Without a limit it could easily start swapping and not work at all.
  int nth = MIN(getDTthreads(), ncol);
  size_t oneTmpSize = (end-start+1)*(size_t)maxSize;
  size_t totalLimit = 1024*1024*(size_t)1024;  // 1GB
  nth = MIN(totalLimit/oneTmpSize, nth);
  if (nth==0) nth=1;    // if one column's worth is very big, we'll just have to try
  char *tmp[nth];       // VLA ok because small; limited to max getDTthreads() not ncol which could be > 1e6
  int ok=0;
  for (; ok<nth; ok++) {
    tmp[ok] = malloc(oneTmpSize);
    if (tmp[ok] == NULL) break;
  }
  if (ok==0) error("unable to allocate %d * %d bytes of working memory for reordering data.table", end-start+1, maxSize);
  nth = ok;             // as many threads as we obtained successful mallocs for
  // So we can still reorder a 10GB table in 16GB of RAM, as long as we have at least one column's worth of tmp
  #pragma omp parallel for schedule(dynamic) num_threads(nth)
  for (int i=0; i<ncol; i++) {
    const SEXP v = isNewList(x) ? VECTOR_ELT(x,i) : x;
    const int size = SIZEOF(v);
    const int me = omp_get_thread_num();
    const int *vi = INTEGER(order)+start;
    if (size==4) {
      const int *vd = (const int *)DATAPTR(v);
      int *tmpp = (int *)tmp[me];
      for (int j=start; j<=end; j++) {
        *tmpp++ = vd[*vi++ -1];   // just copies 4 bytes, including pointers on 32bit
      }
    } else {
      const double *vd = (const double *)DATAPTR(v);
      double *tmpp = (double *)tmp[me];
      for (int j=start; j<=end; j++) {
        *tmpp++ = vd[*vi++ -1];   // just copies 8 bytes, pointers too including STRSXP and VECSXP
      }
    }
    // How is it possible to not only ignore the write barrier but to do so in parallel too?
    // Only because this reorder() function accepts and checks a unique permutation of 1:nrow. It
    // performs an in-place shuffle. This operation in the end does not change gcgen, mark or
    // named/refcnt. They all stay the same even for STRSXP and VECSXP because it's just a data shuffle.
    //
    // Theory:
    // The write to tmp is contiguous and io efficient (so fewer threads should not help that).
    // The read from vd is as io efficient as order is ordered (the more threads the better when close
    // to ordered, but fewer threads may help when not very ordered).
    // TODO large data benchmark to confirm theory and auto tune.
    // io probably limits too much but at least this is our best shot (e.g. branchless) in finding out,
    // perhaps on other platforms with a faster bus.

    // copy the reordered data back into the original vector
    memcpy((char *)DATAPTR(v) + start*(size_t)size, tmp[me], (end-start+1)*(size_t)size);  // size_t, otherwise #5305 (integer overflow in memcpy)
  }
  for (int i=0; i<nth; i++) free(tmp[i]);
  return(R_NilValue);
}
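// Illustrative sketch (not part of data.table): the core of the parallel loop above is a
// gather-then-copy-back shuffle. Each column is gathered into a thread-local tmp buffer in the
// order given by the permutation, then memcpy'd back over the original storage, so only one
// column-sized buffer per thread is needed rather than a full copy of the table. The helper
// below is hypothetical, single-threaded, and takes a 1-based 'order' as reorder() does.
#include <string.h>  // memcpy; may already be available via this file's other headers
static void demo_permute_ints(int *v, const int *order, int n, int *tmp) {
  for (int j=0; j<n; j++) tmp[j] = v[order[j]-1];  // gather into tmp in permuted order ('order' is 1-based)
  memcpy(v, tmp, n*sizeof(int));                   // copy the reordered data back over the original
}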