double median(double *x, int length){ int i; int half; double med; double *buffer = Calloc(length,double); memcpy(buffer,x,length*sizeof(double)); half = (length + 1)/2; /* qsort(buffer,length,sizeof(double), (int(*)(const void*, const void*))sort_double); if (length % 2 == 1){ med = buffer[half - 1]; } else { med = (buffer[half] + buffer[half-1])/2.0; } */ rPsort(buffer, length, half-1); med = buffer[half-1]; if (length % 2 == 0){ rPsort(buffer, length, half); med = (med + buffer[half])/2.0; } Free(buffer); return med; }
/* Tukey's Biweight Robust Mean (tbrm). When called directly, there must be no NAs in 'x_const'. This function only alters the argument 'result' => DUP=FALSE is safe (and the fastest, preferred way). Input: - x_const Array of numbers to be summarized by tbrm - n_ptr Pointer to the length of the array - C_ptr Pointer to parameter C which adjusts the scaling of the data - result Pointer to storage location of the result. Output: No return value. The tbrm is written to *result. Written by Mikko Korpela. */ void tbrm(double *x_const, int *n_ptr, double *C_ptr, double *result){ Rboolean n_odd; int i, half, my_count; double this_val, min_val, div_const, x_med, this_wt; double *x, *abs_x_dev, *wt, *wtx; listnode tmp; int n = *n_ptr; double C = *C_ptr; /* Avoid complexity and possible crash in case of empty input * vector */ if(n == 0){ *result = R_NaN; return; } /* x is a copy of the argument array x_const (the data) */ x = (double *) R_alloc(n, sizeof(double)); for(i = 0; i < n; i++) x[i] = x_const[i]; /* Median of x */ if((n & 0x1) == 1){ /* n is odd */ half = ((unsigned int)n) >> 1; rPsort(x, n, half); /* Partial sort: */ x_med = x[half]; /* element at position half is correct.*/ n_odd = TRUE; } else { /* n is even */
/* exponential * variant 3: adaptive bandwidth, k nearest neighbors only */ void exponential3 (double *weights, double *dist, int *N, double *bw, int *k) { int i; /* calculate the k nearest neighbor bandwidth */ double distcopy[*N]; for (i = 0; i < *N; i++) { if (dist[i] < 0) { error("'dist' must be positive"); } distcopy[i] = dist[i]; } rPsort(distcopy, *N, *k-1); double a = distcopy[*k-1]/(1 - DOUBLE_EPS/2); /* determine the nearest neighbors */ int nn = 0; // number of neighbors int neighbors[*N]; // 0 1 neighbor indicator for (i = 0; i < *N; i++) { if (dist[i] < a) { neighbors[i] = 1; nn += 1; } else { neighbors[i] = 0; } } /* number of neighbors may be larger than k due to ties in dist */ if (nn > *k) { // number of neighbors too large int size; // number of neighbors to remove size = nn - *k; int y[size]; // index of neighbors to remove int tied[nn]; // index of tied distances (only first nt entries are relevant) int nt = 0; // number of tied distances for (i = 0; i < *N; i++) { if (dist[i] == distcopy[*k-1]) { nt += 1; tied[nt] = i; } } int x[nt]; // index vector 1,2, ..., nt for (i = 0; i < nt; i++) x[i] = i; sample(&size, &nt, y, x); // sample *size indices from nt indices for (i = 0; i < size; i++) { neighbors[tied[y[i]]] = 0; // remove selected neighbors } } /* calculate weights */ for (i = 0; i < *N; i++) { if (neighbors[i]) { weights[i] = 0.5 * exp(-dist[i]/a)/a; } else { weights[i] = 0; } } }
/* gaussian * variant 4: adaptive bandwidth, all observations */ void gaussian4 (double *weights, double *dist, int N, double *bw, int k) { int i; double distcopy[N]; for (i = 0; i < N; i++) { distcopy[i] = dist[i]; } rPsort(distcopy, N, k); double a = distcopy[k]; for (i = 0; i < N; i++) { weights[i] = dnorm(dist[i], 0, a, 0); } }
/* exponential * variant 4: adaptive bandwidth, all observations */ void exponential4 (double *weights, double *dist, int N, double *bw, int k) { int i; double distcopy[N]; for (i = 0; i < N; i++) { distcopy[i] = dist[i]; } rPsort(distcopy, N, k); double a = distcopy[k]; for (i = 0; i < N; i++) { weights[i] = 0.5 * exp(-fabs(dist[i])/a)/a; } }
/* optcosine * variant 3: adaptive bandwidth, k nearest neighbors only */ void optcosine3 (double *weights, double *dist, int N, double *bw, int k) { int i; double distcopy[N]; for (i = 0; i < N; i++) { distcopy[i] = dist[i]; } rPsort(distcopy, N, k); double a = distcopy[k]; for (i = 0; i < N; i++) { weights[i] = fabs(dist[i]) < a ? M_PI_4 * cos(M_PI * dist[i]/(2 * a))/a : 0; } }
/* cauchy * variant 4: adaptive bandwidth, all observations */ void cauchy4 (double *weights, double *dist, int N, double *bw, int k) { int i; double distcopy[N]; for (i = 0; i < N; i++) { distcopy[i] = dist[i]; } rPsort(distcopy, N, k); double a = distcopy[k]; for (i = 0; i < N; i++) { weights[i] = 1/(M_PI * (1 + pow(dist[i]/a, 2)) * a); } }
/* rectangular * variant 3: adaptive bandwidth, k nearest neighbors only */ void rectangular3 (double *weights, double *dist, int N, double *bw, int k) { //double a = *bw * sqrt(3); int i; double distcopy[N]; for (i = 0; i < N; i++) { distcopy[i] = dist[i]; } rPsort(distcopy, N, k); double a = distcopy[k]; for (i = 0; i < N; i++) { weights[i] = fabs(dist[i]) < a ? 0.5/a : 0; } }
/* epanechnikov * variant 3: adaptive bandwidth, k nearest neighbors only */ void epanechnikov3 (double *weights, double *dist, int N, double *bw, int k) { double adist; int i; double distcopy[N]; for (i = 0; i < N; i++) { distcopy[i] = dist[i]; } rPsort(distcopy, N, k); double a = distcopy[k]; for (i = 0; i < N; i++) { adist = fabs(dist[i]); weights[i] = adist < a ? 0.75 * (1 - pow(adist/a, 2))/a : 0; } }
/* triangular * variant 3: adaptive bandwidth, k nearest neighbors only */ void triangular3 (double *weights, double *dist, int N, double *bw, int k) { double adist; int i; double distcopy[N]; for (i = 0; i < N; i++) { distcopy[i] = dist[i]; } rPsort(distcopy, N, k); double a = distcopy[k]; for (i = 0; i < N; i++) { adist = fabs(dist[i]); weights[i] = adist < a ? (1 - adist/a)/a : 0; } }
/* biweight * variant 3: adaptive bandwidth, k nearest neighbors only */ void biweight3 (double *weights, double *dist, int N, double *bw, int k) { double adist; int i; double distcopy[N]; for (i = 0; i < N; i++) { distcopy[i] = dist[i]; } rPsort(distcopy, N, k); double a = distcopy[k]; for (i = 0; i < N; i++) { adist = fabs(dist[i]); weights[i] = adist < a ? 15/(double) 16 * pow(1 - pow(adist/a, 2), 2)/a : 0; } }
/* gaussian * variant 4: adaptive bandwidth, all observations */ void gaussian4 (double *weights, double *dist, int *N, double *bw, int *k) { int i; double distcopy[*N]; for (i = 0; i < *N; i++) { if (dist[i] < 0) { error("'dist' must be positive"); } distcopy[i] = dist[i]; } rPsort(distcopy, *N, *k-1); double a = distcopy[*k-1]/(1 - DOUBLE_EPS/2); for (i = 0; i < *N; i++) { weights[i] = dnorm(dist[i], 0, a, 0); } }
/* exponential * variant 4: adaptive bandwidth, all observations */ void exponential4 (double *weights, double *dist, int *N, double *bw, int *k) { int i; double distcopy[*N]; for (i = 0; i < *N; i++) { if (dist[i] < 0) { error("'dist' must be positive"); } distcopy[i] = dist[i]; } rPsort(distcopy, *N, *k-1); double a = distcopy[*k-1]/(1 - DOUBLE_EPS/2); for (i = 0; i < *N; i++) { weights[i] = 0.5 * exp(-dist[i]/a)/a; } }
/* cauchy * variant 4: adaptive bandwidth, all observations */ void cauchy4 (double *weights, double *dist, int *N, double *bw, int *k) { int i; double distcopy[*N]; for (i = 0; i < *N; i++) { if (dist[i] < 0) { error("'dist' must be positive"); } distcopy[i] = dist[i]; } rPsort(distcopy, *N, *k-1); double a = distcopy[*k-1]/(1 - DOUBLE_EPS/2); for (i = 0; i < *N; i++) { weights[i] = 1/(M_PI * (1 + pow(dist[i]/a, 2)) * a); } }
SEXP tbrm(SEXP x, SEXP C){ SEXP ans, C2; Rboolean n_odd; int i, half, my_count, n; size_t nlong; double C_val, this_val, min_val, div_const, x_med, this_wt; double *x2, *abs_x_dev, *wt, *wtx, *x_p; listnode tmp; nlong = dplRlength(x); /* Long vectors not supported (limitation of rPsort) */ if (nlong > INT_MAX) { error(_("long vectors not supported")); } C2 = PROTECT(coerceVector(C, REALSXP)); if (length(C2) != 1) { UNPROTECT(1); error(_("length of 'C' must be 1")); } C_val = REAL(C2)[0]; UNPROTECT(1); n = (int) nlong; ans = PROTECT(allocVector(REALSXP, 1)); /* Avoid complexity and possible crash in case of empty input * vector */ if(n == 0){ REAL(ans)[0] = R_NaN; UNPROTECT(1); return ans; } /* Note: x must be a numeric vector */ x_p = REAL(x); /* x2 is a copy of the data part of argument x */ x2 = (double *) R_alloc(n, sizeof(double)); for(i = 0; i < n; i++) x2[i] = x_p[i]; /* Median of x */ if((n & 0x1) == 1){ /* n is odd */ half = ((unsigned int)n) >> 1; rPsort(x2, n, half); /* Partial sort: */ x_med = x2[half]; /* element at position half is correct.*/ n_odd = TRUE; } else { /* n is even */
/* pull(): auxiliary routine for Qn and Sn * ====== ======== --------------------- */ double pull(double *a_in, int n, int k) { /* Finds the k-th order statistic of an array a[] of length n * -------------------- */ int j; double *a, ax; char* vmax = vmaxget(); a = (double *)R_alloc(n, sizeof(double)); /* Copy a[] and use copy since it will be re-shuffled: */ for (j = 0; j < n; j++) a[j] = a_in[j]; k--; /* 0-indexing */ rPsort(a, n, k); ax = a[k]; vmaxset(vmax); return ax; } /* pull */
double qn0(double *x, int n) { /*-------------------------------------------------------------------- Efficient algorithm for the scale estimator: Q*_n = { |x_i - x_j|; i<j }_(k) [= Qn without scaling ] i.e. the k-th order statistic of the |x_i - x_j| Parameters of the function Qn : x : double array containing the observations n : number of observations (n >=2) */ double *y = (double *)R_alloc(n, sizeof(double)); double *work = (double *)R_alloc(n, sizeof(double)); double *a_srt = (double *)R_alloc(n, sizeof(double)); double *a_cand = (double *)R_alloc(n, sizeof(double)); int *left = (int *)R_alloc(n, sizeof(int)); int *right = (int *)R_alloc(n, sizeof(int)); int *p = (int *)R_alloc(n, sizeof(int)); int *q = (int *)R_alloc(n, sizeof(int)); int *weight = (int *)R_alloc(n, sizeof(int)); double trial = R_NaReal;/* -Wall */ Rboolean found; int h, i, j,jj,jh; /* Following should be `long long int' : they can be of order n^2 */ int64_t k, knew, nl,nr, sump,sumq; h = n / 2 + 1; k = (int64_t)h * (h - 1) / 2; for (i = 0; i < n; ++i) { y[i] = x[i]; left [i] = n - i + 1; right[i] = (i <= h) ? n : n - (i - h); /* the n - (i-h) is from the paper; original code had `n' */ } R_qsort(y, 1, n); /* y := sort(x) */ nl = (int64_t)n * (n + 1) / 2; nr = (int64_t)n * n; knew = k + nl;/* = k + (n+1 \over 2) */ found = FALSE; #ifdef DEBUG_qn REprintf("qn0(): h,k= %2d,%2d; nl,nr= %d,%d\n", h,k, nl,nr); #endif /* L200: */ while(!found && nr - nl > n) { j = 0; /* Truncation to float : try to make sure that the same values are got later (guard bits !) */ for (i = 1; i < n; ++i) { if (left[i] <= right[i]) { weight[j] = right[i] - left[i] + 1; jh = left[i] + weight[j] / 2; work[j] = (float)(y[i] - y[n - jh]); ++j; } } trial = whimed_i(work, weight, j, a_cand, a_srt, /*iw_cand*/ p); #ifdef DEBUG_qn REprintf(" ..!found: whimed("); # ifdef DEBUG_long REprintf("wrk=c("); for(i=0; i < j; i++) REprintf("%s%g", (i>0)? ", " : "", work[i]); REprintf("),\n wgt=c("); for(i=0; i < j; i++) REprintf("%s%d", (i>0)? ", " : "", weight[i]); REprintf("), j= %3d) -> trial= %7g\n", j, trial); # else REprintf("j=%3d) -> trial= %g:", j, trial); # endif #endif j = 0; for (i = n - 1; i >= 0; --i) { while (j < n && ((float)(y[i] - y[n - j - 1])) < trial) ++j; p[i] = j; } #ifdef DEBUG_qn REprintf(" f_1: j=%2d", j); #endif j = n + 1; for (i = 0; i < n; ++i) { while ((float)(y[i] - y[n - j + 1]) > trial) --j; q[i] = j; } sump = 0; sumq = 0; for (i = 0; i < n; ++i) { sump += p[i]; sumq += q[i] - 1; } #ifdef DEBUG_qn REprintf(" f_2 -> j=%2d, sump|q= %lld,%lld", j, sump,sumq); #endif if (knew <= sump) { for (i = 0; i < n; ++i) right[i] = p[i]; nr = sump; #ifdef DEBUG_qn REprintf("knew <= sump =: nr , new right[]\n"); #endif } else if (knew > sumq) { for (i = 0; i < n; ++i) left[i] = q[i]; nl = sumq; #ifdef DEBUG_qn REprintf("knew > sumq =: nl , new left[]\n"); #endif } else { /* sump < knew <= sumq */ found = TRUE; #ifdef DEBUG_qn REprintf("sump < knew <= sumq ---> FOUND\n"); #endif } } /* while */ if (found) return trial; else { #ifdef DEBUG_qn REprintf(".. not fnd -> new work[]"); #endif j = 0; for (i = 1; i < n; ++i) { for (jj = left[i]; jj <= right[i]; ++jj) { work[j] = y[i] - y[n - jj]; j++; }/* j will be = sum_{i=2}^n (right[i] - left[i] + 1)_{+} */ } #ifdef DEBUG_qn REprintf(" of length %d; knew-nl=%d\n", j, knew-nl); #endif /* return pull(work, j - 1, knew - nl) : */ knew -= (nl + 1); /* -1: 0-indexing */ rPsort(work, j, knew); return(work[knew]); } } /* qn0 */
static void clowess(double *x, double *y, int n, double f, int nsteps, double delta, double *ys, double *rw, double *res) { int i, iter, j, last, m1, m2, nleft, nright, ns; int ok; double alpha, c1, c9, cmad, cut, d1, d2, denom, r, sc; if (n < 2) { ys[0] = y[0]; return; } /* nleft, nright, last, etc. must all be shifted to get rid of these: */ x--; y--; ys--; /* at least two, at most n points */ ns = imax2(2, imin2(n, (int)(f*n + 1e-7))); /* robustness iterations */ iter = 1; while (iter <= nsteps+1) { nleft = 1; nright = ns; last = 0; /* index of prev estimated point */ i = 1; /* index of current point */ for(;;) { if (nright < n) { /* move nleft, nright to right */ /* if radius decreases */ d1 = x[i] - x[nleft]; d2 = x[nright+1] - x[i]; /* if d1 <= d2 with */ /* x[nright+1] == x[nright], */ /* lowest fixes */ if (d1 > d2) { /* radius will not */ /* decrease by */ /* move right */ nleft++; nright++; continue; } } /* fitted value at x[i] */ lowest(&x[1], &y[1], n, &x[i], &ys[i], nleft, nright, res, iter>1, rw, &ok); if (!ok) ys[i] = y[i]; /* all weights zero */ /* copy over value (all rw==0) */ if (last < i-1) { denom = x[i]-x[last]; /* skipped points -- interpolate */ /* non-zero - proof? */ for(j = last+1; j < i; j++) { alpha = (x[j]-x[last])/denom; ys[j] = alpha*ys[i] + (1.-alpha)*ys[last]; } } /* last point actually estimated */ last = i; /* x coord of close points */ cut = x[last]+delta; for (i = last+1; i <= n; i++) { if (x[i] > cut) break; if (x[i] == x[last]) { ys[i] = ys[last]; last = i; } } i = imax2(last+1, i-1); if (last >= n) break; } /* residuals */ for(i = 0; i < n; i++) res[i] = y[i+1] - ys[i+1]; /* overall scale estimate */ sc = 0.; for(i = 0; i < n; i++) sc += fabs(res[i]); sc /= n; /* compute robustness weights */ /* except last time */ if (iter > nsteps) break; /* Note: The following code, biweight_{6 MAD|Ri|} is also used in stl(), loess and several other places. --> should provide API here (MM) */ for(i = 0 ; i < n ; i++) rw[i] = fabs(res[i]); /* Compute cmad := 6 * median(rw[], n) ---- */ /* FIXME: We need C API in R for Median ! */ m1 = n/2; /* partial sort, for m1 & m2 */ rPsort((double *)rw, (int)n, (int)m1); if(n % 2 == 0) { m2 = n-m1-1; rPsort((double *)rw, (int)n, (int)m2); cmad = 3.*(rw[m1]+rw[m2]); } else { /* n odd */ cmad = 6.*rw[m1]; } if(cmad < 1e-7 * sc) /* effectively zero */ break; c9 = 0.999*cmad; c1 = 0.001*cmad; for(i = 0 ; i < n ; i++) { r = fabs(res[i]); if (r <= c1) rw[i] = 1.; else if (r <= c9) rw[i] = fsquare(1.-fsquare(r/cmad)); else rw[i] = 0.; } iter++; } }
// For lots of subsets of size nwhich, compute the exact fit to those data // points and the residuals from all the data points. // copied with modification from MASS/src/lqs.c // Copyright (C) 1998-2007 B. D. Ripley // Copyright (C) 1999 R Development Core Team // TODO: rewrite void LQSEstimator::operator()(const Data& data, double* coef_ptr, double* fitted_ptr, double* resid_ptr, double* scale_ptr) { int nnew = nwhich, pp = p; int i, iter, nn = n, trial; int rank, info, n100 = 100; int firsttrial = 1; double a = 0.0, tol = 1.0e-7, sum, thiscrit, best = DBL_MAX, target, old, newp, dummy, k0 = pk0; const arma::vec& y = data.y; const arma::mat& x = data.x; double coef[p]; arma::vec coef_vec(coef, p, false, true); double qraux[p]; double work[2*p]; double res[n]; arma::vec res_vec(res, n, false, true); double yr[nwhich]; double xr[nwhich * p]; arma::vec yr_vec(yr, nwhich, false, true); arma::mat xr_mat(xr, nwhich, p, false, true); double bestcoef[p]; int pivot[p]; arma::uvec which_vec(nwhich); //int bestone[nwhich]; target = (nn - pp)* (beta); for(trial = 0; trial < ntrials; trial++) { R_CheckUserInterrupt(); // get this trial's subset which_vec = indices.col(trial); yr_vec = y.elem(which_vec); xr_mat = x.rows(which_vec); /* compute fit, find residuals */ F77_CALL(dqrdc2)(xr, &nnew, &nnew, &pp, &tol, &rank, qraux, pivot, work); if(rank < pp) { sing++; continue; } F77_CALL(dqrsl)(xr, &nnew, &nnew, &rank, qraux, yr, &dummy, yr, coef, &dummy, &dummy, &n100, &info); res_vec = y - x * coef_vec; /* S estimation */ if(firsttrial) { for(i = 0; i < nn; i ++) res[i] = fabs(res[i]); rPsort(res, nn, nn/2); old = res[nn/2]/0.6745; /* MAD provides the initial scale */ firsttrial = 0; } else { /* only find optimal scale if it will be better than existing best solution */ sum = 0.0; for(i = 0; i < nn; i ++) sum += chi(res[i], k0 * best); if(sum > target) continue; old = best; } /* now solve for scale S by re-substitution */ for(iter = 0; iter < 30; iter++) { /*printf("iter %d, s = %f sum = %f %f\n", iter, old, sum, target);*/ sum = 0.0; for(i = 0; i < nn; i ++) sum += chi(res[i], k0 * old); newp = sqrt(sum/target) * old; if(fabs(sum/target - 1.) < 1e-4) break; old = newp; } thiscrit = newp; /* first trial might be singular, so use fence */ if(thiscrit < best) { sum = 0.0; for(i = 0; i < nn; i ++) sum += chi(res[i], k0 * best); best = thiscrit; for(i = 0; i < pp; i++) bestcoef[i] = coef[i]; bestcoef[0] += a; } } /* for(trial in 0:ntrials) */ crit = (best < 0.0) ? 0.0 : best; if(sample) PutRNGstate(); /* lqs_free(); */ // output arma::vec coef_out(coef_ptr, p, false, true); arma::vec fitted_out(fitted_ptr, n, false, true); arma::vec resid_out(resid_ptr, n, false, true); arma::vec scale_out(scale_ptr, 1, false, true); for (i = 0; i < p; i++) coef_out[i] = bestcoef[i]; fitted_out = x * coef_out; resid_out = y - fitted_out; scale_out = crit; }