// DONE: return 'uniqlist' as a vector (same as duplist) and write a separate function to get group sizes // Also improvements for numeric type with a hack of checking unsigned int (to overcome NA/NaN/Inf/-Inf comparisons) (> 2x speed-up) SEXP uniqlist(SEXP l, SEXP order) { // This works like UNIX uniq as referred to by ?base::unique; i.e., it // drops immediately repeated rows but doesn't drop duplicates of any // previous row. Unless, order is provided, then it also drops any previous // row. l must be a list of same length vectors ans is allocated first // (maximum length the number of rows) and the length returned in anslen. // DONE: ans is now grown Rboolean b, byorder; unsigned long long *ulv; // for numeric check speed-up SEXP v, ans, class; R_len_t i, j, nrow, ncol, len, thisi, previ, isize=1000; int *iidx = Calloc(isize, int); // for 'idx' int *n_iidx; // to catch allocation errors using Realloc! if (NA_INTEGER != NA_LOGICAL || sizeof(NA_INTEGER)!=sizeof(NA_LOGICAL)) error("Have assumed NA_INTEGER == NA_LOGICAL (currently R_NaInt). If R changes this in future (seems unlikely), an extra case is required; a simple change."); ncol = length(l); nrow = length(VECTOR_ELT(l,0)); len = 1; iidx[0] = 1; // first row is always the first of the first group byorder = INTEGER(order)[0] != -1; // Using MISSING() does not seem stable under windows. Always having arguments passed in seems a good idea anyway. thisi = byorder ? INTEGER(order)[0]-1 : 0; for (i=1; i<nrow; i++) { previ = thisi; thisi = byorder ? INTEGER(order)[i]-1 : i; j = ncol; // the last column varies the most frequently so check that first and work backwards b = TRUE; while (--j>=0 && b) { v=VECTOR_ELT(l,j); switch (TYPEOF(v)) { case INTSXP : case LGLSXP : b=INTEGER(v)[thisi]==INTEGER(v)[previ]; break; case STRSXP : // fix for #469, when key is set, duplicated calls uniqlist, where encoding // needs to be taken care of. b=ENC2UTF8(STRING_ELT(v,thisi))==ENC2UTF8(STRING_ELT(v,previ)); break; // marked non-utf8 encodings are converted to utf8 so as to match properly when inputs are of different encodings. case REALSXP : ulv = (unsigned long long *)REAL(v); b = ulv[thisi] == ulv[previ]; // (gives >=2x speedup) if (!b) { class = getAttrib(v, R_ClassSymbol); twiddle = (isString(class) && STRING_ELT(class, 0)==char_integer64) ? &i64twiddle : &dtwiddle; b = twiddle(ulv, thisi, 1) == twiddle(ulv, previ, 1); } break; // TO DO: store previ twiddle call, but it'll need to be vector since this is in a loop through columns. Hopefully the first == will short circuit most often default : error("Type '%s' not supported", type2char(TYPEOF(v))); } } if (!b) iidx[len++] = i+1; if (len >= isize) { isize = 1.1*isize*nrow/i; n_iidx = Realloc(iidx, isize, int); if (n_iidx != NULL) iidx = n_iidx; else error("Error in reallocating memory in 'uniqlist'\n"); } }
void bmerge_r(int xlowIn, int xuppIn, int ilowIn, int iuppIn, int col, int lowmax, int uppmax) // col is >0 and <=ncol-1 if this range of [xlow,xupp] and [ilow,iupp] match up to but not including that column // lowmax=1 if xlowIn is the lower bound of this group (needed for roll) // uppmax=1 if xuppIn is the upper bound of this group (needed for roll) { int xlow=xlowIn, xupp=xuppIn, ilow=ilowIn, iupp=iuppIn, j, k, ir, lir, tmp; SEXP class; ir = lir = ilow + (iupp-ilow)/2; // lir = logical i row. if (o) ir = o[lir]-1; // ir = the actual i row if i were ordered ic = VECTOR_ELT(i,icols[col]-1); // ic = i column xc = VECTOR_ELT(x,xcols[col]-1); // xc = x column // it was checked in bmerge() that the types are equal switch (TYPEOF(xc)) { case LGLSXP : case INTSXP : // including factors ival.i = INTEGER(ic)[ir]; while(xlow < xupp-1) { mid = xlow + (xupp-xlow)/2; // Same as (xlow+xupp)/2 but without risk of overflow xval.i = INTEGER(xc)[XIND(mid)]; if (xval.i<ival.i) { // relies on NA_INTEGER == INT_MIN, tested in init.c xlow=mid; } else if (xval.i>ival.i) { // TO DO: is *(&xlow, &xupp)[0|1]=mid more efficient than branch? xupp=mid; } else { // xval.i == ival.i including NA_INTEGER==NA_INTEGER // branch mid to find start and end of this group in this column // TO DO?: not if mult=first|last and col<ncol-1 tmplow = mid; tmpupp = mid; while(tmplow<xupp-1) { mid = tmplow + (xupp-tmplow)/2; xval.i = INTEGER(xc)[XIND(mid)]; if (xval.i == ival.i) tmplow=mid; else xupp=mid; } while(xlow<tmpupp-1) { mid = xlow + (tmpupp-xlow)/2; xval.i = INTEGER(xc)[XIND(mid)]; if (xval.i == ival.i) tmpupp=mid; else xlow=mid; } // xlow and xupp now surround the group in xc, we only need this range for the next column break; } } tmplow = lir; tmpupp = lir; while(tmplow<iupp-1) { // TO DO: could double up from lir rather than halving from iupp mid = tmplow + (iupp-tmplow)/2; xval.i = INTEGER(ic)[ o ? o[mid]-1 : mid ]; // reuse xval to search in i if (xval.i == ival.i) tmplow=mid; else iupp=mid; } while(ilow<tmpupp-1) { mid = ilow + (tmpupp-ilow)/2; xval.i = INTEGER(ic)[ o ? o[mid]-1 : mid ]; if (xval.i == ival.i) tmpupp=mid; else ilow=mid; } // ilow and iupp now surround the group in ic, too break; case STRSXP : ival.s = ENC2UTF8(STRING_ELT(ic,ir)); while(xlow < xupp-1) { mid = xlow + (xupp-xlow)/2; xval.s = ENC2UTF8(STRING_ELT(xc, XIND(mid))); tmp = StrCmp(xval.s, ival.s); // uses pointer equality first, NA_STRING are allowed and joined to, then uses strcmp on CHAR(). if (tmp == 0) { // TO DO: deal with mixed encodings and locale optionally tmplow = mid; tmpupp = mid; while(tmplow<xupp-1) { mid = tmplow + (xupp-tmplow)/2; xval.s = ENC2UTF8(STRING_ELT(xc, XIND(mid))); if (ival.s == xval.s) tmplow=mid; else xupp=mid; // the == here handles encodings as well. Marked non-utf8 encodings are converted to utf-8 using ENC2UTF8. } while(xlow<tmpupp-1) { mid = xlow + (tmpupp-xlow)/2; xval.s = ENC2UTF8(STRING_ELT(xc, XIND(mid))); if (ival.s == xval.s) tmpupp=mid; else xlow=mid; // see above re == } break; } else if (tmp < 0) { xlow=mid; } else { xupp=mid; } } tmplow = lir; tmpupp = lir; while(tmplow<iupp-1) { mid = tmplow + (iupp-tmplow)/2; xval.s = ENC2UTF8(STRING_ELT(ic, o ? o[mid]-1 : mid)); if (xval.s == ival.s) tmplow=mid; else iupp=mid; // see above re == } while(ilow<tmpupp-1) { mid = ilow + (tmpupp-ilow)/2; xval.s = ENC2UTF8(STRING_ELT(ic, o ? o[mid]-1 : mid)); if (xval.s == ival.s) tmpupp=mid; else ilow=mid; // see above re == } break; case REALSXP : class = getAttrib(xc, R_ClassSymbol); twiddle = (isString(class) && STRING_ELT(class, 0)==char_integer64) ? &i64twiddle : &dtwiddle; ival.ll = twiddle(DATAPTR(ic), ir, 1); while(xlow < xupp-1) { mid = xlow + (xupp-xlow)/2; xval.ll = twiddle(DATAPTR(xc), XIND(mid), 1); if (xval.ll<ival.ll) { xlow=mid; } else if (xval.ll>ival.ll) { xupp=mid; } else { // xval.ll == ival.ll) tmplow = mid; tmpupp = mid; while(tmplow<xupp-1) { mid = tmplow + (xupp-tmplow)/2; xval.ll = twiddle(DATAPTR(xc), XIND(mid), 1); if (xval.ll == ival.ll) tmplow=mid; else xupp=mid; } while(xlow<tmpupp-1) { mid = xlow + (tmpupp-xlow)/2; xval.ll = twiddle(DATAPTR(xc), XIND(mid), 1); if (xval.ll == ival.ll) tmpupp=mid; else xlow=mid; } break; } } tmplow = lir; tmpupp = lir; while(tmplow<iupp-1) { mid = tmplow + (iupp-tmplow)/2; xval.ll = twiddle(DATAPTR(ic), o ? o[mid]-1 : mid, 1 ); if (xval.ll == ival.ll) tmplow=mid; else iupp=mid; } while(ilow<tmpupp-1) { mid = ilow + (tmpupp-ilow)/2; xval.ll = twiddle(DATAPTR(ic), o ? o[mid]-1 : mid, 1 ); if (xval.ll == ival.ll) tmpupp=mid; else ilow=mid; } break; default: error("Type '%s' not supported as key column", type2char(TYPEOF(xc))); } if (xlow<xupp-1) { // if value found, low and upp surround it, unlike standard binary search where low falls on it if (col<ncol-1) bmerge_r(xlow, xupp, ilow, iupp, col+1, 1, 1); // final two 1's are lowmax and uppmax else { int len = xupp-xlow-1; if (len>1) allLen1[0] = FALSE; for (j=ilow+1; j<iupp; j++) { // usually iterates once only for j=ir k = o ? o[j]-1 : j; retFirst[k] = xlow+2; // extra +1 for 1-based indexing at R level retLength[k]= len; } } } else if (roll!=0.0 && col==ncol-1) { // runs once per i row (not each search test), so not hugely time critical if (xlow != xupp-1 || xlow<xlowIn || xupp>xuppIn) error("Internal error: xlow!=xupp-1 || xlow<xlowIn || xupp>xuppIn"); if (rollToNearest) { // value of roll ignored currently when nearest if ( (!lowmax || xlow>xlowIn) && (!uppmax || xupp<xuppIn) ) { if ( ( TYPEOF(ic)==REALSXP && REAL(ic)[ir]-REAL(xc)[XIND(xlow)] <= REAL(xc)[XIND(xupp)]-REAL(ic)[ir] ) || ( TYPEOF(ic)<=INTSXP && INTEGER(ic)[ir]-INTEGER(xc)[XIND(xlow)] <= INTEGER(xc)[XIND(xupp)]-INTEGER(ic)[ir] )) { retFirst[ir] = xlow+1; retLength[ir] = 1; } else { retFirst[ir] = xupp+1; retLength[ir] = 1; } } else if (uppmax && xupp==xuppIn && rollends[1]) { retFirst[ir] = xlow+1; retLength[ir] = 1; } else if (lowmax && xlow==xlowIn && rollends[0]) { retFirst[ir] = xupp+1; retLength[ir] = 1; } } else { // fix for #1405 if (TYPEOF(ic) == REALSXP) { xvalupp.ll = REAL(xc)[XIND(xupp)]; xvallow.ll = REAL(xc)[XIND(xlow)]; ival.ll = REAL(ic)[ir]; } if ( ( (roll>0.0 && (!lowmax || xlow>xlowIn) && (xupp<xuppIn || !uppmax || rollends[1])) || (roll<0.0 && xupp==xuppIn && uppmax && rollends[1]) ) && ( (TYPEOF(ic)==REALSXP && ((double)(ival.ll-xvallow.ll)-rollabs<1e-6 || (double)(ival.ll-xvallow.ll) == rollabs)) // #1007 fix || (TYPEOF(ic)<=INTSXP && (double)(INTEGER(ic)[ir]-INTEGER(xc)[XIND(xlow)])-rollabs<1e-6 ) || (TYPEOF(ic)==STRSXP) )) { retFirst[ir] = xlow+1; retLength[ir] = 1; } else if ( ( (roll<0.0 && (!uppmax || xupp<xuppIn) && (xlow>xlowIn || !lowmax || rollends[0])) || (roll>0.0 && xlow==xlowIn && lowmax && rollends[0]) ) && ( (TYPEOF(ic)==REALSXP && ((double)(xvalupp.ll-ival.ll)-rollabs<1e-6 || (double)(xvalupp.ll-ival.ll) == rollabs)) // 1007 fix || (TYPEOF(ic)<=INTSXP && (double)(INTEGER(xc)[XIND(xupp)]-INTEGER(ic)[ir])-rollabs<1e-6 ) || (TYPEOF(ic)==STRSXP) )) { retFirst[ir] = xupp+1; // == xlow+2 retLength[ir] = 1; } } if (iupp-ilow > 2 && retFirst[ir]!=NA_INTEGER) { // >=2 equal values in the last column being rolling to the same point. for (j=ilow+1; j<iupp; j++) { // will rewrite retFirst[ir] to itself, but that's ok if (o) k=o[j]-1; else k=j; retFirst[k] = retFirst[ir]; retLength[k]= retLength[ir]; } } } if (ilow>ilowIn && (xlow>xlowIn || (roll!=0.0 && col==ncol-1))) bmerge_r(xlowIn, xlow+1, ilowIn, ilow+1, col, lowmax, uppmax && xlow+1==xuppIn); if (iupp<iuppIn && (xupp<xuppIn || (roll!=0.0 && col==ncol-1))) bmerge_r(xupp-1, xuppIn, iupp-1, iuppIn, col, lowmax && xupp-1==xlowIn, uppmax); }