int Bvlprint(Biobuf *bp, const char *fmt, va_list args) { Fmt f; Rune buf[256]; int res; if(utf8locale()) return Bvprint(bp, fmt, args); f.runes = 1; f.start = (char*)buf; f.to = (char*)buf; f.stop = (char*)(buf + nelem(buf) - 1); f.flush = fmtBlflush; f.farg = bp; f.nfmt = 0; va_copy(f.args, args); res = dofmt(&f, fmt); va_end(f.args); if(res > 0 && fmtBlflush(&f) == 0) return -1; return res; }
// count strings x, their prefixes or suffixes // and return counts greater than lower. // // note that for string counting the number of // nodes usually will be much greater than the // number of result strings. however, we could // not determine a better upper bound without // traversing the tree. SEXP R_utf8CountString(SEXP x, SEXP R_n, SEXP R_lower, SEXP R_type, SEXP R_verbose, SEXP R_persistent, SEXP R_useBytes) { if (!persistent && rpn) { cpnfree(rpn); rpn = 0; warning("cleaning up stale state"); } if (isNull(x) || TYPEOF(x) != VECSXP) error("'x' not of type list"); if (isNull(R_n) || TYPEOF(R_n) != INTSXP) error("'n' not of type integer"); if (isNull(R_lower) || TYPEOF(R_lower) != INTSXP) error("'lower' not of type integer"); if (isNull(R_type) || TYPEOF(R_type) != INTSXP) error("'type' not of type integer"); if (isNull(R_verbose) || TYPEOF(R_verbose) != LGLSXP) error("'verbose' not of type logical"); if (isNull(R_persistent) || TYPEOF(R_persistent) != LGLSXP) error("'persistent' not of type logical"); if (isNull(R_useBytes) || TYPEOF(R_useBytes) != LGLSXP) error("'useBytes' not of type logical"); long l, n = 0; int h, i, j, k, type; const unsigned char *c; SEXP r, s; if (!persistent) { known_to_be_utf8 = utf8locale(); known_to_be_latin1 = latin1locale(); use_bytes = *LOGICAL(R_useBytes); } else if (use_bytes != *LOGICAL(R_useBytes)) error("change of useBytes in persistent mode"); else if (known_to_be_utf8 != utf8locale() || known_to_be_latin1 != latin1locale()) error_reset("change of locale in persistent mode"); persistent = LOGICAL(R_persistent)[0]; if (!persistent) { tcnt = INTEGER(R_lower)[0]; if (tcnt < 0) error_reset("'lower' invalid value"); } type = INTEGER(R_type)[0]; switch (type) { case 0: // strings inc = 0; break; case 1: // prefixes case 2: // suffixes case 3: n = INTEGER(R_n)[0]; if (n < 0) error_reset("'n' invalid value"); if (n == 0) return R_NilValue; inc = 1; break; default: error_reset("'type' invalid value"); } #ifdef _TIME_H clock_t t2, t1, t0 = clock(); if (LOGICAL(R_verbose)[0] == TRUE) { Rprintf("counting ..."); #ifdef __DEBUG Rprintf("\n"); #endif } #endif nap = 0; for (i = 0; i < LENGTH(x); i++) { r = VECTOR_ELT(x, i); if (TYPEOF(r) != STRSXP) error_reset("not of type character"); for (j = 0; j < LENGTH(r); j++) { s = STRING_ELT(r, j); l = LENGTH(s); if (s == NA_STRING || !l) continue; #ifdef __TRANSLATE if (!use_bytes) { c = (const unsigned char *) translateChar(s); l = strlen((const char *) c); } else c = (const unsigned char *) CHAR(s); #else c = (const unsigned char *) CHAR(s); #endif if (!use_bytes && known_to_be_utf8 && tau_pcre_valid_utf8(c, l) >= 0) error_reset("not a valid UTF-8 string"); if (type > 1) { if (reverse_copy_utf8(c, l, n) >= 0) error_reset("cannot copy string to buffer"); } else { if (type < 1) n = l; h = 0; for (k = 0; k < l; k++) { if (c[k] == '\0') continue; if (k < __CBUF_SIZE - 1) cbuf[k] = c[k]; else error_reset("cannot copy string to buffer"); if (use_bytes || !known_to_be_utf8 || (c[k] & 0xC0) != 0x80) { h++; if (h > n) break; } } cbuf[k] = 0; } #ifdef __DEBUG Rprintf(" %s\n", cbuf); #endif h = nap + 1; lpn = 0; rpn = cpncount(rpn, cbuf); if (nap != h) error_reset("cannot add string to tree"); if (!inc) { if (lpn) // should never be NULL lpn->count++; } } R_CheckUserInterrupt(); } #ifdef _TIME_H t1 = clock(); if (LOGICAL(R_verbose)[0] == TRUE) { Rprintf(" %i string(s) using %i nodes [%.2fs]\n", nap, ncpn, ((double) t1 - t0) / CLOCKS_PER_SEC); if (!persistent) Rprintf("writing ..."); #ifdef __DEBUG Rprintf("\n"); #endif } #endif if (persistent) return R_NilValue; nap = enc = 0; rval = PROTECT(allocVector(INTSXP, ncpn)); setAttrib(rval, R_NamesSymbol, (nval = allocVector(STRSXP, ncpn))); cpnretprefix(rpn, 0); if (ncpn) { cpnfree(rpn); rpn = 0; error_reset("cannot retrieve count(s)"); } rpn = 0; // reverse the reversed strings if (type == 2) for (i = 0; i < nap; i++) { s = STRING_ELT(nval, i); reverse_copy_utf8((const unsigned char *) CHAR(s), LENGTH(s), -1); SET_STRING_ELT(nval, i, mkCharCE((const char *) cbuf, getCharCE(s))); } #ifdef _TIME_H t2 = clock(); if (LOGICAL(R_verbose)[0] == TRUE) Rprintf(" %i strings [%.2fs]\n", nap, ((double) t2 - t1) / CLOCKS_PER_SEC); #endif // reduce if (nap < LENGTH(rval)) { r = PROTECT(allocVector(INTSXP, nap)); setAttrib(r, R_NamesSymbol, (s = allocVector(STRSXP, nap))); while (nap-- > 0) { INTEGER(r)[nap] = INTEGER(rval)[nap]; SET_STRING_ELT(s, nap, STRING_ELT(nval, nap)); } UNPROTECT(2); return r; } UNPROTECT(1); return rval; }
SEXP R_utf8CountNgram(SEXP x, SEXP R_n, SEXP R_lower, SEXP R_verbose, SEXP R_persistent, SEXP R_useBytes) { if (!persistent && rpn) { cpnfree(rpn); rpn = 0; warning("cleaning up stale state"); } if (isNull(x) || TYPEOF(x) != VECSXP) error("'x' not of type list"); if (isNull(R_n) || TYPEOF(R_n) != INTSXP) error("'n' not of type integer"); if (isNull(R_lower) || TYPEOF(R_lower) != INTSXP) error("'lower' not of type integer"); if (isNull(R_verbose) || TYPEOF(R_verbose) != LGLSXP) error("'verbose' not of type logical"); if (isNull(R_persistent) || TYPEOF(R_persistent) != LGLSXP) error("'persistent' not of type logical"); if (isNull(R_useBytes) || TYPEOF(R_useBytes) != LGLSXP) error("'useBytes' not of type logical"); long l; int h, i, j, k, m, n; const unsigned char *c; SEXP r, s; if (!persistent) { known_to_be_utf8 = utf8locale(); known_to_be_latin1 = latin1locale(); use_bytes = *LOGICAL(R_useBytes); } else if (use_bytes != *LOGICAL(R_useBytes)) error("change of useBytes in persistent mode"); else if (known_to_be_utf8 != utf8locale() || known_to_be_latin1 != latin1locale()) error_reset("change of locale in persistent mode"); persistent = LOGICAL(R_persistent)[0]; n = INTEGER(R_n)[0]; if (n < 0) error_reset("'n' invalid value"); if (n == 0) return R_NilValue; if (!persistent) { tcnt = INTEGER(R_lower)[0]; if (tcnt < 0) error_reset("'lowr' invalid value"); } #ifdef _TIME_H clock_t t2, t1, t0 = clock(); if (LOGICAL(R_verbose)[0] == TRUE) { Rprintf("counting ..."); #ifdef __DEBUG Rprintf("\n"); #endif } #endif nap = 0; inc = 1; for (i = 0; i < LENGTH(x); i++) { r = VECTOR_ELT(x, i); if (TYPEOF(r) != STRSXP) error_reset("not of type character"); for (j = 0; j < LENGTH(r); j++) { s = STRING_ELT(r, j); l = LENGTH(s); if (s == NA_STRING || !l) continue; #ifdef __TRANSLATE if (!use_bytes) { c = (const unsigned char *) translateChar(s); l = strlen((const char *) c); } else c = (const unsigned char *) CHAR(s); #else c = (const unsigned char *) CHAR(s); #endif // strings of unknown encoding are not translated // or strings marked as UTF-8 could be invalid, so // we have to check. if (!use_bytes && known_to_be_utf8 && tau_pcre_valid_utf8(c, l) >= 0) error_reset("not a valid UTF-8 string"); /* in an UTF-8 multibyte sequence any byte * except the first has 10 as its leading bits. * thus, 1) the byte cannot be the start of a * suffix and 2) we have to expand the current * window. * * '\1' is a special boundary marker that triggers * reduced counting, ie omission of windows which * start at a boundary and shrinkage of windows * which end at a boundary. */ int b; for (k = 0; k < l; k++) { if (c[k] == '\0') continue; if (!use_bytes && known_to_be_utf8 && (c[k] & 0xC0) == 0x80) continue; if (k == 1 && c[0] == '\1') continue; h = 0; m = k; b = k; while (m < l) { if (m-k < __CBUF_SIZE) cbuf[m-k] = c[m]; else error_reset("cannot copy string to buffer"); if (use_bytes || !known_to_be_utf8 || (c[m] & 0xC0) != 0x80) { h++; if (h > n) { h--; if (c[m] == '\1') { h--; m = b; } break; } b = m; } m++; } cbuf[m-k] = 0; #ifdef __DEBUG Rprintf(" %i %i %i %s\n", k+1, m, h, cbuf); #endif h = nap + 1; rpn = cpncount(rpn, cbuf); if (nap != h) error_reset("cannot add string to tree"); } } R_CheckUserInterrupt(); } #ifdef _TIME_H t1 = clock(); if (LOGICAL(R_verbose)[0] == TRUE) { Rprintf(" %i string(s) using %i nodes [%.2fs]\n", nap, ncpn, ((double) t1 - t0) / CLOCKS_PER_SEC); if (!persistent) Rprintf("writing ..."); #ifdef __DEBUG Rprintf("\n"); #endif } #endif if (persistent) return R_NilValue; nap = enc = 0; rval = PROTECT(allocVector(INTSXP, ncpn)); setAttrib(rval, R_NamesSymbol, (nval = allocVector(STRSXP, ncpn))); cpnretprefix(rpn, 0); if (ncpn) { cpnfree(rpn); rpn = 0; error("cannot retrieve count(s)"); } rpn = 0; #ifdef _TIME_H t2 = clock(); if (LOGICAL(R_verbose)[0] == TRUE) Rprintf(" %i strings [%.2fs]\n", nap, ((double) t2 - t1) / CLOCKS_PER_SEC); #endif // reduce if (nap < LENGTH(rval)) { r = PROTECT(allocVector(INTSXP, nap)); setAttrib(r, R_NamesSymbol, (s = allocVector(STRSXP, nap))); while (nap-- > 0) { INTEGER(r)[nap] = INTEGER(rval)[nap]; SET_STRING_ELT(s, nap, STRING_ELT(nval, nap)); } UNPROTECT(2); return r; } UNPROTECT(1); return rval; }