示例#1
0
int
Bvlprint(Biobuf *bp, const char *fmt, va_list args)
{
	Fmt f;
	Rune buf[256];
	int res;

	if(utf8locale())
		return Bvprint(bp, fmt, args);

	f.runes = 1;
	f.start = (char*)buf;
	f.to = (char*)buf;
	f.stop = (char*)(buf + nelem(buf) - 1);
	f.flush = fmtBlflush;
	f.farg = bp;
	f.nfmt = 0;

	va_copy(f.args, args);
	res = dofmt(&f, fmt);
	va_end(f.args);
	if(res > 0 && fmtBlflush(&f) == 0)
		return -1;
	return res;
}
示例#2
0
// count strings x, their prefixes or suffixes
// and return counts greater than lower.
//
// note that for string counting the number of
// nodes usually will be much greater than the
// number of result strings. however, we could
// not determine a better upper bound without 
// traversing the tree.
SEXP R_utf8CountString(SEXP x, SEXP R_n, SEXP R_lower, SEXP R_type,
		       SEXP R_verbose, SEXP R_persistent, SEXP R_useBytes) {
    if (!persistent && rpn) {
	cpnfree(rpn);
	rpn = 0;
	warning("cleaning up stale state");
    }
    if (isNull(x) || TYPEOF(x) != VECSXP)
	error("'x' not of type list");
    if (isNull(R_n) || TYPEOF(R_n) != INTSXP)
	error("'n' not of type integer");
    if (isNull(R_lower) || TYPEOF(R_lower) != INTSXP)
	error("'lower' not of type integer");
    if (isNull(R_type) || TYPEOF(R_type) != INTSXP)
	error("'type' not of type integer");
    if (isNull(R_verbose) || TYPEOF(R_verbose) != LGLSXP)
	error("'verbose' not of type logical");
    if (isNull(R_persistent) || TYPEOF(R_persistent) != LGLSXP)
	error("'persistent' not of type logical");
    if (isNull(R_useBytes) || TYPEOF(R_useBytes) != LGLSXP)
	error("'useBytes' not of type logical");
    long l, n = 0;
    int h, i, j, k, type;
    const unsigned char *c;
    SEXP r, s;

    if (!persistent) {
	known_to_be_utf8   = utf8locale();
	known_to_be_latin1 = latin1locale();
	use_bytes          = *LOGICAL(R_useBytes);
    } else
    if (use_bytes != *LOGICAL(R_useBytes))
	error("change of useBytes in persistent mode");
    else
    if (known_to_be_utf8   != utf8locale() ||
        known_to_be_latin1 != latin1locale())
	error_reset("change of locale in persistent mode");
    persistent = LOGICAL(R_persistent)[0];

    if (!persistent) {
	tcnt = INTEGER(R_lower)[0];
	if (tcnt < 0)
	    error_reset("'lower' invalid value");
    }

    type = INTEGER(R_type)[0];

    switch (type) {
	case 0:				// strings
	    inc = 0;
	    break;
	case 1:				// prefixes
	case 2:				// suffixes
	case 3:
	    n = INTEGER(R_n)[0];
	    if (n < 0) 
		error_reset("'n' invalid value");
	    if (n == 0)
		return R_NilValue;
	    inc = 1;
	    break;
	default:
	    error_reset("'type' invalid value");
    }
#ifdef _TIME_H
    clock_t t2, t1, t0 = clock();
    if (LOGICAL(R_verbose)[0] == TRUE) {
	Rprintf("counting ...");
#ifdef __DEBUG
	Rprintf("\n");
#endif
    }
#endif

    nap = 0;

    for (i = 0; i < LENGTH(x); i++) {
	r = VECTOR_ELT(x, i);
	if (TYPEOF(r) != STRSXP)
	    error_reset("not of type character");
	for (j = 0; j < LENGTH(r); j++) {
	    s = STRING_ELT(r, j);
	    l = LENGTH(s);
	    if (s == NA_STRING || !l)
		continue;
#ifdef __TRANSLATE
	    if (!use_bytes) {
		c = (const unsigned char *) translateChar(s);
		l = strlen((const char *) c);
	    } else
		c = (const unsigned char *) CHAR(s);
#else
	    c = (const unsigned char *) CHAR(s);
#endif
	    if (!use_bytes &&
		known_to_be_utf8 &&
		tau_pcre_valid_utf8(c, l) >= 0)
		error_reset("not a valid UTF-8 string");
	    if (type > 1) {
		if (reverse_copy_utf8(c, l, n) >= 0)
		    error_reset("cannot copy string to buffer");
	    } else {
		if (type < 1)
		    n = l;
		h = 0;
		for (k = 0; k < l; k++) {
		    if (c[k] == '\0')
			continue;
		    if (k < __CBUF_SIZE - 1)
			cbuf[k] = c[k];
		    else 
			error_reset("cannot copy string to buffer");
		    if (use_bytes ||
			!known_to_be_utf8 ||
			(c[k] & 0xC0) != 0x80) {
			h++;
			if (h > n)
			    break;
		    }
		}
		cbuf[k] = 0;
	    }
#ifdef __DEBUG
	    Rprintf(" %s\n", cbuf);
#endif
	    h = nap + 1;
	    lpn = 0;
	    rpn = cpncount(rpn, cbuf);

	    if (nap != h)
		error_reset("cannot add string to tree");

	    if (!inc) {
		if (lpn)	    // should never be NULL
		    lpn->count++;
	    }
	}
	R_CheckUserInterrupt();
    }
#ifdef _TIME_H
    t1 = clock();
    if (LOGICAL(R_verbose)[0] == TRUE) {
	Rprintf(" %i string(s) using %i nodes [%.2fs]\n", nap, ncpn,
		((double) t1 - t0) / CLOCKS_PER_SEC);
	if (!persistent)
	    Rprintf("writing  ...");
#ifdef __DEBUG
	Rprintf("\n");
#endif
    }
#endif
    if (persistent)
	return R_NilValue;

    nap  = enc = 0;
    rval = PROTECT(allocVector(INTSXP, ncpn));
    setAttrib(rval, R_NamesSymbol, (nval = allocVector(STRSXP, ncpn)));

    cpnretprefix(rpn, 0);

    if (ncpn) {
	cpnfree(rpn);
	rpn = 0;
	error_reset("cannot retrieve count(s)");
    }
    rpn = 0;

    // reverse the reversed strings
    if (type == 2)
	for (i = 0; i < nap; i++) {
	    s = STRING_ELT(nval, i);
	    reverse_copy_utf8((const unsigned char *) CHAR(s), LENGTH(s), -1);
	    SET_STRING_ELT(nval, i, mkCharCE((const char *) cbuf, 
					     getCharCE(s)));
	}
#ifdef _TIME_H
    t2 = clock();
    if (LOGICAL(R_verbose)[0] == TRUE)
	Rprintf(" %i strings [%.2fs]\n", nap,
		((double) t2 - t1) / CLOCKS_PER_SEC); 
#endif
    // reduce
    if (nap < LENGTH(rval)) {
	r = PROTECT(allocVector(INTSXP, nap));
	setAttrib(r, R_NamesSymbol, (s = allocVector(STRSXP, nap)));
	while (nap-- > 0) {
	    INTEGER(r)[nap] = INTEGER(rval)[nap];
	    SET_STRING_ELT(s, nap, STRING_ELT(nval, nap));
	}
	UNPROTECT(2);

	return r;
    }
    UNPROTECT(1);

    return rval;
}
示例#3
0
SEXP R_utf8CountNgram(SEXP x, SEXP R_n, SEXP R_lower, SEXP R_verbose,
		      SEXP R_persistent, SEXP R_useBytes) {
    if (!persistent && rpn) {
	cpnfree(rpn);
	rpn = 0;
	warning("cleaning up stale state");
    }
    if (isNull(x) || TYPEOF(x) != VECSXP)
	error("'x' not of type list");
    if (isNull(R_n) || TYPEOF(R_n) != INTSXP)
	error("'n' not of type integer");
    if (isNull(R_lower) || TYPEOF(R_lower) != INTSXP)
	error("'lower' not of type integer");
    if (isNull(R_verbose) || TYPEOF(R_verbose) != LGLSXP)
	error("'verbose' not of type logical");
    if (isNull(R_persistent) || TYPEOF(R_persistent) != LGLSXP)
	error("'persistent' not of type logical");
    if (isNull(R_useBytes) || TYPEOF(R_useBytes) != LGLSXP)
	error("'useBytes' not of type logical");
    long l;
    int h, i, j, k, m, n;
    const unsigned char *c;
    SEXP r, s;

    if (!persistent) {
	known_to_be_utf8   = utf8locale();
	known_to_be_latin1 = latin1locale();
	use_bytes          = *LOGICAL(R_useBytes);
    } else
    if (use_bytes != *LOGICAL(R_useBytes))
	error("change of useBytes in persistent mode");
    else
    if (known_to_be_utf8   != utf8locale() ||
	known_to_be_latin1 != latin1locale())
	error_reset("change of locale in persistent mode");
    persistent = LOGICAL(R_persistent)[0];

    n = INTEGER(R_n)[0];
    if (n < 0) 
	error_reset("'n' invalid value");
    if (n == 0)
	return R_NilValue;

    if (!persistent) {
	tcnt = INTEGER(R_lower)[0];
	if (tcnt < 0)
	    error_reset("'lowr' invalid value");
    }

#ifdef _TIME_H
    clock_t t2, t1, t0 = clock();
    if (LOGICAL(R_verbose)[0] == TRUE) {
	Rprintf("counting ...");
#ifdef __DEBUG
	Rprintf("\n");
#endif
    }
#endif

    nap = 0;
    inc = 1;

    for (i = 0; i < LENGTH(x); i++) {
	r = VECTOR_ELT(x, i);
	if (TYPEOF(r) != STRSXP)
	    error_reset("not of type character");
	for (j = 0; j < LENGTH(r); j++) {
	    s = STRING_ELT(r, j);
	    l = LENGTH(s);
	    if (s == NA_STRING || !l)
		continue;
#ifdef __TRANSLATE
	    if (!use_bytes) {
		c = (const unsigned char *) translateChar(s);
		l = strlen((const char *) c);
	    } else
		c = (const unsigned char *) CHAR(s);
#else
	    c = (const unsigned char *) CHAR(s);
#endif
	    // strings of unknown encoding are not translated
	    // or strings marked as UTF-8 could be invalid, so
	    // we have to check.
	    if (!use_bytes &&
		known_to_be_utf8 &&
		tau_pcre_valid_utf8(c, l) >= 0)
		error_reset("not a valid UTF-8 string");
	    /* in an UTF-8 multibyte sequence any byte
	     * except the first has 10 as its leading bits.
	     * thus, 1) the byte cannot be the start of a
	     * suffix and 2) we have to expand the current
	     * window.
	     *
	     * '\1' is a special boundary marker that triggers
	     * reduced counting, ie omission of windows which
	     * start at a boundary and shrinkage of windows
	     * which end at a boundary.
	     */
	    int b;
	    for (k = 0; k < l; k++) {
		if (c[k] == '\0')
		    continue;
		if (!use_bytes &&
		    known_to_be_utf8 &&
		    (c[k] & 0xC0) == 0x80)
		    continue;
		if (k == 1 && c[0] == '\1')
		    continue;
		h = 0;
		m = k;
		b = k;
		while (m < l) {
		    if (m-k < __CBUF_SIZE)
			cbuf[m-k] = c[m];
		    else 
			error_reset("cannot copy string to buffer");
		    if (use_bytes ||
			!known_to_be_utf8 ||
			(c[m] & 0xC0) != 0x80) {
			h++;
			if (h > n) {
			    h--;
			    if (c[m] == '\1') {
				h--;
				m = b;
			    }
			    break;
			}
			b = m;
		    }
		    m++;
		}
		cbuf[m-k] = 0;
#ifdef __DEBUG
		Rprintf(" %i %i %i %s\n", k+1, m, h, cbuf);
#endif
		h = nap + 1;
		rpn = cpncount(rpn, cbuf);

		if (nap != h)
		    error_reset("cannot add string to tree");
	    }
	}
	R_CheckUserInterrupt();
    }
#ifdef _TIME_H
    t1 = clock();
    if (LOGICAL(R_verbose)[0] == TRUE) {
	Rprintf(" %i string(s) using %i nodes [%.2fs]\n", nap, ncpn,
		((double) t1 - t0) / CLOCKS_PER_SEC);
	if (!persistent)
	    Rprintf("writing  ...");
#ifdef __DEBUG
	Rprintf("\n");
#endif
    }
#endif
    if (persistent)
	return R_NilValue;

    nap  = enc = 0;
    rval = PROTECT(allocVector(INTSXP, ncpn));
    setAttrib(rval, R_NamesSymbol, (nval = allocVector(STRSXP, ncpn)));

    cpnretprefix(rpn, 0);

    if (ncpn) {
	cpnfree(rpn);
	rpn = 0;
	error("cannot retrieve count(s)");
    }
    rpn = 0;
#ifdef _TIME_H
    t2 = clock();
    if (LOGICAL(R_verbose)[0] == TRUE)
	Rprintf(" %i strings [%.2fs]\n", nap,
		((double) t2 - t1) / CLOCKS_PER_SEC);
#endif
    // reduce
    if (nap < LENGTH(rval)) {
	r = PROTECT(allocVector(INTSXP, nap));
	setAttrib(r, R_NamesSymbol, (s = allocVector(STRSXP, nap)));
	while (nap-- > 0) {
	    INTEGER(r)[nap] = INTEGER(rval)[nap];
	    SET_STRING_ELT(s, nap, STRING_ELT(nval, nap));
	}
	UNPROTECT(2);

	return r;
    }
    UNPROTECT(1);

    return rval;
}