/**
 * Reads one character from current->fd.
 *
 * With USE_UTF8 defined, the lead byte is read first and handed to
 * utf8_charlen() to learn how many bytes make up the sequence; the
 * remaining bytes are then read one at a time and the NUL-terminated
 * sequence is decoded with utf8_tounicode().
 * Sequences reported as shorter than 1 or longer than 3 bytes are rejected,
 * which keeps the sequence plus its terminator inside the 4-byte buffer.
 *
 * Without USE_UTF8 the work is delegated to fd_read_char().
 *
 * Returns the decoded value, or -1 if any read() does not deliver exactly
 * one byte or the reported length is out of range.
 */
static int fd_read(struct current *current)
{
#ifdef USE_UTF8
    char bytes[4];
    int total;
    int pos;
    int codepoint;

    /* The lead byte tells us how long the whole sequence is */
    if (read(current->fd, &bytes[0], 1) != 1) {
        return -1;
    }
    total = utf8_charlen(bytes[0]);
    if (!(total >= 1 && total <= 3)) {
        return -1;
    }

    /* Pull in the remaining bytes of the sequence, one at a time */
    pos = 1;
    while (pos < total) {
        if (read(current->fd, &bytes[pos], 1) != 1) {
            return -1;
        }
        pos++;
    }
    bytes[total] = 0;

    /* decode and return the character */
    utf8_tounicode(bytes, &codepoint);
    return codepoint;
#else
    return fd_read_char(current->fd, -1);
#endif
}
// Unary operator handler for the string object passed as self_in.
//   MP_UNARY_OP_BOOL: true when the string's byte length is non-zero.
//   MP_UNARY_OP_LEN:  the value of utf8_charlen(str_data, str_len) as a
//                     small int (presumably the character count - confirm
//                     against utf8_charlen's definition).
// Any other operator yields MP_OBJ_NULL.
STATIC mp_obj_t uni_unary_op(mp_unary_op_t op, mp_obj_t self_in) {
    GET_STR_DATA_LEN(self_in, str_data, str_len);

    if (op == MP_UNARY_OP_BOOL) {
        return mp_obj_new_bool(str_len != 0);
    }

    if (op == MP_UNARY_OP_LEN) {
        return MP_OBJ_NEW_SMALL_INT(utf8_charlen(str_data, str_len));
    }

    return MP_OBJ_NULL; // op not supported
}
/**
 * Decides how many bytes of 'buffer' belong on the next output line when
 * wrapping at 'cols' characters.
 *
 *  - If a control character (byte value below ' ') is met within the first
 *    'cols' characters, the line ends there and the whole run of control
 *    characters is included in the returned length.
 *  - If the string ends within 'cols' characters, its full length is
 *    returned.
 *  - Otherwise the line is cut at the last space at or before the limit.
 *    If that space is the very first byte, 0 is returned.
 *  - If there is no such space, the whole first word is returned even
 *    though it is longer than 'cols'.
 *
 * Bytes are compared as unsigned char so that bytes >= 0x80 (e.g. UTF-8
 * lead and continuation bytes) are never mistaken for control characters
 * or whitespace on platforms where plain char is signed.
 *
 * @param buffer  NUL-terminated text to measure; must not be NULL.
 * @param cols    maximum number of characters wanted on the line.
 * @return        number of bytes, counted from 'buffer', to emit.
 */
static size_t linelen(const char *buffer, size_t cols)
{
    const char *p = buffer;
    size_t len = 0;

    /*
     * Step 1: go up "cols" characters, stopping early at a control char
     */
    while (*p != '\0' && len < cols && (unsigned char) *p >= ' ') {
#ifdef USE_UTF8
        /*
         * utf8_charlen() is defined elsewhere; guard against a non-positive
         * result and against a length that would run past the terminator,
         * so the scan always moves forward and stays inside the string.
         */
        int n = utf8_charlen((int) *p);

        if (n < 1) {
            n = 1;
        }
        while (n-- > 0 && *p != '\0') {
            p++;
        }
#else
        p++;
#endif
        len++;
    }

    /*
     * Ended at a CR/LF. Skip it, and return all of it.
     */
    if (*p != '\0' && (unsigned char) *p < ' ') {
        while (*p != '\0' && (unsigned char) *p < ' ') {
            p++;
        }
        return (size_t) (p - buffer);
    }

    if (*p == '\0') {
        return (size_t) (p - buffer);   /* short */
    }

    /*
     * Step 2: back up to the previous space.
     * Never move below 'buffer': forming or dereferencing buffer - 1 is
     * undefined behaviour.
     */
    while (p > buffer && (unsigned char) *p > ' ') {
        p--;
    }
    if (p > buffer) {
        return (size_t) (p - buffer);
    }

    /*
     * No previous space, print the entire word.
     * p is exactly 'buffer' here, so the scan covers the word from its
     * first byte.
     */
    while (*p != '\0' && (unsigned char) *p > ' ') {
        p++;
    }
    return (size_t) (p - buffer);
}
/**
 * (Experimental) Implementation of mbrtowc for Windows.
 * This is required because the other, commonly available implementations
 * seem to not work very well, based on user reports. Someone who is
 * really, really good at windows programming needs to review this stuff!
 *
 * Converts the UTF-8 sequence at 's' with MultiByteToWideChar(CP_UTF8).
 *
 * @param pwc  where to store the converted wide character; may be NULL, in
 *             which case nothing is stored.
 * @param s    UTF-8 input; if NULL the function returns 0.
 * @param n    only tested against zero; the sequence length is taken from
 *             utf8_charlen(s), not from n.
 * @param ps   unused.
 * @return     0 if s is NULL, *s is NUL, or utf8_charlen() reports 0;
 *             (size_t)-2 if n is 0;
 *             utf8_charlen()'s negative result converted to size_t if it
 *             reports an error;
 *             (size_t)-1 if the Windows conversion fails;
 *             otherwise the number of bytes consumed.
 *
 * The conversion goes through a local buffer because one UTF-8 sequence
 * can produce more than one UTF-16 unit (a surrogate pair); writing the
 * result directly through 'pwc' would overrun a caller that passes the
 * address of a single wchar_t. Only the first unit is stored in *pwc.
 */
size_t lg_mbrtowc(wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
	wchar_t wbuf[8];
	int nb, nb2;

	(void) ps;

	if (NULL == s) return 0;
	if (0 == n) return (size_t)-2;
	if (0 == *s)
	{
		if (NULL != pwc) *pwc = 0;
		return 0;
	}

	nb = utf8_charlen(s);
	if (0 == nb) return 0;
	if (0 > nb) return (size_t)nb;

	/* Returns the number of wide units written, or 0 on failure
	 * (including a result that does not fit in wbuf). */
	nb2 = MultiByteToWideChar(CP_UTF8, 0, s, nb, wbuf,
	                          (int)(sizeof wbuf / sizeof wbuf[0]));
	if (0 >= nb2) return (size_t)-1;

	if (NULL != pwc) *pwc = wbuf[0];
	return (size_t)nb;
}
/*
 - regexec - match a regexp against a string
 *
 * preg    - regex to run; its program must begin with REG_MAGIC.
 * string  - subject text to search.
 * nmatch, pmatch - stored in preg (presumably for regtry() to fill in the
 *           match offsets - confirm against regtry()).
 * eflags  - execution flags; REG_NOTBOL is examined here.
 *
 * Returns REG_NOERROR when regtry() reports a match, REG_NOMATCH when no
 * position matches, REG_ERR_NULL_ARGUMENT if preg, preg->program or string
 * is NULL, and REG_ERR_CORRUPTED if the program does not start with
 * REG_MAGIC.
 */
int regexec(regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags)
{
	const char *s;
	int scan;

	/* Be paranoid... */
	if (preg == NULL || preg->program == NULL || string == NULL) {
		return REG_ERR_NULL_ARGUMENT;
	}

	/* Check validity of program. */
	if (*preg->program != REG_MAGIC) {
		return REG_ERR_CORRUPTED;
	}

#ifdef DEBUG
	fprintf(stderr, "regexec: %s\n", string);
	regdump(preg);
#endif

	/* Record the per-call state in preg */
	preg->eflags = eflags;
	preg->pmatch = pmatch;
	preg->nmatch = nmatch;
	preg->start = string;	/* All offsets are computed from here */

	/* Must clear out the embedded repeat counts */
	/* Walks the node chain from OPERAND(1) and zeroes program[scan + 4] of every repeat node */
	for (scan = OPERAND(1); scan != 0; scan = regnext(preg, scan)) {
		switch (OP(preg, scan)) {
		case REP:
		case REPMIN:
		case REPX:
		case REPXMIN:
			preg->program[scan + 4] = 0;
			break;
		}
	}

	/* If there is a "must appear" string, look for it. */
	if (preg->regmust != 0) {
		s = string;
		/* Find each occurrence of the first required char, then compare the whole required string there */
		while ((s = str_find(s, preg->program[preg->regmust], preg->cflags & REG_ICASE)) != NULL) {
			if (prefix_cmp(preg->program + preg->regmust, preg->regmlen, s, preg->cflags & REG_ICASE) >= 0) {
				break;
			}
			s++;
		}
		if (s == NULL)	/* Not present. */
			return REG_NOMATCH;
	}

	/* Mark beginning of line for ^ . */
	preg->regbol = string;

	/* Simplest case: anchored match need be tried only once (maybe per line). */
	if (preg->reganch) {
		if (eflags & REG_NOTBOL) {
			/* This is an anchored search, but not at BOL, so possibly skip to the next line */
			/* Jumps into the loop body below, bypassing the first regtry() at 'string' */
			goto nextline;
		}
		while (1) {
			int ret = regtry(preg, string);
			if (ret) {
				return REG_NOERROR;
			}
			if (*string) {
nextline:
				if (preg->cflags & REG_NEWLINE) {
					/* Try the next anchor? */
					/* Resume just after the next '\n', which becomes the new beginning of line */
					string = strchr(string, '\n');
					if (string) {
						preg->regbol = ++string;
						continue;
					}
				}
			}
			/* Reached when the subject is exhausted, REG_NEWLINE is off, or no further '\n' exists */
			return REG_NOMATCH;
		}
	}

	/* Messy cases: unanchored match. */
	s = string;
	if (preg->regstart != '\0') {
		/* We know what char it must start with. */
		/* Only positions where str_find() locates that char are tried */
		while ((s = str_find(s, preg->regstart, preg->cflags & REG_ICASE)) != NULL) {
			if (regtry(preg, s))
				return REG_NOERROR;
			s++;
		}
	}
	else
		/* We don't -- general case. */
		/* Try every character position, including the terminating NUL */
		while (1) {
			if (regtry(preg, s))
				return REG_NOERROR;
			if (*s == '\0') {
				break;
			}
			/* NOTE(review): this advance trusts utf8_charlen(*s) to be >= 1 and not to step
			 * past the terminator - confirm its result for invalid or truncated sequences */
			s += utf8_charlen(*s);
		}

	/* Failure. */
	return REG_NOMATCH;
}