Ejemplos de utf8_to_int en C++ (Cpp)

Ejemplo n.º 1

0

Mostrar archivo

Archivo: damerau_levenshtein_split.c Proyecto: mrschuster/dlsplit

/*
 * .Call entry point: element-wise distance between the strings of `a` and `b`.
 *
 * a, b     character vectors of equal length; element i of `a` is compared
 *          with element i of `b`.
 * penalty  numeric; converted with asReal() and handed unchanged to
 *          damerau_levenshtein_split().
 *
 * Returns a REALSXP of length(a) holding the value computed by
 * damerau_levenshtein_split() for every pair.
 *
 * Raises an R error when the lengths differ, when a work buffer cannot be
 * allocated, or when utf8_to_int() reports a negative length (invalid UTF-8).
 */
SEXP R_damerau_levenshtein_split(SEXP a, SEXP b, SEXP penalty){
  if ( length(a) != length(b) ){
    error("Vectors must be of equal length.\n");
  }

  int n = length(a);

  // output vector
  SEXP yy;
  PROTECT(yy = allocVector(REALSXP, n));
  double *y = REAL(yy);

  // the penalty does not change between pairs, so convert it only once
  double pen = asReal(penalty);

  for (int i=0; i<n; i++) {
    const char *str1 = CHAR(STRING_ELT(a,i));
    const char *str2 = CHAR(STRING_ELT(b,i));

    // one slot per input byte plus one spare is always enough for the
    // decoded code points
    unsigned int *int1 = malloc((1 + strlen(str1)) * sizeof *int1);
    unsigned int *int2 = malloc((1 + strlen(str2)) * sizeof *int2);
    if ( int1 == NULL || int2 == NULL ){
      // error() does not return, so release whatever was obtained first
      free(int1);
      free(int2);
      error("Unable to allocate enough memory");
    }

    int len1 = utf8_to_int( str1, int1 );
    int len2 = utf8_to_int( str2, int2 );
    if ( len1 < 0 || len2 < 0 ){
      // same reasoning: free before the non-local exit to avoid a leak
      free(int1);
      free(int2);
      error("Encountered byte sequence not representing an utf-8 character.\n");
    }

    y[i] = damerau_levenshtein_split(int1, len1, int2, len2, pen);
    free(int1);
    free(int2);
  }

  UNPROTECT(1);
  return(yy);
}

Ejemplo n.º 2

0

Mostrar archivo

Archivo: utf8ToInt.c Proyecto: cran/stringdist

/*
 * Copy element `i` of `x` into the caller-supplied buffer `c` as unsigned ints.
 *
 * x        list of integer vectors (when `intdist` is nonzero) or a
 *          character vector (otherwise).
 * bytes    when nonzero, strings are copied byte by byte; otherwise they are
 *          decoded with utf8_to_int().
 * len      out: number of values stored in `c`.
 * isna     out: nonzero when the element is NA (first integer is NA_INTEGER,
 *          or the string is NA_STRING).
 * c        out: destination buffer; the integer and byte paths append a
 *          trailing zero since some distances (e.g. the dl-distance) expect it.
 *
 * Returns `c`. Raises an R error when utf8_to_int() reports a negative length.
 */
unsigned int *get_elem(SEXP x, int i, int bytes, int intdist, int *len, int *isna, unsigned int *c){

  if ( intdist ){
    // fetch the element once instead of going through VECTOR_ELT repeatedly
    SEXP item = VECTOR_ELT(x, i);
    const int *values = INTEGER(item);

    *isna = ( values[0] == NA_INTEGER );
    *len = length(item);
    // int is reinterpreted as unsigned int here; the bit pattern is unchanged
    memcpy(c, values, (*len) * sizeof(int));
    c[*len] = 0;
    return c;
  }

  SEXP item = STRING_ELT(x, i);
  *isna = ( item == NA_STRING );

  if ( !bytes ){
    *len = utf8_to_int(CHAR(item), c);
    if ( *len < 0 ){
      error("Encountered byte sequence not representing an utf-8 character.\n");
    }
    return c;
  }

  // byte-wise copy: every char becomes one unsigned int
  const char *raw = CHAR(item);
  *len = length(item);
  for (int k = 0; k < *len; k++){
    c[k] = raw[k];
  }
  c[*len] = 0;

  return c;
}

Ejemplo n.º 3

0

Mostrar archivo

Archivo: gpx.c Proyecto: idaohang/viking

/*
 * Return a newly g_malloc()ed copy of `str` in which every occurrence of a
 * `text` from the `stdentities` table is replaced by its `entity`, and every
 * byte sequence starting with a high-bit byte is replaced by a numeric
 * character reference of the form "&#x<hex>;".
 *
 * The work is done in two phases: first the net growth of the string is
 * computed so one buffer of the final size can be allocated, then the
 * replacements are applied in place inside that buffer.
 *
 * The returned buffer comes from g_malloc(); presumably the caller releases it
 * with g_free() -- confirm against callers.
 */
static
char *
entitize(const char * str)
{
        int elen, ecount, nsecount;
        entity_types *ep;
        const char * cp;
        char * p, * tmp, * xstr;

        char tmpsub[20];
        int bytes = 0;
        int value = 0;
        ep = stdentities;
        elen = ecount = nsecount = 0;

        /* figure # of entity replacements and additional size.
         * The table is walked until an entry whose text is NULL. */
        while (ep->text) {
                cp = str;
                while ((cp = strstr(cp, ep->text)) != NULL) {
                        elen += strlen(ep->entity) - strlen(ep->text);
                        ecount++;
                        cp += strlen(ep->text);
                }
                ep++;
        }

        /* figure the same for other than standard entities (i.e. anything
         * that isn't in the range U+0000 to U+007F) */
        for ( cp = str; *cp; cp++ ) {
                if ( *cp & 0x80 ) {

                        /* utf8_to_int() appears to report the sequence length in
                         * `bytes` and the decoded code point in `value` -- TODO confirm.
                         * NOTE(review): if it ever leaves `bytes` at 0, the next line
                         * steps cp backwards and this loop would not advance. */
                        utf8_to_int( cp, &bytes, &value );
                        cp += bytes-1;
                        /* sprintf is used only to measure the replacement length */
                        elen += sprintf( tmpsub, "&#x%x;", value ) - bytes;
                        nsecount++;
                }
        }

        /* enough space for the whole string plus entity replacements, if any */
        tmp = g_malloc((strlen(str) + elen + 1));
        strcpy(tmp, str);

        /* no entity replacements */
        if (ecount == 0 && nsecount == 0)
                return (tmp);

        if ( ecount != 0 ) {
                /* NOTE(review): the size above was computed on the original string,
                 * while replacements here run table entry by table entry on the
                 * already modified buffer. If a later entry's text occurs inside an
                 * earlier entry's replacement, it would be replaced again and the
                 * buffer could be too small -- depends on the order of stdentities,
                 * which is not visible here. */
                for (ep = stdentities; ep->text; ep++) {
                        p = tmp;
                        while ((p = strstr(p, ep->text)) != NULL) {
                                /* elen is reused here as the length of the replacement */
                                elen = strlen(ep->entity);

                                /* save the tail first: the replacement may be longer than
                                 * the text it overwrites */
                                xstr = g_strdup(p + strlen(ep->text));

                                strcpy(p, ep->entity);
                                strcpy(p + elen, xstr);

                                g_free(xstr);

                                /* continue searching after the inserted entity */
                                p += elen;
                        }
                }
        }

        if ( nsecount != 0 ) {
                p = tmp;
                while (*p) {
                        if ( *p & 0x80 ) {
                                utf8_to_int( p, &bytes, &value );
                                /* keep a copy of whatever follows the sequence, if anything */
                                if ( p[bytes] ) {
                                        xstr = g_strdup( p + bytes );
                                }
                                else {
                                        xstr = NULL;
                                }
                                /* overwrite the sequence with its numeric reference; this
                                 * also NUL-terminates the buffer right after the reference */
                                sprintf( p, "&#x%x;", value );
                                p = p+strlen(p);
                                /* re-attach the saved tail after the reference */
                                if ( xstr ) {
                                        strcpy( p, xstr );
                                        g_free(xstr);
                                }
                        }
                        else {
                                p++;
                        }
                }
        }
        return (tmp);
}

Ejemplo n.º 4

0

Mostrar archivo

Archivo: hex.c Proyecto: NoSeungHwan/mc_kor_dev

/*
 * Draw the hex view of `view`: for each screen row an offset column, the bytes
 * of that row as two hex digits each (grouped by four), and a text column with
 * one character cell per byte.
 *
 * Starts at view->dpy_start and stops when the data area is full or no more
 * bytes can be read. Pending edits from view->change_list are shown instead of
 * the underlying bytes. Side effects visible here: view->cursor_row and
 * view->cursor_col are set when the cursor offset is drawn, view->dpy_end is
 * set to the first offset not drawn, and mcview_place_cursor() is called.
 */
void
mcview_display_hex (mcview_t * view)
{
    const screen_dimen top = view->data_area.top;
    const screen_dimen left = view->data_area.left;
    const screen_dimen height = view->data_area.height;
    const screen_dimen width = view->data_area.width;
    const int ngroups = view->bytes_per_line / 4;
    const screen_dimen text_start = 8 + 13 * ngroups + ((width < 80) ? 0 : (ngroups - 1 + 1));
    /* 8 characters are used for the file offset, and every hex group
     * takes 13 characters. On "big" screens, the groups are separated
     * by an extra vertical line, and there is an extra space before the
     * text column.
     */

    screen_dimen row;
    off_t from;
    int c;
    mark_t boldflag = MARK_NORMAL;
    struct hexedit_change_node *curr = view->change_list;
#ifdef HAVE_CHARSET
    int ch = 0;
#endif /* HAVE_CHARSET */

    char hex_buff[10];          /* A temporary buffer for sprintf and mvwaddstr */
    int bytes;                  /* Number of bytes already printed on the line */

    mcview_display_clean (view);

    /* Find the first displayable changed byte */
    from = view->dpy_start;
    while (curr && (curr->offset < from))
    {
        curr = curr->next;
    }

    /* One iteration per screen row, as long as there is a byte to read at `from` */
    for (row = 0; mcview_get_byte (view, from, NULL) == TRUE && row < height; row++)
    {
        screen_dimen col = 0;
        size_t i;

        /* NOTE(review): redundant, col is already initialized to 0 just above */
        col = 0;

        /* Print the hex offset */
        /* The output is bounded by sizeof (hex_buff): 8 hex digits, a space and
         * the terminator fit exactly; longer offsets would be cut off. */
        g_snprintf (hex_buff, sizeof (hex_buff), "%08" PRIXMAX " ", (uintmax_t) from);
        widget_move (view, top + row, left);
        tty_setcolor (VIEW_BOLD_COLOR);
        for (i = 0; col < width && hex_buff[i] != '\0'; i++)
        {
            tty_print_char (hex_buff[i]);
            /*              tty_print_char(hex_buff[i]); */
            col += 1;
        }
        tty_setcolor (VIEW_NORMAL_COLOR);

        /* One iteration per byte of the row; `from` advances together with `bytes` */
        for (bytes = 0; bytes < view->bytes_per_line; bytes++, from++)
        {

#ifdef HAVE_CHARSET
            if (view->utf8)
            {
                int cw = 1;
                gboolean read_res = TRUE;

                /* Read the character starting at `from`; cw receives its width in bytes */
                ch = mcview_get_utf (view, from, &cw, &read_res);
                if (!read_res)
                    break;
                /* char width is greater 0 bytes */
                if (cw != 0)
                {
                    int cnt;
                    char corr_buf[UTF8_CHAR_LEN + 1];
                    /* Remember the change-list position; it is restored below so the
                     * per-byte code further down still sees the same nodes. */
                    struct hexedit_change_node *corr = curr;
                    int res;

                    /* Re-encode the character so edited bytes can be patched in */
                    res = g_unichar_to_utf8 (ch, (char *) corr_buf);

                    /* NOTE(review): the loop runs to cw, the buffer holds
                     * UTF8_CHAR_LEN + 1 bytes; presumably cw never exceeds that -- confirm. */
                    for (cnt = 0; cnt < cw; cnt++)
                    {
                        if (curr != NULL && from + cnt == curr->offset)
                        {
                            /* replace only changed bytes in array of multibyte char */
                            corr_buf[cnt] = curr->value;
                            curr = curr->next;
                        }
                    }
                    corr_buf[res] = '\0';
                    /* Determine the state of the current multibyte char */
                    ch = utf8_to_int ((char *) corr_buf, &cw, &read_res);
                    curr = corr;
                }
            }
#endif /* HAVE_CHARSET */
            /* Fetch the raw byte; stop the row when the data ends */
            if (!mcview_get_byte (view, from, &c))
                break;

            /* Save the cursor position for mcview_place_cursor() */
            if (from == view->hex_cursor && !view->hexview_in_text)
            {
                view->cursor_row = row;
                view->cursor_col = col;
            }

            /* Determine the state of the current byte */
            boldflag = mcview_hex_calculate_boldflag (view, from, curr);

            /* Determine the value of the current byte */
            if (curr != NULL && from == curr->offset)
            {
                /* A pending edit exists for this offset: show it and move on to the next node */
                c = curr->value;
                curr = curr->next;
            }

            /* Select the color for the hex number */
            tty_setcolor (boldflag == MARK_NORMAL ? VIEW_NORMAL_COLOR :
                          boldflag == MARK_SELECTED ? VIEW_BOLD_COLOR :
                          boldflag == MARK_CHANGED ? VIEW_UNDERLINED_COLOR :
                          /* boldflag == MARK_CURSOR */
                          view->hexview_in_text ? VIEW_SELECTED_COLOR : VIEW_UNDERLINED_COLOR);

            /* Print the hex number */
            /* Each digit is printed only while there is room left on the row */
            widget_move (view, top + row, left + col);
            if (col < width)
            {
                tty_print_char (hex_char[c / 16]);
                col += 1;
            }
            if (col < width)
            {
                tty_print_char (hex_char[c % 16]);
                col += 1;
            }

            /* Print the separator */
            /* No separator after the last byte of the row */
            tty_setcolor (VIEW_NORMAL_COLOR);
            if (bytes != view->bytes_per_line - 1)
            {
                if (col < width)
                {
                    tty_print_char (' ');
                    col += 1;
                }

                /* After every four bytes, print a group separator */
                if (bytes % 4 == 3)
                {
                    if (view->data_area.width >= 80 && col < width)
                    {
                        tty_print_one_vline (TRUE);
                        col += 1;
                    }
                    if (col < width)
                    {
                        tty_print_char (' ');
                        col += 1;
                    }
                }
            }

            /* Select the color for the character; this differs from the
             * hex color when boldflag == MARK_CURSOR */
            tty_setcolor (boldflag == MARK_NORMAL ? VIEW_NORMAL_COLOR :
                          boldflag == MARK_SELECTED ? VIEW_BOLD_COLOR :
                          boldflag == MARK_CHANGED ? VIEW_UNDERLINED_COLOR :
                          /* boldflag == MARK_CURSOR */
                          view->hexview_in_text ? VIEW_SELECTED_COLOR : MARKED_SELECTED_COLOR);


            /* Prepare the value for the text column. With HAVE_CHARSET the braces
             * below form the final `else` of the chain; without it they are a plain
             * block that always runs. */
#ifdef HAVE_CHARSET
            if (mc_global.utf8_display)
            {
                if (!view->utf8)
                {
                    c = convert_from_8bit_to_utf_c ((unsigned char) c, view->converter);
                }
                if (!g_unichar_isprint (c))
                    c = '.';
            }
            else if (view->utf8)
                ch = convert_from_utf_to_current_c (ch, view->converter);
            else
#endif
            {
#ifdef HAVE_CHARSET
                c = convert_to_display_c (c);
#endif

                if (!is_printable (c))
                    c = '.';
            }

            /* Print corresponding character on the text side */
            /* The text column uses one cell per byte: cell index is text_start + bytes */
            if (text_start + bytes < width)
            {
                widget_move (view, top + row, left + text_start + bytes);
#ifdef HAVE_CHARSET
                if (view->utf8)
                    tty_print_anychar (ch);
                else
#endif
                    tty_print_char (c);
            }

            /* Save the cursor position for mcview_place_cursor() */
            if (from == view->hex_cursor && view->hexview_in_text)
            {
                view->cursor_row = row;
                view->cursor_col = text_start + bytes;
            }
        }
    }

    /* Be polite to the other functions */
    tty_setcolor (VIEW_NORMAL_COLOR);

    mcview_place_cursor (view);
    /* `from` is now the first offset that was not drawn */
    view->dpy_end = from;
}

Ejemplo n.º 5

0

Mostrar archivo

Archivo: utf8ToInt.c Proyecto: cran/stringdist

/*
 * Build a Stringset holding an unsigned-int representation of every element
 * of `str`.
 *
 * str      list of integer vectors (when `intdist` is nonzero) or a
 *          character vector (otherwise).
 * bytes    for character input: nonzero converts byte by byte with
 *          char_to_int(); zero decodes with utf8_to_int().
 * intdist  nonzero when `str` is a list of integer vectors.
 *
 * All strings live back to back in one buffer (s->data), each followed by a
 * terminating zero (needed e.g. by the full dl-distance). s->string[i] points
 * at string i and s->str_len[i] holds its length. For NA elements
 * s->str_len[i] is NA_INTEGER and s->string[i] is NULL.
 *
 * Returns the new Stringset, allocated with malloc(). Raises an R error, after
 * releasing everything allocated so far, when memory cannot be obtained or
 * when utf8_to_int() reports a negative length (invalid UTF-8).
 */
Stringset *new_stringset(SEXP str, int bytes, int intdist){
  size_t nstr = length(str);

  // Total number of input elements. This is enough room for a byte-by-byte
  // translation, so for UTF-8 it will be more than needed.
  size_t nbytes = 0;
  if ( intdist ){
    for (size_t i=0; i<nstr; i++){
      nbytes += length(VECTOR_ELT(str,i));
    }
  } else {
    for (size_t i=0; i<nstr; i++){
      nbytes += length(STRING_ELT(str,i));
    }
  }

  Stringset *s = malloc(sizeof *s);
  if ( s == NULL ){
    error("Unable to allocate enough memory");
  }

  s->str_len = malloc(nstr * sizeof *s->str_len);
  s->string = malloc(nstr * sizeof *s->string);
  // room for int rep of strings, including one trailing zero per string
  s->data = malloc((nstr + nbytes) * sizeof *s->data);

  // malloc(0) may legitimately return NULL, so only treat NULL as a failure
  // when something was actually requested
  if ( nstr > 0 && (s->str_len == NULL || s->string == NULL || s->data == NULL) ){
    free(s->data);
    free(s->string);
    free(s->str_len);
    free(s);
    error("Unable to allocate enough memory");
  }

  int *t = s->str_len;
  unsigned int *d = s->data;

  if ( intdist ){
    for (size_t i=0; i < nstr; i++, t++){
      SEXP elem = VECTOR_ELT(str,i);
      int n = length(elem);
      // only look at the first value when there is one
      if ( n > 0 && INTEGER(elem)[0] == NA_INTEGER ){
        (*t) = NA_INTEGER;
        s->string[i] = NULL;
      } else {
        (*t) = n;
        memcpy(d, INTEGER(elem), n * sizeof(int));
        s->string[i] = d;
        d[n] = 0; // append a zero.
        d += n + 1;
      }
    }
  } else {
    for (size_t i=0; i < nstr; i++, t++){
      SEXP elem = STRING_ELT(str,i);
      if ( elem == NA_STRING ){
        (*t) = NA_INTEGER;
        s->string[i] = NULL;
        continue;
      }
      int n = bytes ? char_to_int(CHAR(elem), d) : utf8_to_int(CHAR(elem), d);
      if ( n < 0 ){
        // A negative length must not be used as an offset into the buffer.
        // error() does not return, so clean up first.
        free(s->data);
        free(s->string);
        free(s->str_len);
        free(s);
        error("Encountered byte sequence not representing an utf-8 character.\n");
      }
      (*t) = n;
      s->string[i] = d;
      d[n] = 0; // append a zero.
      d += n + 1;
    }
  }

  return s;
}