/* .Call entry point: element-wise "split" Damerau-Levenshtein distance.
 *
 * a, b     character vectors of equal length; element i of a is compared
 *          with element i of b after decoding both with utf8_to_int().
 * penalty  converted with asReal() and handed to damerau_levenshtein_split().
 *
 * Returns a REALSXP of length(a) holding one distance per pair.
 *
 * Raises an R error when the lengths differ, when a work buffer cannot be
 * allocated, or when utf8_to_int() reports a negative length. error() does
 * not return, so the malloc'ed work buffers are released before every call
 * to it that can happen while they are live.
 */
SEXP R_damerau_levenshtein_split(SEXP a, SEXP b, SEXP penalty){
  if ( length(a) != length(b) ){
    error("Vectors must be of equal length.\n");
  }

  int n = length(a);

  // output vector
  SEXP yy;
  PROTECT(yy = allocVector(REALSXP, n));
  double *y = REAL(yy);

  for (int i=0; i<n; i++) {
    const char *str1 = CHAR(STRING_ELT(a,i));
    const char *str2 = CHAR(STRING_ELT(b,i));

    // One slot per input byte plus one extra slot: the decoded sequence
    // cannot be longer than the byte string it was decoded from.
    unsigned int *int1 = (unsigned int *) malloc(( 1L + strlen(str1)) * sizeof(int));
    unsigned int *int2 = (unsigned int *) malloc(( 1L + strlen(str2)) * sizeof(int));
    if ( int1 == NULL || int2 == NULL ){
      free(int1);
      free(int2);
      error("Unable to allocate enough memory");
    }

    int len1 = utf8_to_int( str1, int1 );
    int len2 = utf8_to_int( str2, int2 );
    if ( len1 < 0 || len2 < 0 ){
      // release the buffers first: error() jumps out of this function
      free(int1);
      free(int2);
      error("Encountered byte sequence not representing an utf-8 character.\n");
    }

    y[i] = damerau_levenshtein_split(int1, len1, int2, len2, asReal(penalty));

    free(int1);
    free(int2);
  }

  UNPROTECT(1);
  return(yy);
}
// Get one element from x (VECSXP or STRSXP) convert to usigned int if necessary and store in c // TODO: this can probably be a bit optimized by decreasing the use of the *_ELT macros. unsigned int *get_elem(SEXP x, int i, int bytes, int intdist, int *len, int *isna, unsigned int *c){ if ( intdist ){ // we need a copy with trailing zero in this case since some distances // (e.g the dl-distance) expects this *isna = ( INTEGER(VECTOR_ELT(x,i))[0] == NA_INTEGER ); (*len) = length(VECTOR_ELT(x,i)); // this implicitly converts from int to unsigned int (but that should not influence the result). memcpy(c , INTEGER(VECTOR_ELT(x,i)), (*len) * sizeof(int)); c[*len] = 0; } else { *isna = ( STRING_ELT(x,i) == NA_STRING ); if (bytes){ (*len) = length(STRING_ELT(x,i)); for (int j=0; j < *len; j++ ){ c[j] = CHAR(STRING_ELT(x,i))[j]; } c[*len] = 0; } else { (*len) = utf8_to_int( CHAR(STRING_ELT(x,i)), c); if ( *len < 0 ){ error("Encountered byte sequence not representing an utf-8 character.\n"); } } } return c; }
/*
 * Return a newly g_malloc()'ed copy of str in which
 *   - every occurrence of a stdentities[].text is replaced by the matching
 *     stdentities[].entity, and
 *   - every byte sequence starting with a byte that has the high bit set is
 *     replaced by a hexadecimal numeric character reference ("&#x...;").
 *
 * utf8_to_int() is presumably reporting the sequence length in `seq_len` and
 * the decoded code point in `codepoint` -- confirm against its definition.
 *
 * The caller owns the returned string.
 */
static char *
entitize(const char * str)
{
    entity_types *ent;
    const char *scan;
    char *out, *pos, *tail;
    char scratch[20];
    int seq_len = 0;
    int codepoint = 0;
    int growth = 0;          /* net number of extra bytes needed */
    int std_hits = 0;        /* standard entity replacements */
    int other_hits = 0;      /* numeric character references */

    /* Pass 1a: size the standard entity replacements. */
    for (ent = stdentities; ent->text; ent++) {
        scan = strstr(str, ent->text);
        while (scan != NULL) {
            growth += strlen(ent->entity) - strlen(ent->text);
            std_hits++;
            scan = strstr(scan + strlen(ent->text), ent->text);
        }
    }

    /* Pass 1b: size the references for everything that has the high bit
     * set, i.e. anything outside U+0000..U+007F. */
    for (scan = str; *scan; scan++) {
        if (!(*scan & 0x80))
            continue;
        utf8_to_int(scan, &seq_len, &codepoint);
        scan += seq_len - 1;
        growth += sprintf(scratch, "&#x%x;", codepoint) - seq_len;
        other_hits++;
    }

    /* Room for the input, the net growth and the terminator. */
    out = g_malloc((strlen(str) + growth + 1));
    strcpy(out, str);

    /* Nothing to replace: hand back the plain copy. */
    if (std_hits == 0 && other_hits == 0)
        return out;

    /* Pass 2a: splice in the standard entities. The tail of the string is
     * duplicated first because source and destination would overlap. */
    if (std_hits != 0) {
        for (ent = stdentities; ent->text; ent++) {
            const int ent_len = strlen(ent->entity);
            const size_t text_len = strlen(ent->text);

            for (pos = strstr(out, ent->text);
                 pos != NULL;
                 pos = strstr(pos + ent_len, ent->text)) {
                tail = g_strdup(pos + text_len);
                strcpy(pos, ent->entity);
                strcpy(pos + ent_len, tail);
                g_free(tail);
            }
        }
    }

    /* Pass 2b: splice in the numeric character references. */
    if (other_hits != 0) {
        pos = out;
        while (*pos) {
            if (!(*pos & 0x80)) {
                pos++;
                continue;
            }

            utf8_to_int(pos, &seq_len, &codepoint);
            tail = pos[seq_len] ? g_strdup(pos + seq_len) : NULL;

            sprintf(pos, "&#x%x;", codepoint);
            pos += strlen(pos);

            if (tail) {
                strcpy(pos, tail);
                g_free(tail);
            }
        }
    }

    return out;
}
/**
 * Draw the contents of the viewer in hex mode.
 *
 * The area is first cleared with mcview_display_clean(). Starting at file
 * offset view->dpy_start, one screen row is drawn per outer-loop iteration
 * until mcview_get_byte() no longer returns TRUE for the current offset or
 * view->data_area.height rows have been drawn. A row consists of
 *   - the current offset (printed through hex_buff),
 *   - up to view->bytes_per_line bytes, two hex digits each, with separators,
 *   - the same bytes as characters, starting at column text_start.
 *
 * Nodes of view->change_list whose offset equals the offset being drawn
 * supply the byte value that is displayed instead of the one read with
 * mcview_get_byte().
 *
 * Side effects on *view: cursor_row / cursor_col are set when the byte at
 * view->hex_cursor is drawn (hex column or text column, depending on
 * view->hexview_in_text), and dpy_end is set to the offset at which drawing
 * stopped. mcview_place_cursor() is called before returning.
 *
 * @param view  viewer whose data area is drawn
 */
void
mcview_display_hex (mcview_t * view)
{
    const screen_dimen top = view->data_area.top;
    const screen_dimen left = view->data_area.left;
    const screen_dimen height = view->data_area.height;
    const screen_dimen width = view->data_area.width;
    const int ngroups = view->bytes_per_line / 4;
    const screen_dimen text_start = 8 + 13 * ngroups + ((width < 80) ? 0 : (ngroups - 1 + 1));
    /* 8 characters are used for the file offset, and every hex group
     * takes 13 characters. On "big" screens, the groups are separated
     * by an extra vertical line, and there is an extra space before the
     * text column.
     */

    screen_dimen row;
    off_t from;                 /* File offset of the byte currently being drawn */
    int c;
    mark_t boldflag = MARK_NORMAL;
    struct hexedit_change_node *curr = view->change_list;
#ifdef HAVE_CHARSET
    int ch = 0;                 /* Character printed in the text column when view->utf8 is set */
#endif /* HAVE_CHARSET */

    /* Holds the formatted offset: 8 hex digits, one space and the terminator.
     * NOTE(review): an offset needing more than 8 hex digits does not fit and
     * is presumably truncated by g_snprintf() -- confirm. */
    char hex_buff[10];
    int bytes;                  /* Number of bytes already printed on the line */

    mcview_display_clean (view);

    /* Find the first displayable changed byte */
    from = view->dpy_start;
    while (curr && (curr->offset < from))
    {
        curr = curr->next;
    }

    for (row = 0; mcview_get_byte (view, from, NULL) == TRUE && row < height; row++)
    {
        screen_dimen col = 0;
        size_t i;

        col = 0;

        /* Print the hex offset */
        g_snprintf (hex_buff, sizeof (hex_buff), "%08" PRIXMAX " ", (uintmax_t) from);
        widget_move (view, top + row, left);
        tty_setcolor (VIEW_BOLD_COLOR);
        /* col < width clips the output at the right edge of the data area */
        for (i = 0; col < width && hex_buff[i] != '\0'; i++)
        {
            tty_print_char (hex_buff[i]);
            col += 1;
        }
        tty_setcolor (VIEW_NORMAL_COLOR);

        for (bytes = 0; bytes < view->bytes_per_line; bytes++, from++)
        {

#ifdef HAVE_CHARSET
            if (view->utf8)
            {
                int cw = 1;
                gboolean read_res = TRUE;
                ch = mcview_get_utf (view, from, &cw, &read_res);
                if (!read_res)
                    break;
                /* char width is greater 0 bytes */
                if (cw != 0)
                {
                    int cnt;
                    char corr_buf[UTF8_CHAR_LEN + 1];
                    /* Remember the change-list position: the loop below
                     * advances curr, and it is restored afterwards so that
                     * the per-byte code further down sees the same nodes. */
                    struct hexedit_change_node *corr = curr;
                    int res;

                    res = g_unichar_to_utf8 (ch, (char *) corr_buf);

                    for (cnt = 0; cnt < cw; cnt++)
                    {
                        if (curr != NULL && from + cnt == curr->offset)
                        {
                            /* replace only changed bytes in array of multibyte char */
                            corr_buf[cnt] = curr->value;
                            curr = curr->next;
                        }
                    }
                    corr_buf[res] = '\0';
                    /* Determine the state of the current multibyte char */
                    ch = utf8_to_int ((char *) corr_buf, &cw, &read_res);
                    curr = corr;
                }
            }
#endif /* HAVE_CHARSET */
            if (!mcview_get_byte (view, from, &c))
                break;

            /* Save the cursor position for mcview_place_cursor() */
            if (from == view->hex_cursor && !view->hexview_in_text)
            {
                view->cursor_row = row;
                view->cursor_col = col;
            }

            /* Determine the state of the current byte */
            boldflag = mcview_hex_calculate_boldflag (view, from, curr);

            /* Determine the value of the current byte */
            if (curr != NULL && from == curr->offset)
            {
                c = curr->value;
                curr = curr->next;
            }

            /* Select the color for the hex number */
            tty_setcolor (boldflag == MARK_NORMAL ? VIEW_NORMAL_COLOR :
                          boldflag == MARK_SELECTED ? VIEW_BOLD_COLOR :
                          boldflag == MARK_CHANGED ? VIEW_UNDERLINED_COLOR :
                          /* boldflag == MARK_CURSOR */
                          view->hexview_in_text ? VIEW_SELECTED_COLOR : VIEW_UNDERLINED_COLOR);

            /* Print the hex number */
            widget_move (view, top + row, left + col);
            if (col < width)
            {
                tty_print_char (hex_char[c / 16]);
                col += 1;
            }
            if (col < width)
            {
                tty_print_char (hex_char[c % 16]);
                col += 1;
            }

            /* Print the separator (not after the last byte of the line) */
            tty_setcolor (VIEW_NORMAL_COLOR);
            if (bytes != view->bytes_per_line - 1)
            {
                if (col < width)
                {
                    tty_print_char (' ');
                    col += 1;
                }

                /* After every four bytes, print a group separator */
                if (bytes % 4 == 3)
                {
                    if (view->data_area.width >= 80 && col < width)
                    {
                        tty_print_one_vline (TRUE);
                        col += 1;
                    }
                    if (col < width)
                    {
                        tty_print_char (' ');
                        col += 1;
                    }
                }
            }

            /* Select the color for the character; this differs from the
             * hex color when boldflag == MARK_CURSOR */
            tty_setcolor (boldflag == MARK_NORMAL ? VIEW_NORMAL_COLOR :
                          boldflag == MARK_SELECTED ? VIEW_BOLD_COLOR :
                          boldflag == MARK_CHANGED ? VIEW_UNDERLINED_COLOR :
                          /* boldflag == MARK_CURSOR */
                          view->hexview_in_text ? VIEW_SELECTED_COLOR : MARKED_SELECTED_COLOR);

            /* Convert the value for display. Without HAVE_CHARSET only the
             * braced block below remains, replacing non-printable bytes
             * with '.'. */
#ifdef HAVE_CHARSET
            if (mc_global.utf8_display)
            {
                if (!view->utf8)
                {
                    c = convert_from_8bit_to_utf_c ((unsigned char) c, view->converter);
                }
                if (!g_unichar_isprint (c))
                    c = '.';
            }
            else if (view->utf8)
                ch = convert_from_utf_to_current_c (ch, view->converter);
            else
#endif
            {
#ifdef HAVE_CHARSET
                c = convert_to_display_c (c);
#endif

                if (!is_printable (c))
                    c = '.';
            }

            /* Print corresponding character on the text side */
            if (text_start + bytes < width)
            {
                widget_move (view, top + row, left + text_start + bytes);
#ifdef HAVE_CHARSET
                if (view->utf8)
                    tty_print_anychar (ch);
                else
#endif
                    tty_print_char (c);
            }

            /* Save the cursor position for mcview_place_cursor() */
            if (from == view->hex_cursor && view->hexview_in_text)
            {
                view->cursor_row = row;
                view->cursor_col = text_start + bytes;
            }
        }
    }

    /* Be polite to the other functions */
    tty_setcolor (VIEW_NORMAL_COLOR);

    mcview_place_cursor (view);
    /* from now holds the offset at which drawing stopped */
    view->dpy_end = from;
}
// Build a Stringset holding an unsigned int representation of every element
// of str.
//
//   str      list of integer vectors when intdist is nonzero, otherwise a
//            character vector
//   bytes    when nonzero (and intdist is zero) strings are converted with
//            char_to_int(), otherwise with utf8_to_int()
//   intdist  nonzero when str holds integer sequences
//
// In the returned object
//   str_len[i]  is the number of values of element i, or NA_INTEGER when the
//               element is NA (first value NA_INTEGER, or NA_STRING);
//   string[i]   points into data at the zero-terminated values of element i,
//               and is NULL for NA elements;
//   data        is one block holding all sequences back to back.
// All four allocations are made with malloc(); releasing them is up to the
// caller.
//
// Raises an R error (after releasing everything allocated here) when memory
// cannot be obtained or when utf8_to_int() reports a negative length.
Stringset *new_stringset(SEXP str, int bytes, int intdist){
  size_t nstr = length(str);

  // Upper bound for the number of values to store: vector lengths for integer
  // input, byte counts for strings. For UTF-8 this is more than needed.
  size_t nbytes = 0L;
  for (size_t i=0; i<nstr; i++){
    nbytes += length( intdist ? VECTOR_ELT(str,i) : STRING_ELT(str,i) );
  }

  Stringset *s = (Stringset *) malloc(sizeof(Stringset));
  if ( s == NULL ){
    error("Unable to allocate enough memory");
  }

  s->str_len = (int *) malloc(nstr * sizeof(int));
  s->string = (unsigned int **) malloc(nstr * sizeof(unsigned int *));
  // room for int rep of strings, including a trailing zero per element
  // (needed e.g. by the full dl-distance)
  s->data = (unsigned int *) malloc( (nstr + nbytes) * sizeof(int));

  // malloc(0) is allowed to return NULL, so NULL only signals failure when
  // something was actually requested.
  if ( nstr > 0 && ( s->str_len == NULL || s->string == NULL || s->data == NULL ) ){
    free(s->str_len);
    free(s->string);
    free(s->data);
    free(s);
    error("Unable to allocate enough memory");
  }

  int *t = s->str_len;
  unsigned int *d = s->data;

  for (size_t i=0L; i < nstr; i++, t++){
    int n;
    int is_na;

    if ( intdist ){
      SEXP elt = VECTOR_ELT(str,i);
      n = length(elt);
      // a zero-length vector has no first value to inspect, so it is not NA
      is_na = ( n > 0 && INTEGER(elt)[0] == NA_INTEGER );
      if ( !is_na && n > 0 ){
        memcpy(d, INTEGER(elt), n * sizeof(int));
      }
    } else {
      SEXP elt = STRING_ELT(str,i);
      n = 0;
      is_na = ( elt == NA_STRING );
      if ( !is_na ){
        if ( bytes ){
          n = char_to_int(CHAR(elt), d);
        } else {
          n = utf8_to_int(CHAR(elt), d);
          if ( n < 0 ){
            // a negative length would make the writes below land outside data
            free(s->str_len);
            free(s->string);
            free(s->data);
            free(s);
            error("Encountered byte sequence not representing an utf-8 character.\n");
          }
        }
      }
    }

    if ( is_na ){
      (*t) = NA_INTEGER;
      s->string[i] = NULL;
      continue;
    }

    (*t) = n;
    s->string[i] = d;
    d[n] = 0; // append a zero.
    d += n + 1L;
  }

  return s;
}