static FILE* fopen_win(const char* utf8path, const char* perm) { if (is_ascii(utf8path)) { return fopen(utf8path, perm); } const char* ptr = utf8path; const char* end = utf8path + strlen(utf8path); size_t n = 0; while (ptr < end) { SkUnichar u = SkUTF8_NextUnicharWithError(&ptr, end); if (u < 0) { return nullptr; // malformed UTF-8 } n += SkUTF16_FromUnichar(u); } std::vector<uint16_t> wchars(n + 1); uint16_t* out = wchars.data(); for (const char* ptr = utf8path; ptr < end;) { out += SkUTF16_FromUnichar(SkUTF8_NextUnicharWithError(&ptr, end), out); } SkASSERT(out == &wchars[n]); *out = 0; // final null wchar_t wperms[4] = {(wchar_t)perm[0], (wchar_t)perm[1], (wchar_t)perm[2], (wchar_t)perm[3]}; return _wfopen((wchar_t*)wchars.data(), wperms); }
/*
 * Appends a length-prefixed byte string to the dump buffer.
 *
 * c       - string to store; NULL is stored as a zero length with no payload
 * d       - dump buffer (may be reallocated; the new pointer is returned)
 * off     - in/out write offset into d
 * size    - number of bytes of c to store
 * convert - when non-zero and c is not pure ASCII, a copy of c is converted
 *           from Charset to "utf-8" and the converted text (including its
 *           terminating NUL) is stored instead
 *
 * Returns the (possibly moved) dump buffer.
 */
static unsigned char * dump_char_size(char *c, unsigned char *d, int *off, ssize_t size, int convert)
{
  char *p = c;

  if (c == NULL)
  {
    size = 0;
    d = dump_int(size, d, off);
    return d;
  }

  if (convert && !is_ascii (c, size))
  {
    p = mutt_substrdup (c, c + size);
    /* On success store the converted text; on failure the unconverted
     * duplicate is stored with the original size. */
    if (mutt_convert_string (&p, Charset, "utf-8", 0) == 0)
      size = mutt_strlen (p) + 1;
  }

  d = dump_int(size, d, off);
  lazy_realloc(&d, *off + size);
  memcpy(d + *off, p, size);
  *off += size;

  /* c is deliberately left untouched so this test is true whenever a
   * duplicate was made; the old code set c = p on success and therefore
   * leaked the converted copy. */
  if (p != c)
    FREE(&p);

  return d;
}
/* * Takes a string of UTF8-encoded data and strips all characters we cannot use. * * - multibyte characters we can display are mapped over * - all other multibyte characters are stripped * - characters we cannot handle are turned into nulls * * Also see: http://en.wikipedia.org/wiki/Utf8 */ char *utf8_strip(const char *dirty) { // optimistic result: we end up with the same string uint8_t length = strlen(dirty), ci = 0, di = 0; unsigned char bytes = 0; unsigned char current = 0x00, next = 0x00; char *cleaned = ALLOC_STR(length); // iterate character by character and replace it for (di = 0, ci = 0; di < length; di++) { current = dirty[di]; if ( ! valid_utf8(current)) // invalid byte { continue; } else if ( ! is_ascii(current)) // multibyte { if (current == 0xC3 && (di + 1) < length) // might be åäöÅÄÖ, they all are 0xC3xx { next = dirty[++di]; // we consume the next character if (is_ascii(next)) // somehow, next byte is ascii (invalid utf8), so abort { // we cannot safely map the next byte in our charmap as it’ll collide // with the ascii characters which might be bad! continue; } else { current = next; } } else // skip all the additional bytes { bytes = (current & 0xF0); // 1111 xxxx while (bytes <<= 1) di += 1; current = '\0'; // let charmap handle it } } cleaned[ci++] = charmap[current]; } return cleaned; }
/*
 * Walks one tab-separated line and hands every column that contains a
 * non-ASCII byte to string_process(), unless the column number is listed in
 * g_ignoreCol[g_ignoreIdx].
 *
 * g_nCurPos tracks the offset at which the current column starts. The value
 * passed in nStrLen is not read; it is only overwritten locally.
 * Always returns 1.
 */
int filter_tab_buffer(char* pBuffer, int nStrLen){
    char* cursor = pBuffer;
    char* nextTab = NULL;
    int   fieldLen = 0;
    int   column = 1;
    int   slot;
    int   skipColumn;

    g_nCurPos = 0;

    for (;;) {
        /* the scan stops one byte before the terminator */
        if ( *cursor == '\0' || *(cursor+1) == '\0' )
            break;

        if ( is_ascii((unsigned char)(*cursor)) ){
            /* plain byte: a tab opens the next column */
            if ( is_tab(*cursor) ){
                g_nCurPos = (int)(cursor - pBuffer) + 1;
                column ++;
            }
            cursor++;
            continue;
        }

        /* non-ASCII byte: the column runs up to the next tab (if any) */
        nextTab = strchr(cursor, '\t');

        /* is this column on the ignore list? (a 0 entry ends the list) */
        skipColumn = 0;
        if ( g_ignoreIdx > -1 ){
            slot = 0;
            while ( slot < MAX_IGNORE_COL_NUM
                    && g_ignoreCol[g_ignoreIdx][slot] != 0 ){
                if ( g_ignoreCol[g_ignoreIdx][slot] == column ){
                    skipColumn = 1;
                    break;
                }
                slot++;
            }
        }

        if ( nextTab == NULL ){
            /* last column: process through end of line and stop */
            nStrLen = (int)strlen(pBuffer);
            fieldLen = nStrLen - g_nCurPos;
            if ( skipColumn == 0 )
                string_process(&pBuffer[g_nCurPos], fieldLen, 0);
            break;
        }

        fieldLen = (int)(nextTab - pBuffer) - g_nCurPos;
        if ( skipColumn == 0 )
            fieldLen = string_process(&pBuffer[g_nCurPos], fieldLen, 0);
        else
            fieldLen++;

        /* resume after the column, using the length computed above */
        cursor = pBuffer + g_nCurPos + fieldLen;
        g_nCurPos = (int)(cursor - pBuffer);
        column ++;
    }

    return 1;
}
/*
 * Filters one line of an INI file.
 *
 * - Remark lines (per is_ini_remark) are skipped and 1 is returned.
 * - When is_ini_section() returns 1, the text after the first character and
 *   before the last ']' is scanned; the ']' is temporarily replaced by a
 *   terminator and put back at the end.
 * - When is_ini_section() returns 2, or the line has no '=', the whole line
 *   is passed to string_process() with a final argument of 1 and 0 is
 *   returned.
 * - Otherwise the text after the first '=' is scanned.
 *
 * "Scanned" means: if a non-ASCII byte is found, the text is handed to
 * string_process() with a final argument of 0. Returns 1 in these cases.
 * The value passed in nStrLen is not used.
 */
int filter_ini_buffer(char* pBuffer, int nStrLen){
    int nRet;
    char* pStr = NULL;
    char* pStr2 = NULL;
    char cTail = '\0';

    g_nCurPos = 0;

    /* a remark line is not processed */
    if( is_ini_remark((unsigned char)pBuffer[0], (unsigned char)pBuffer[1]) )
        return 1;

    /* section header: cut the line at the last ']' and scan what is in front of it */
    nRet = is_ini_section(pBuffer);
    if( nRet == 1 ){
        /* NOTE(review): assumes is_ini_section()==1 guarantees a ']' exists;
           strrchr() returning NULL would be dereferenced here - confirm */
        pStr2 = strrchr(pBuffer, ']');
        *pStr2 = '\0';
        pStr = pBuffer;
        cTail = ']';
        goto PROCESS;
    }
    else if( nRet == 2){
        string_process(pBuffer, (int)strlen(pBuffer), 1);
        return 0;
    }

    /* a line that is neither a section nor a key is reported as an error */
    pStr = strchr(pBuffer, '=');
    if ( pStr == NULL ){
        string_process(pBuffer, (int)strlen(pBuffer), 1);
        return 0;
    }

    /* scan the value (or section name) for non-ASCII bytes */
PROCESS:
    /* step over the '=' (or over the first character of a section line) */
    pStr += 1;
    pStr2 = pStr;
    /* NOTE(review): for a section line this early return leaves the ']'
       replaced by a terminator - confirm that is intended */
    if ( *pStr == '\0' )
        return 1;
    /* default: index just past the text, used below to re-attach cTail */
    nRet = (int)strlen(pStr2)+1;
    while( (*(pStr) != '\0') && (*(pStr+1) != '\0') ){
        if ( !is_ascii((unsigned char)(*pStr)) ){
            nRet = string_process(pStr2, (int)strlen(pStr2), 0);
            break;
        }
        pStr ++;
    }
    /* put the section's closing bracket back and terminate after it */
    if ( cTail != '\0' ){
        pStr2[nRet-1] = cTail;
        pStr2[nRet] = '\0';
    }
    return 1;
}
/*
 * Returns non-zero when the captured bytes from PMPROXY_START_OF_PACKET to
 * the end of the buffer look like a proxy exchange: only ASCII characters
 * (eg "localhost 44321") with '\n' as the final byte.
 */
static int looks_like_proxy_exchange(tvbuff_t *tvb)
{
    const gint remaining = tvb_ensure_captured_length_remaining(tvb, PMPROXY_START_OF_PACKET);
    const guchar *bytes = tvb_get_ptr(tvb, PMPROXY_START_OF_PACKET, remaining);

    if (!is_ascii(bytes, remaining)) {
        return 0;
    }

    /* the exchange must be newline-terminated */
    return bytes[remaining - 1] == '\n';
}
/**
 * Reads the whole of `file` into memory and attaches it to U as POST data.
 *
 * The file name is trimmed first. When is_ascii(filename) is true the file
 * is opened in text mode and the loaded text is trimmed; otherwise it is
 * opened in binary mode and posted as-is. If anything was loaded, the
 * content type is derived from the file name with get_content_type().
 *
 * Open, seek and read failures are reported through NOTIFY(ERROR, ...) and
 * leave U untouched.
 */
void load_file(URL U, char *file)
{
  FILE   *fp;
  long    size;
  size_t  len;
  char   *buf;
  char   *filename;
  int     ascii;

  filename = trim(file);
  ascii    = is_ascii(filename) ? 1 : 0;

  fp = fopen(filename, ascii ? "r" : "rb");
  if (! fp) {
    NOTIFY(ERROR, "unable to open file: %s", filename );
    return;
  }

  /* Find the file size; ftell() returns -1 on failure and must not be
   * turned into a huge size_t. */
  if (fseek(fp, 0, SEEK_END) != 0 || (size = ftell(fp)) < 0 || fseek(fp, 0, SEEK_SET) != 0) {
    NOTIFY(ERROR, "unable to read file: %s", filename );
    fclose(fp);
    return;
  }
  len = (size_t)size;

  buf = (char *)xmalloc(len+1);
  if ((fread(buf, 1, len, fp )) != len) {
    /* A short read leaves buf partly uninitialised: do not post it. */
    NOTIFY(ERROR, "unable to read file: %s", filename );
    fclose(fp);
    xfree(buf);
    return;
  }
  fclose(fp);

  if (ascii) {
    buf[len] = '\0';
    trim(buf);
    len = strlen(buf);
  }

  if (len > 0) {
    url_set_conttype(U, get_content_type(filename));
    url_set_postdata(U, buf, len);
  }

  xfree(buf);
  return;
}
/*
 * Returns how many bytes the character introduced by lead byte c occupies:
 * 1 for ASCII, 2 for kanji or half-width kana, 3 for the is_hojyo() class,
 * and 0 when c matches none of these.
 */
static size_t skip_bytes(char c)
{
    size_t width = 0;

    if (is_ascii(c))
        width = 1;
    else if (is_kanji(c) || is_hankana(c))
        width = 2;
    else if (is_hojyo(c))
        width = 3;

    return width;
}
/*
 * Converts EUC-JP input by first rewriting it as SJIS in a scratch buffer
 * and then running that buffer through mssjis_iconv().
 *
 * srcbuf/srclen - in: input and its length; out: *srcbuf points just past
 *                 the consumed input and *srclen is 0
 * outbuf/outlen - passed through to mssjis_iconv()
 *
 * Returns 0 when any pointer argument is NULL, (size_t)-1 with errno set to
 * ENOMEM or EILSEQ on allocation failure or unsupported/ill-formed input
 * (*srcbuf then points at the offending sequence), otherwise the result of
 * mssjis_iconv().
 */
static size_t eucjp_iconv(iconv_t cd, char **srcbuf, size_t *srclen, char **outbuf, size_t *outlen)
{
    unsigned char *tmpbuf, *tmp;
    unsigned char *src, *end;
    unsigned char ch, cl;
    char *sjis;
    size_t sjislen;
    size_t ret;

    if (! (srcbuf && srclen && outbuf && outlen))
        return 0;

    src = (unsigned char *)*srcbuf;
    end = src + *srclen;

    /* SJIS output is never longer than the EUC-JP input */
    tmp = tmpbuf = malloc(*srclen + 2);
    if (tmpbuf == NULL) {
        errno = ENOMEM;
        return (size_t)-1;
    }

    /* translate EUC-JP into SJIS; bounded by the INPUT length so that
     * shrinking sequences cannot make us read past the caller's buffer */
    while (src < end && *src) {
        ch = *src++;
        if (is_ascii(ch)) {
            *tmp++ = ch;
            continue;
        }

        /* a lead byte must be followed by a trail byte */
        if (src >= end || *src == '\0') {
            *srcbuf = (char *)(src - 1);
            free(tmpbuf);
            errno = EILSEQ;
            return (size_t)-1;
        }
        cl = *src++;

        if (is_kanji(ch)) {
            *tmp++ = ((ch-0x5f)/2) ^ 0xA0;
            if (!(ch&1))
                *tmp++ = cl - 0x02;
            else if (cl < 0xE0)
                *tmp++ = cl - 0x61;
            else
                *tmp++ = cl - 0x60;
        } else if (is_hankana(ch) && cl >= 0xA0 && cl <= 0xDF) {
            *tmp++ = cl;
        } else {
            /* bad half-width kana trail byte, or JIS X 0212 (unsupported) */
            *srcbuf = (char *)(src - 2);
            free(tmpbuf);
            errno = EILSEQ;
            return (size_t)-1;
        }
    }
    *tmp = '\0';

    /* Give the inner converter its own cursor and the real SJIS length, so
     * tmpbuf still points at the allocation when it is freed. */
    sjis = (char *)tmpbuf;
    sjislen = (size_t)(tmp - tmpbuf);
    ret = mssjis_iconv(cd, &sjis, &sjislen, outbuf, outlen);
    free(tmpbuf);

    /* everything up to src has been consumed */
    *srcbuf = (char *)src;
    *srclen = 0;
    return ret;
}
//Note: there is a safer version in GbUtil.* that writes to a SafeBuf. // . convert "-->%22 , &-->%26, +-->%2b, space-->+, ?-->%3f is that it? // . convert so we can display as a cgi PARAMETER within a url // . used by HttPage2 (cached web page) to encode the query into a url // . used by PageRoot to do likewise // . returns bytes written into "d" not including terminating \0 int32_t urlEncode ( char *d , int32_t dlen , const char *s , int32_t slen, bool requestPath ) { char *dstart = d; // subtract 1 to make room for a terminating \0 char *dend = d + dlen - 1; const char *send = s + slen; for ( ; s < send && d < dend ; s++ ) { if ( *s == '\0' && requestPath ) { *d++ = *s; continue; } // encode if not fit for display if ( ! is_ascii ( *s ) ) goto encode; switch ( *s ) { case ' ': goto encode; case '&': goto encode; case '"': goto encode; case '+': goto encode; case '%': goto encode; case '#': goto encode; // encoding < and > are more for displaying on an // html page than sending to an http server case '>': goto encode; case '<': goto encode; case '?': if ( requestPath ) break; goto encode; } // otherwise, no need to encode *d++ = *s; continue; encode: // space to + if ( *s == ' ' && d + 1 < dend ) { *d++ = '+'; continue; } // break out if no room to encode if ( d + 2 >= dend ) break; *d++ = '%'; // store first hex digit unsigned char v = ((unsigned char)*s)/16 ; if ( v < 10 ) v += '0'; else v += 'A' - 10; *d++ = v; // store second hex digit v = ((unsigned char)*s) & 0x0f ; if ( v < 10 ) v += '0'; else v += 'A' - 10; *d++ = v; } // NULL terminate it *d = '\0'; // and return the length return d - dstart; }
// Percent-encodes *str in place: every character that is not ASCII, or that
// is_special_character() flags, is replaced by "%xx"; all other characters
// are copied through unchanged.
void url_encode(lfl_string* str)
{
	lfl_string out;
	for (int i = 0; i < str->length(); i++) {
		char c = (*str)[i];
		if (!is_ascii(c) || is_special_character(c)) {
			// Always two zero-padded hex digits of the byte value. The old
			// "%2x" padded with a space for values below 0x10 and, where
			// char is signed, printed bytes >= 0x80 as "ffffffXX".
			out += string_printf("%%%02x", (unsigned)(unsigned char)c);
		} else {
			// Pass this character straight through.
			out += c;
		}
	}
	*str = out;
}
/*
 * Tells whether character c has to be escaped on output.
 *
 * CR, LF, TAB, VT, ESC and space each answer with their own esc_* setting.
 * Any other character needs escaping exactly when it is not ASCII.
 */
int needs_escaping(int c)
{
	if (c == '\r') return esc_cr;
	if (c == '\n') return esc_lf;
	if (c == '\t') return esc_tabs;
	if (c == '\v') return esc_vtab;
	if (c == '\e') return esc_ansi;
	if (c == ' ')  return esc_space;

	return is_ascii(c) ? false : true;
}
int main(int argc, char *argv[]) { bool is_ascii(char str[]); void encryption_match(char word[], char slt[], char encrypted_pwd[], FILE *fptr_words); /* verify contents of command-line arg's, encrypted password, is composed entirely of ascii characters, string in argv[1] has a legth of 13, and is not empty, or more than one arg. */ if ( argc != 2 || ! is_ascii( argv[1]) || strlen(argv[1]) != 13 ) { printf ("\n*** An ERROR occured and the program has closed.***\n" "Input: ./crack <encrypted password>. \nWhere <encrypted " "password> must be 13 ASCII characters in length.\n"); return 1; } char encrypted_pwd [strlen(argv[1])]; char slt[2], word[81]; cpy_n_pst(argv[1], 0, 13, encrypted_pwd, 0); // assigns encrypted_pwd cpy_n_pst(argv[1], 0, 2, slt, 0); // assigns salt /* altternative dictionary file is exhaustive, and contains 4,160,636 entries. It is slow to load, and it contains commonly used passwords, and numbers, user names from above, and some words associated with Harvard, "/home/jharvard/Dropbox/wiki.txt". */ fptr_wiki = fopen("/usr/share/dict/words", "r"); if (fptr_wiki == 0) { printf("\nError opening the file requested.\n"); return 1; } /* fucntion calls (encryption_match), which runs a dictionary attack. If it fails to locate a match, it then proceeds to run a brute force attack. If both attacks fails the function retuns to main.*/ encryption_match(word, slt, encrypted_pwd, fptr_wiki); if ( match == false ) printf ("\nMATCH NOT FOUND.\n"); fclose(fptr_wiki); return 0; }
// Builds a Python path shape from an SVG-style path definition.
//
// Throws ValueError when the definition contains non-ASCII characters,
// cannot be parsed into any points, or does not start with a Move entry.
// The shape is created with maybeSettings merged over the default path
// settings.
PyObject* create_Path(const utf8_string& path, const Optional<Settings>& maybeSettings)
{
  // Fixme: Duplicates py-canvas.cpp
  const bool asciiOnly = is_ascii(path);
  if (!asciiOnly){
    // Fixme: Consider adding ascii_string type
    throw ValueError("Non-ascii-characters in path definition.");
  }

  std::vector<PathPt> pathPoints(parse_svg_path(path.str()));
  if (pathPoints.empty()){
    throw ValueError("Failed parsing path definition.");
  }
  if (pathPoints.front().IsNotMove()){
    throw ValueError("Paths must begin with a Move-entry.");
  }

  const auto settings = merge_settings(maybeSettings, default_path_settings());
  return create_Shape(create_path_object_raw(Points(pathPoints), settings));
}
/**
 * Loads up to POSTBUF bytes of `file` and attaches them to U as POST data.
 *
 * The file name is trimmed first. When is_ascii(filename) is true the
 * loaded text is trimmed and its string length is posted; otherwise the raw
 * bytes are posted. Files larger than POSTBUF are truncated with a warning.
 * A missing file is ignored silently; other failures are reported through
 * NOTIFY(ERROR, ...). Nothing is posted when no data was loaded.
 */
void load_file(URL U, char *file)
{
  FILE *fp;
  size_t len = 0;
  struct stat st;
  char *filename;
  /* one spare byte keeps the buffer NUL-terminated when the file fills it */
  char postdata[POSTBUF + 1];
  size_t postlen = 0;

  filename = trim(file);
  memset(postdata, 0, sizeof(postdata));

  /* stat() rather than lstat(): fopen() follows symlinks, so the size must
   * be that of the target. On failure `st` is not valid and must not be
   * read. */
  if (stat(filename, &st) != 0) {
    if (errno != ENOENT) {
      NOTIFY(ERROR, "could not open file: %s", filename);
    }
    return;
  }

  len = (st.st_size >= POSTBUF) ? POSTBUF : st.st_size;
  if (st.st_size > POSTBUF) {
    NOTIFY(WARNING, "Truncated file: %s exceeds the post limit of %d bytes.\n", filename, POSTBUF);
  }

  if ((fp = fopen(filename, "r")) == NULL) {
    NOTIFY(ERROR, "could not open file: %s", filename);
    return;
  }
  if ((fread(postdata, 1, len, fp )) == len) {
    if (is_ascii(filename)) {
      trim(postdata);
      postlen = strlen(postdata);
    } else {
      postlen = len;
    }
  } else {
    NOTIFY(ERROR, "unable to read file: %s", filename );
  }
  fclose(fp);

  /* decide on the byte count, not strlen(): binary data may begin with NUL */
  if (postlen > 0) {
    url_set_conttype(U, get_content_type(filename));
    url_set_postdata(U, postdata, postlen);
  }
  return;
}
// Counts the UTF-16 code units needed for the UTF-8 sequence [first, last).
//
// Each character counts as one unit, except 4-byte sequences which count as
// two. Lead bytes are classified with is_ascii/is_2byte/.../is_6byte, and
// the scan stops at the first byte that matches none of them. If the scan
// does not land exactly on `last`, "Not a UTF-8 string" is raised.
size_t utf8_length(InputIterator first, InputIterator last, UTF16Type) {
    size_t units = 0;
    while (first < last) {
        if (is_ascii(*first)) {
            first += 1;
        }
        else if (is_2byte(*first)) {
            first += 2;
        }
        else if (is_3byte(*first)) {
            first += 3;
        }
        else if (is_4byte(*first)) {
            first += 4;
            ++units;    // the extra unit of a two-unit character
        }
        else if (is_5byte(*first)) {
            first += 5;
        }
        else if (is_6byte(*first)) {
            first += 6;
        }
        else {
            break;      // unrecognised lead byte: not counted
        }
        ++units;
    }
    if (first != last) {
        HPROSE_THROW_EXCEPTION("Not a UTF-8 string");
    }
    return units;
}
/*
 * Benchmarks is_ascii() and the UTF-8 validators on an N-byte buffer.
 *
 * populate(data, N) is passed to BEST_TIME as the preparation step of every
 * measurement, and every validator is expected to return true.
 */
void demo(size_t N) {
  printf("string size = %zu \n", N);
  char *data = (char *)malloc(N);
  if (data == NULL && N > 0) {
    /* nothing can be measured without the buffer */
    fprintf(stderr, "demo: unable to allocate %zu bytes\n", N);
    return;
  }
  bool expected = true; // it is all ascii?
  int repeat = 5;
  printf("We are feeding ascii so it is always going to be ok.\n");

  BEST_TIME(is_ascii(data, N), expected,populate(data,N) , repeat, N, true);
  BEST_TIME(validate_utf8(data, N), expected,populate(data,N) , repeat, N, true);
  BEST_TIME(validate_utf8_branchless(data, N), expected,populate(data,N) , repeat, N, true);
  BEST_TIME(validate_utf8_double(data, N), expected,populate(data,N) , repeat, N, true);
  BEST_TIME(shiftless_validate_utf8(data, N), expected,populate(data,N) , repeat, N, true);
  BEST_TIME(shiftless_validate_utf8_branchless(data, N), expected,populate(data,N) , repeat, N, true);
  BEST_TIME(shiftless_validate_utf8_double(data, N), expected,populate(data,N) , repeat, N, true);
  BEST_TIME(validate_utf8_sse_nocheating(data, N), expected,populate(data,N) , repeat, N, true);
  BEST_TIME(validate_utf8_sse(data, N), expected,populate(data,N) , repeat, N, true);

  free(data);
}
std::basic_string<Ch> create_escapes(const std::basic_string<Ch> &s) { std::basic_string<Ch> result; typename std::basic_string<Ch>::const_iterator b = s.begin(); typename std::basic_string<Ch>::const_iterator e = s.end(); while (b != e) { // This assumes an ASCII superset. But so does everything in PTree. // We escape everything outside ASCII, because this code can't // handle high unicode characters. if (*b == 0x20 || *b == 0x21 || (*b >= 0x23 && *b <= 0x2E) || (*b >= 0x30 && *b <= 0x5B) || (*b >= 0x5D && is_ascii(*b))) result += *b; else if (*b == Ch('\b')) result += Ch('\\'), result += Ch('b'); else if (*b == Ch('\f')) result += Ch('\\'), result += Ch('f'); else if (*b == Ch('\n')) result += Ch('\\'), result += Ch('n'); else if (*b == Ch('\r')) result += Ch('\\'), result += Ch('r'); else if (*b == Ch('\t')) result += Ch('\\'), result += Ch('t'); else if (*b == Ch('/')) result += Ch('\\'), result += Ch('/'); else if (*b == Ch('"')) result += Ch('\\'), result += Ch('"'); else if (*b == Ch('\\')) result += Ch('\\'), result += Ch('\\'); else { const char *hexdigits = "0123456789ABCDEF"; typedef typename make_unsigned<Ch>::type UCh; unsigned long u = (std::min)(static_cast<unsigned long>( static_cast<UCh>(*b)), 0xFFFFul); int d1 = u / 4096; u -= d1 * 4096; int d2 = u / 256; u -= d2 * 256; int d3 = u / 16; u -= d3 * 16; int d4 = u; result += Ch('\\'); result += Ch('u'); result += Ch(hexdigits[d1]); result += Ch(hexdigits[d2]); result += Ch(hexdigits[d3]); result += Ch(hexdigits[d4]); } ++b; } return result; }
/*
 * Scans every line of buffer b for words that start with prefix p and
 * passes each hit to add_string().
 *
 * A word is reported when it is strictly longer than p, its first p_len
 * bytes compare equal to p (case-sensitively when case_search is set,
 * otherwise with strncasecmp), and either b's encoding equals `encoding` or
 * the word is pure ASCII. The word containing the cursor position of the
 * current buffer is skipped. `ext` is forwarded to add_string() unchanged.
 *
 * Scanning ends early when `stop` is set or count_scanned reaches
 * MAX_AUTOCOMPLETE_SCAN. Every exit path finishes with
 * add_string(NULL, -1, 0) - presumably an end-of-source marker; confirm
 * against add_string().
 */
static void search_buff(const buffer *b, char * p, const int encoding, const bool case_search, const int ext) {
	assert(p);
	const int p_len = strlen(p);
	/* prefix comparison routine, chosen once */
	const int (*cmp)(const char *, const char *, size_t) = case_search ? strncmp : strncasecmp;

	/* the loop ends when a node has no successor, so the last node in the
	   list is never used as a line */
	for(line_desc *ld = (line_desc *)b->line_desc_list.head, *next; next = (line_desc *)ld->ld_node.next; ld = next) {
		/* l and r are byte offsets of the current word's left and right edge */
		int64_t l = 0, r = 0;
		do {
			/* find left edge of word */
			while (l < ld->line_len - p_len && !ne_isword(get_char(&ld->line[l], b->encoding), b->encoding)) l += get_char_width(&ld->line[l], b->encoding);
			if (l < ld->line_len - p_len ) {
				int ch;
				/* find right edge of word */
				r = l + get_char_width(&ld->line[l], b->encoding);
				/* accept "'" as a word character if it is followed by another word character,
				   so that words like "don't" are not broken into "don" and "t". */
				while (r < ld->line_len && ( ne_isword(ch=get_char(&ld->line[r], b->encoding), b->encoding) || ( r+1 < ld->line_len && ch == '\'' && ne_isword(get_char(&ld->line[r+1], b->encoding), b->encoding)) ) ) r += get_char_width(&ld->line[r], b->encoding);
				/* skip the word under the cursor; require a longer word in a
				   compatible encoding that starts with the prefix */
				if ((b != cur_buffer || ld != b->cur_line_desc || b->cur_pos < l || r < b->cur_pos) && r - l > p_len && (b->encoding == encoding || is_ascii(&ld->line[l], r - l)) && !cmp(p, &ld->line[l], p_len)) add_string(&ld->line[l], r - l, ext);
				l = r;
				count_scanned++;
			}
			assert(l <= ld->line_len);
			/* interrupted, or scanned enough words: finish up and leave */
			if (stop || count_scanned >= MAX_AUTOCOMPLETE_SCAN) {
				add_string(NULL, -1, 0);
				return;
			}
		} while (l < ld->line_len - p_len);
	}
	add_string(NULL, -1, 0);
}
/*
 * Reads a length-prefixed byte string from dump buffer d at offset *off.
 *
 * c       - out: newly allocated copy of the stored bytes, or NULL when the
 *           stored length is 0
 * d       - dump buffer to read from
 * off     - in/out read offset; advanced past the length and the payload
 * convert - when non-zero and the stored bytes are not pure ASCII, *c is
 *           replaced by a copy converted from "utf-8" to Charset (the
 *           unconverted bytes are kept if conversion fails)
 */
static void restore_char(char **c, const unsigned char *d, int *off, int convert)
{
  unsigned int size;

  restore_int(&size, d, off);

  if (size == 0)
  {
    *c = NULL;
    return;
  }

  *c = safe_malloc(size);
  memcpy(*c, d + *off, size);

  if (convert && !is_ascii (*c, size))
  {
    char *tmp = safe_strdup (*c);
    if (mutt_convert_string (&tmp, "utf-8", Charset, 0) == 0)
    {
      /* Hand ownership of the converted buffer to the caller. Going
       * through mutt_str_replace() would duplicate tmp and leak it. */
      FREE (c);
      *c = tmp;
    }
    else
    {
      FREE(&tmp);
    }
  }

  *off += size;
}
// Prints a single scalar field value to stdout, formatted by field->type.
//
// data points at the raw value. For user types only a placeholder is
// printed: "<unknown-user-type>" when db has no type named field->typestr,
// "<USER>" when usertype_count is NULL, otherwise "<n>" where n is the
// freshly incremented *usertype_count. Unhandled field types print "???"
// and log an error to stderr.
static void print_value_scalar(TypeDb& db, zcm_field_t *field, void *data, int *usertype_count)
{
    switch(field->type) {

        case ZCM_FIELD_BYTE:
        case ZCM_FIELD_INT8_T: {
            const int8_t value = *static_cast<int8_t *>(data);
            printf(" %d", value);
            // also show the character when the byte is ASCII
            if(is_ascii(value))
                printf(" (%c)", value);
            return;
        }

        case ZCM_FIELD_INT16_T:
            printf("% d", *static_cast<int16_t *>(data));
            return;

        case ZCM_FIELD_INT32_T:
            printf("% d", *static_cast<int32_t *>(data));
            return;

        case ZCM_FIELD_INT64_T:
            printf("% " PRIi64 "", *static_cast<int64_t *>(data));
            return;

        case ZCM_FIELD_FLOAT:
            printf("% f", *static_cast<float *>(data));
            return;

        case ZCM_FIELD_DOUBLE:
            printf("% f", *static_cast<double *>(data));
            return;

        case ZCM_FIELD_STRING:
            printf("\"%s\"", *static_cast<const char **>(data));
            return;

        case ZCM_FIELD_BOOLEAN: {
            const bool isTrue = (*static_cast<int8_t *>(data)) == 1;
            printf("%s", isTrue ? "true" : "false");
            return;
        }

        case ZCM_FIELD_USER_TYPE:
            if (!db.getByName(field->typestr)) {
                printf("<unknown-user-type>");
            } else if (usertype_count == NULL) {
                printf("<USER>");
            } else {
                *usertype_count += 1;
                printf("<%d>", *usertype_count);
            }
            return;

        default:
            printf("???");
            fprintf(stderr, "ERR: failed to handle zcm message field type: %s\n", field->typestr);
            return;
    }
}
// . splits the text at "s" into alternating "words" and appends them to the
//   parallel arrays m_words/m_wordLens/m_wordIds/m_nodes (and m_tagIds when
//   it is allocated)
// . three kinds of words are produced:
//   - tag words: a whole tag, when m_hasTags is set and isTagStart() agrees
//   - punct words: runs of non-alnum chars, word id 0
//   - alnum words: runs of alnum chars, hashed into m_wordIds only when
//     "computeWordIds" is true
// . scanning stops at the first \0, once "i" reaches "nodeLen", or when
//   m_numWords has reached m_preCount
// . every path returns true
bool Words::addWords( char *s, int32_t nodeLen, bool computeWordIds ) {
	// i is the scan offset into s, j the start of the current alnum word
	int32_t i = 0;
	int32_t j;
	int32_t wlen;

	// only one apostrophe is accepted inside an alnum word
	bool hadApostrophe = false;

	// script of the most recent word char, used to split on script change
	UCScript oldScript = ucScriptCommon;
	UCScript saved;
	UCProps props;

 uptop:

	// bad utf8 can cause a breach
	if ( i >= nodeLen ) {
		goto done;
	}

	if ( ! s[i] ) {
		goto done;
	}

	if ( !is_alnum_utf8( s + i ) ) {
		if ( m_numWords >= m_preCount ) {
			goto done;
		}

		// tag?
		if ( s[i]=='<' && m_hasTags && isTagStart(s+i) ) {
			// get the tag id
			if( m_tagIds ) {
				if ( s[i + 1] == '/' ) {
					// skip over / and flag the id as an end tag
					m_tagIds[m_numWords] = ::getTagId( s + i + 2 );
					m_tagIds[m_numWords] |= BACKBIT;
				} else {
					m_tagIds[m_numWords] = ::getTagId( s + i + 1 );
				}
			}

			m_words[m_numWords] = s + i;
			m_wordIds[m_numWords] = 0LL;

			// skip till end
			int32_t tagLen = getTagLen( s + i );
			m_wordLens[m_numWords] = tagLen;
			m_nodes[m_numWords] = 0;
			m_numWords++;

			// advance
			i += tagLen;
			goto uptop;
		}

		// it is a punct word, find end of it
		char *start = s+i;
		for ( ; s[i] ; i += getUtf8CharSize(s+i)) {
			// stop on < if we got tags
			if ( s[i] == '<' && m_hasTags ) {
				break;
			}

			// if we are simple ascii, skip quickly
			if ( is_ascii(s[i]) ) {
				// accumulate NON-alnum chars
				if ( ! is_alnum_a(s[i]) ) {
					continue;
				}

				// update
				oldScript = ucScriptCommon;

				// otherwise, stop we got alnum
				break;
			}

			// if we are utf8 we stop on special props
			UChar32 c = utf8Decode ( s+i );

			// keep going while it is not a word char
			if ( ! ucIsWordChar ( c ) ) {
				continue;
			}

			// update first though
			oldScript = ucGetScript ( c );

			// then stop
			break;
		}
		m_words [ m_numWords ] = start;
		m_wordLens [ m_numWords ] = s+i - start;
		m_wordIds [ m_numWords ] = 0LL;
		m_nodes [ m_numWords ] = 0;

		if (m_tagIds) {
			m_tagIds[m_numWords] = 0;
		}

		m_numWords++;
		goto uptop;
	}

	// get an alnum word
	j = i;
 again:
	for ( ; s[i] ; i += getUtf8CharSize(s+i) ) {
		// simple ascii?
		if ( is_ascii(s[i]) ) {
			// accumulate alnum chars
			if ( is_alnum_a(s[i]) ) continue;
			// update
			oldScript = ucScriptCommon;
			// otherwise, stop we got punct
			break;
		}
		// get the code point of the utf8 char
		UChar32 c = utf8Decode ( s+i );
		// get props
		props = ucProperties ( c );
		// ignorable and extending chars stay inside the word
		if ( props & (UC_IGNORABLE|UC_EXTEND) ) continue;
		// stop? if UC_WORDCHAR is set, that means its an alnum
		if ( ! ( props & UC_WORDCHAR ) ) {
			// reset script between words
			oldScript = ucScriptCommon;
			break;
		}
		// save it
		saved = oldScript;
		// update here
		oldScript = ucGetScript(c);
		// treat ucScriptLatin (30) as common so we can have latin1
		// like char without breaking the word!
		if ( oldScript == ucScriptLatin ) oldScript = ucScriptCommon;
		// ideograph, hiragana and thai chars each end the word
		if ( props & ( UC_IDEOGRAPH | UC_HIRAGANA | UC_THAI ) ) {
			// include it
			i += getUtf8CharSize(s+i);
			// but stop
			break;
		}
		// script change?
		if ( saved != oldScript ) break;
	}

	// . java++, A++, C++ exception
	// . A+, C+, exception
	// . TODO: consider putting in Bits.cpp w/ D_CAN_BE_IN_PHRASE
	if ( s[i]=='+' ) {
		if ( s[i+1]=='+' && !is_alnum_utf8(&s[i+2]) ) i += 2;
		else if ( !is_alnum_utf8(&s[i+1]) ) i++;
	}
	// . c#, j#, ...
	if ( s[i]=='#' && !is_alnum_utf8(&s[i+1]) ) i++;

	// comma is ok if like ,ddd!d
	if ( s[i]==',' && i-j <= 3 && is_digit(s[i-1]) ) {
		// if word so far is 2 or 3 chars, make sure digits
		if ( i-j >= 2 && ! is_digit(s[i-2]) ) goto nogo;
		if ( i-j >= 3 && ! is_digit(s[i-3]) ) goto nogo;
		// scan forward over ",ddd" groups not followed by another digit
		while ( s[i] == ',' && is_digit(s[i+1]) && is_digit(s[i+2]) && is_digit(s[i+3]) && ! is_digit(s[i+4]) ) {
			i += 4;
		}
	}

	// decimal point?
	if ( s[i] == '.'
	     && is_digit(s[i-1]) && is_digit(s[i+1]) ) {
		// allow the decimal point
		i++;
		// skip over string of digits
		while ( is_digit(s[i]) ) i++;
	}

 nogo:

	// allow for words like we're dave's and i'm
	if ( s[i] == '\'' && s[i + 1] && is_alnum_utf8( &s[i + 1] ) && !hadApostrophe ) {
		i++;
		hadApostrophe = true;
		goto again;
	}
	hadApostrophe = false;

	// get word length
	wlen = i - j;
	if ( m_numWords >= m_preCount ) goto done;
	m_words [ m_numWords ] = &s[j];
	m_wordLens[ m_numWords ] = wlen;

	if ( computeWordIds ) {
		int64_t h = hash64Lower_utf8(&s[j],wlen);
		m_wordIds [m_numWords] = h;
	}

	m_nodes[m_numWords] = 0;
	if (m_tagIds) m_tagIds[m_numWords] = 0;
	m_numWords++;
	m_numAlnumWords++;
	// get a punct word
	goto uptop;

 done:
	// bad programming warning: more words were stored than m_preCount
	if ( m_numWords > m_preCount ) {
		log(LOG_LOGIC, "build: words: set: Fix counting routine.");
		gbshutdownLogicError();
	}

	return true;
}
int filter_cpp_buffer(char* pBuffer, int nStrLen){ char* pStr = NULL; char* pStr2 = NULL; int nLength = 0; int nCount = 0; int bError = 0; //if the line is a cpp remark, not process if( is_cpp_remark(pBuffer[0], pBuffer[1]) ) return 1; if( is_c_remark(pBuffer[0], pBuffer[1]) ){ if( strstr(pBuffer, "*/") == NULL ) g_nFinishRemark = 0; return 1; } if( g_nFinishRemark == 0 ){ if( strstr(pBuffer, "*/") ) g_nFinishRemark = 1; return 1; } pStr = pBuffer; while( (*(pStr) != '\0') && (*(pStr+1) != '\0') ){ if( is_cpp_remark(*pStr, *(pStr+1)) || is_c_remark(*pStr, *(pStr+1))) break; if ( !is_ascii((unsigned char)(*pStr)) ){ pStr2 = strchr(pStr, '"'); if ( pStr2 == NULL ) bError = 1; else if ((nCount%2) == 0){ if ( *(pStr2-1) != '\\' ) bError = 1; } if ( bError == 1 ){ nStrLen = (int)strlen(pBuffer); nLength = nStrLen - g_nCurPos; if ( nLength > 0 ) string_process(&pBuffer[g_nCurPos], nLength, 1); break; } else{ nLength = (int)(pStr2 - pBuffer) - g_nCurPos; nLength = string_process(&pBuffer[g_nCurPos], nLength, 0); nCount ++; pStr = pBuffer + g_nCurPos + nLength; g_nCurPos = (int)(pStr - pBuffer); } continue; } else if( is_double_quotation(*pStr) ){ nCount ++; if( nCount % 2 ) g_nCurPos = (int)(pStr - pBuffer) + 1; } pStr++; } return 1; }
/*
 * Filters one line of Lua source.
 *
 * Remark lines are skipped. Otherwise the line is scanned up to the first
 * remark marker while counting unescaped single and double quotes. When a
 * non-ASCII byte is found inside a quoted string, the string (from just
 * after its opening quote, tracked in g_nCurPos, to its closing quote) is
 * handed to string_process() with a final argument of 0. A non-ASCII byte
 * outside a string, or a string with no closing quote, makes the rest of
 * the line go to string_process() with a final argument of 1.
 *
 * The value passed in nStrLen is not read. Always returns 1.
 */
int filter_lua_buffer(char* pBuffer, int nStrLen){
    char* pStr = NULL;
    char* pStr2 = NULL;
    char* pOld = NULL;
    int nLength = 0;
    int nDoubleCount = 0;   /* unescaped '"' seen so far */
    int nSingleCount = 0;   /* unescaped '\'' seen so far */
    int nTailLen = 0;
    int bError = 0;
    int nBiasNum = 0;       /* not used in this function */
    char cQuotation, cOld;  /* cOld is not used in this function */

    g_nCurPos = 0;

    /* if the line is a remark, not process */
    if( is_lua_remark(pBuffer[0], pBuffer[1]) )
        return 1;

    pStr = pBuffer;
    while( (*(pStr) != '\0') && (*(pStr+1) != '\0') ){
        /* stop when encounter lua remark */
        if ( is_lua_remark(*pStr, *(pStr+1)) )
            break;

        /* on a non-ASCII byte, draw out the entire string from its start
           after the opening quote to its end before the closing quote */
        if ( !is_ascii((unsigned char)(*pStr)) ){
            /* equal parities mean we are not inside exactly one kind of
               string; otherwise the odd counter names the open quote */
            if ( (nSingleCount % 2) == (nDoubleCount % 2) )
                bError = 1;
            else if ( nDoubleCount % 2 )
                cQuotation = '"';
            else if ( nSingleCount % 2 )
                cQuotation = '\'';

            if ( bError != 1 ){
                /* find the closing quote, stepping over escaped ones */
                pStr2 = strchr(++pStr, cQuotation);
                while ( pStr2 != NULL && is_escape_quotation( pStr2, cQuotation ) > 0 ){
                    pStr2 = strchr(++pStr2, cQuotation);
                }
            }
            if ( pStr2 == NULL )
                bError = 1;

            if ( bError == 1 ){
                nStrLen = (int)strlen(pBuffer);
                nLength = nStrLen - g_nCurPos;
                if ( nLength > 0 )
                    string_process(&pBuffer[g_nCurPos], nLength, 1);
                break;
            }
            else{
                nLength = (int)(pStr2 - pBuffer) - g_nCurPos;
                /* the closing quote is consumed here, so count it */
                if ( cQuotation == '"' )
                    nDoubleCount ++;
                else if ( cQuotation == '\'' )
                    nSingleCount ++;

                /* the tail reported by get_tail_func_name_len() is kept out
                   of the processed text */
                nTailLen = get_tail_func_name_len( &pBuffer[g_nCurPos], nLength);
                nLength = string_process(&pBuffer[g_nCurPos], nLength-nTailLen, 0);

                if ( nTailLen >= 2 ){
                    /* walk the tail looking for nested quoted strings */
                    pStr = pBuffer + g_nCurPos + nLength;
                    pOld = pStr2;
                    while ( ++pStr < pOld ){
                        if ( ( is_escape_quotation( pStr, cQuotation ) == 1 ) || ( ( is_single_quotation(*pStr) || is_double_quotation(*pStr) ) && *pStr != cQuotation ) ){
                            /* find the quote that closes the nested string */
                            pStr2 = strchr( (pStr + 1), *pStr );
                            while ( pStr2 != NULL ){
                                if ( *pStr2 != cQuotation )
                                    break;
                                else if ( is_escape_quotation( pStr2, cQuotation ) == 1 )
                                    break;
                                pStr2 = strchr( ++pStr2, *pStr );
                            }
                            if ( pStr2 == NULL ){
                                nStrLen = (int)strlen(pBuffer);
                                nLength = nStrLen - g_nCurPos;
                                if ( nLength > 0 )
                                    string_process(&pBuffer[g_nCurPos], nLength, 1);
                                return 1;
                            }
                            /* terminate temporarily so only the nested
                               string is examined */
                            *pStr2 = '\0';
                            /* NOTE(review): the closing quote is restored
                               only when the nested string has GBK code;
                               otherwise the buffer stays cut here - confirm
                               this is intended */
                            if ( is_there_gbk_code(pStr) ){
                                *pStr2 = *pStr;
                                string_process(&pBuffer[(int)(pStr - pBuffer) + 1], ((int)(pStr2 - pStr) - 1), 0);
                            }
                            pStr = pStr2;
                        }
                    }
                }
                nLength += nTailLen;
                pStr = pBuffer + g_nCurPos + nLength;
                g_nCurPos = (int)(pStr - pBuffer);
            }
            continue;
        }
        /* Only text that starts after a quote can be a candidate string,
           so record the position after each opening quote as the possible
           start. NOTE(review): these branches do not test *pStr themselves;
           they presumably rely on is_escape_quotation() returning 0 only
           for an unescaped quote character - confirm. */
        else if( (nSingleCount % 2) == 0 && is_escape_quotation( pStr, '"' ) == 0 ){
            if( (++nDoubleCount) % 2 )
                g_nCurPos = (int)(pStr - pBuffer) + 1;
        }
        else if( (nDoubleCount % 2) == 0 && is_escape_quotation( pStr, '\'' ) == 0 ){
            if( (++nSingleCount) % 2 )
                g_nCurPos = (int)(pStr - pBuffer) + 1;
        }
        pStr++;
    }
    return 1;
}
int print_output(char **output, int i, char **argv, int argc) { //stat files struct stat buf; int exists; int x; for(x = 0; x < i; x++) { exists = stat(output[x], &buf); if (exists < 0) { fprintf(stderr, "%s not found\n", output[x]); } else { ////////////get the argument int arge = find_arg_element(argv, argc); char* ls_arg; ls_arg = (char*)malloc(MAX_BUFFER_SIZE); if(arge != -1) ls_arg = strdup(argv[arge]); else ls_arg = ""; /////////////filter arguments // l option char *l_opt; l_opt = (char*)malloc(MAX_BUFFER_SIZE); if(str_index(ls_arg, "l") != -1) { //convert stat's date time_t rawtime = buf.st_mtime; char *date = ctime(&rawtime); //get rid of end newline date[strlen(date) - 1] = '\0'; int read, write, execute; read = buf.st_mode & S_IEXEC; read = (int)read; //read = sqrt(read) - 1; sprintf(l_opt, "%4d %d %d %4d %5d %s", buf.st_mode, buf.st_nlink, buf.st_uid, buf.st_gid, buf.st_size, date); } else {l_opt = "";} // t option char *t_opt; t_opt = (char*)malloc(MAX_BUFFER_SIZE); if(str_index(ls_arg, "t") != -1) { //convert stat's date time_t rawtime = buf.st_mtime; char *date = ctime(&rawtime); //get rid of end newline date[strlen(date) - 1] = '\0'; sprintf(t_opt, "%s", date); } else {t_opt = "";} // f option char *f_opt; f_opt = (char*)malloc(MAX_BUFFER_SIZE); if(str_index(ls_arg, "f") != -1) { //open file and see what its first bits look like FILE *fp; fp = fopen(output[x], "r"); char line [MAX_BUFFER_SIZE]; if(fp != NULL) { if(fgets(line, sizeof(line), fp ) != NULL) { //fputs ( line, stdout ); //print_ts_str(line); if(is_o(output[x]) == 1) f_opt = " - Relocatable .o file"; else if(is_elf(line) == 1) f_opt = " - ELF File"; else if(is_dos(fp) == 1) f_opt = " - ASCII File"; else if(is_ascii(fp) == 1) f_opt = " - ASCII File"; else f_opt = " - file unknow"; } else f_opt = " - directory"; } fclose(fp); } else {f_opt = "";} //print formatted text printf("%s %s %10s %s\n", t_opt, l_opt, output[x], f_opt); } } return(0); }
// Constructs an sstring from the C string `from` by passing it to encode().
// Builds that define DEBUG first assert that is_ascii(from) holds.
sstring::sstring(const char *from)
{
#ifdef DEBUG
	assert(is_ascii(from));
#endif
	encode(from);
}
// Constructs an sstring from `from` and an explicit `length`, both of which
// are passed to encode(). Builds that define DEBUG first assert that
// is_ascii(from) holds.
// NOTE(review): the assertion is not given `length`, so it presumably
// inspects `from` up to a terminator rather than exactly `length` bytes -
// confirm that callers always pass terminated input.
sstring::sstring(const char *from, size_t length)
{
#ifdef DEBUG
	assert(is_ascii(from));
#endif
	encode(from, length);
}
// . splits the text at "s" into alternating "words" and appends them to the
//   parallel arrays m_words/m_wordLens/m_wordIds (and m_tagIds)
// . three kinds of words are produced:
//   - tag words: a whole tag, when m_hasTags is set and isTagStart() agrees
//   - punct words: runs of non-alnum chars, word id 0
//   - alnum words: runs of alnum chars, hashed into m_wordIds only when
//     "computeWordIds" is true
// . scanning stops at the first \0, once "i" reaches "nodeLen", or when
//   m_numWords has reached m_preCount
// . "niceness" is only handed to QUICKPOLL() inside the scan loops
// . m_totalLen is recomputed before returning; every path returns true
bool Words::addWords(char *s,long nodeLen,bool computeWordIds, long niceness) {
	// i is the scan offset into s, j the start of the current alnum word
	long i = 0;
	long j;
	long wlen;
	// never incremented in this function, so the log at the end cannot fire
	long badCount = 0;

	// only one apostrophe is accepted inside an alnum word
	bool hadApostrophe = false;

	// script of the most recent word char, used to split on script change
	UCScript oldScript = ucScriptCommon;
	UCScript saved;
	UCProps props;

 uptop:

	// bad utf8 can cause a breach
	if ( i >= nodeLen ) goto done;

	if ( ! s[i] ) goto done;

	if ( ! is_alnum_utf8(s+i) ) {

		if ( m_numWords >= m_preCount ) goto done;

		// tag?
		if ( s[i]=='<' && m_hasTags && isTagStart(s+i) ) {
			// get the tag id
			// NOTE(review): m_tagIds is written without the NULL
			// check used further down - presumably m_hasTags
			// implies it is allocated; confirm
			if ( s[i+1]=='/' ) {
				// skip over / and flag the id as an end tag
				m_tagIds [m_numWords] = ::getTagId(s+i+2);
				m_tagIds [m_numWords] |= BACKBIT;
			}
			else
				m_tagIds [m_numWords] = ::getTagId(s+i+1);
			// word start
			m_words [m_numWords] = s + i;
			m_wordIds [m_numWords] = 0LL;
			// skip till end
			long tagLen = getTagLen(s+i);
			m_wordLens [m_numWords] = tagLen;
			m_numWords++;
			// advance
			i += tagLen;
			goto uptop;
		}

		// it is a punct word, find end of it
		char *start = s+i;
		for ( ; s[i] ; i += getUtf8CharSize(s+i)){
			// stop on < if we got tags
			if ( s[i] == '<' && m_hasTags ) break;
			// breathe
			QUICKPOLL(niceness);
			// if we are simple ascii, skip quickly
			if ( is_ascii(s[i]) ) {
				// accumulate NON-alnum chars
				if ( ! is_alnum_a(s[i]) ) continue;
				// update
				oldScript = ucScriptCommon;
				// otherwise, stop we got alnum
				break;
			}
			// if we are utf8 we stop on special props
			UChar32 c = utf8Decode ( s+i );
			// keep going while it is not a word char
			if ( ! ucIsWordChar ( c ) ) continue;
			// update first though
			oldScript = ucGetScript ( c );
			// then stop
			break;
		}
		m_words [ m_numWords ] = start;
		m_wordLens [ m_numWords ] = s+i - start;
		m_wordIds [ m_numWords ] = 0LL;
		if (m_tagIds) m_tagIds[m_numWords] = 0;
		m_numWords++;
		goto uptop;
	}

	// get an alnum word
	j = i;
 again:
	for ( ; s[i] ; i += getUtf8CharSize(s+i) ) {
		// breathe
		QUICKPOLL(niceness);
		// simple ascii?
		if ( is_ascii(s[i]) ) {
			// accumulate alnum chars
			if ( is_alnum_a(s[i]) ) continue;
			// update
			oldScript = ucScriptCommon;
			// otherwise, stop we got punct
			break;
		}
		// get the code point of the utf8 char
		UChar32 c = utf8Decode ( s+i );
		// get props
		props = ucProperties ( c );
		// ignorable and extending chars stay inside the word
		if ( props & (UC_IGNORABLE|UC_EXTEND) ) continue;
		// stop? if UC_WORDCHAR is set, that means its an alnum
		if ( ! ( props & UC_WORDCHAR ) ) {
			// reset script between words
			oldScript = ucScriptCommon;
			break;
		}
		// save it
		saved = oldScript;
		// update here
		oldScript = ucGetScript(c);
		// treat ucScriptLatin (30) as common so we can have latin1
		// like char without breaking the word!
		if ( oldScript == ucScriptLatin ) oldScript = ucScriptCommon;
		// ideograph, hiragana and thai chars each end the word
		if ( props & ( UC_IDEOGRAPH | UC_HIRAGANA | UC_THAI ) ) {
			// include it
			i += getUtf8CharSize(s+i);
			// but stop
			break;
		}
		// script change?
		if ( saved != oldScript ) break;
	}

	// . java++, A++, C++ exception
	// . A+, C+, exception
	// . TODO: consider putting in Bits.cpp w/ D_CAN_BE_IN_PHRASE
	if ( s[i]=='+' ) {
		if ( s[i+1]=='+' && !is_alnum_utf8(&s[i+2]) ) i += 2;
		else if ( !is_alnum_utf8(&s[i+1]) ) i++;
	}
	// . c#, j#, ...
	if ( s[i]=='#' && !is_alnum_utf8(&s[i+1]) ) i++;

	// allow for words like we're dave's and i'm
	if(s[i]=='\''&&s[i+1]&&is_alnum_utf8(&s[i+1])&&!hadApostrophe){
		i++;
		hadApostrophe = true;
		goto again;
	}
	hadApostrophe = false;

	// get word length
	wlen = i - j;
	if ( m_numWords >= m_preCount ) goto done;
	m_words [ m_numWords ] = &s[j];
	m_wordLens[ m_numWords ] = wlen;

	// . Lars says it's better to leave the accented chars intact
	// . google agrees
	// . but what about "re'sume"?
	if ( computeWordIds ) {
		long long h = hash64Lower_utf8(&s[j],wlen);
		m_wordIds [m_numWords] = h;
		// until we get an accent removal algo, accent-stripped
		// word ids are not computed here; possibly use the query
		// synonym pipeline to search without accents. MDW
	}
	if (m_tagIds) m_tagIds[m_numWords] = 0;
	m_numWords++;
	m_numAlnumWords++;
	// get a punct word
	goto uptop;

 done:
	// bad programming warning: more words were stored than m_preCount,
	// so crash on purpose with a NULL write
	if ( m_numWords > m_preCount ) {
		log(LOG_LOGIC, "build: words: set: Fix counting routine.");
		char *xx = NULL; *xx = 0;
	}
	// compute total length: from s to the end of the last stored word
	if ( m_numWords <= 0 ) m_totalLen = 0;
	else m_totalLen = m_words[m_numWords-1] - s + m_wordLens[m_numWords-1];

	if ( badCount )
		log("words: had %li bad utf8 chars",badCount);

	return true;
}
// Writes the n bytes at s to fp as a PostScript string and returns *this.
//
// Two spellings are available: a "(...)" literal, in which '(' ')' '\\' are
// backslash-escaped and bytes that are not printable ASCII become \ooo, and
// a "<...>" hex string with two digits per byte. The hex form is chosen
// when the literal body would be longer than 2*n characters. Lines are
// broken so that col stays within max_line_length; need_space is cleared.
ps_output &ps_output::put_string(const char *s, int n)
{
  int k;

  // Pass 1: length of the literal body.
  int literal_cost = 0;
  for (k = 0; k < n; k++) {
    const char ch = s[k];
    if (!(is_ascii(ch) && csprint(ch)))
      literal_cost += 4;
    else if (ch == '(' || ch == ')' || ch == '\\')
      literal_cost += 2;
    else
      literal_cost += 1;
  }

  const int hex_cost = n*2;

  if (literal_cost > hex_cost) {
    // Hex form. Start a fresh line first if the whole string fits on one
    // line but not on the rest of this one.
    if (col + hex_cost + 2 > max_line_length
	&& hex_cost + 2 <= max_line_length) {
      putc('\n', fp);
      col = 0;
    }
    if (col + 1 > max_line_length) {
      putc('\n', fp);
      col = 0;
    }
    putc('<', fp);
    col++;
    for (k = 0; k < n; k++) {
      if (col + 2 > max_line_length) {
	putc('\n', fp);
	col = 0;
      }
      fprintf(fp, "%02x", s[k] & 0377);
      col += 2;
    }
    putc('>', fp);
    col++;
    need_space = 0;
    return *this;
  }

  // Literal form, with the same line-start rule as above.
  if (col + literal_cost + 2 > max_line_length
      && literal_cost + 2 <= max_line_length) {
    putc('\n', fp);
    col = 0;
  }
  if (col + 2 > max_line_length) {
    putc('\n', fp);
    col = 0;
  }
  putc('(', fp);
  col++;
  for (k = 0; k < n; k++) {
    const char ch = s[k];

    // width of this byte once written
    int width;
    if (!(is_ascii(ch) && csprint(ch)))
      width = 4;
    else if (ch == '(' || ch == ')' || ch == '\\')
      width = 2;
    else
      width = 1;

    // continue the literal on a new line with backslash-newline
    if (col + width + 1 > max_line_length) {
      putc('\\', fp);
      putc('\n', fp);
      col = 0;
    }

    if (width == 1) {
      putc(ch, fp);
    }
    else if (width == 2) {
      putc('\\', fp);
      putc(ch, fp);
    }
    else {
      fprintf(fp, "\\%03o", ch & 0377);
    }
    col += width;
  }
  putc(')', fp);
  col++;

  need_space = 0;
  return *this;
}
/*
 * Returns the result of is_ascii() applied to input_buffer and len, both of
 * which are defined outside this function.
 */
static int input_buffer_is_ascii()
{
    return is_ascii(input_buffer, len);
}