void handleDerivedNormalizationProps(u_int32_t line, char **col, u_int32_t colCount) { //printf("Line %"INT32": ", line); //for (u_int32_t i=0;i<colCount;i++) // printf("'%s' ", col[i]); //printf("\n"); char *range = NULL; UChar32 codePointStart = strtol(col[0], &range, 16); UChar32 codePointEnd = codePointStart; if (range && range[0] == '.' && range[1] == '.') codePointEnd = strtol(range+2, NULL, 16); for (UChar32 c = codePointStart ; c <= codePointEnd ; c++) { //printf("U+%04x ", c); // get current props, if any UCProps props = ucProperties(c); if (!strncmp(col[1], "NFKC_QC", 7)) props |= UC_NFKC_QC_NO; else if (!strncmp(col[1], "Full_Composition_Exclusion", 26)){ g_excludeCount++; props |= UC_COMP_EX; //printf("Excluding %4x props: %04x\n", c, props); } if (props) g_ucProps.setValue(c, &props); } //printf("\n"); }
void handleScripts(u_int32_t, char **col, u_int32_t colCount){ char *range = NULL; UChar32 codePointStart = strtol(col[0], &range, 16); UChar32 codePointEnd = codePointStart; if (range && range[0] == '.' && range[1] == '.') codePointEnd = strtol(range+2, NULL, 16); for (UChar32 c = codePointStart ; c <= codePointEnd ; c++) { UCProps props = ucProperties(c); //void *p = g_ucProps.getValue(c); //if (p) props = *(u_char*)p; UCScript s = ucScriptCommon; for (int j=0; j < ucScriptNumScripts; j++) { if (!strcmp(col[1], g_ucScriptNames[j])){ s = j; g_ucScripts.setValue(c, &j); } } if (s == ucScriptThai) props |= UC_THAI; else if (s == ucScriptHiragana) props |= UC_HIRAGANA; else if (s == ucScriptKatakana) props |= UC_KATAKANA; else if (s == ucScriptKatakana_Or_Hiragana) props |= UC_KATAKANA|UC_HIRAGANA; if (props) g_ucProps.setValue(c, &props); } }
void handleDerivedCoreProps(u_int32_t line, char **col, u_int32_t colCount) { //printf("Line %"INT32": ", line); //for (u_int32_t i=0;i<colCount;i++) // printf("'%s' ", col[i]); //printf("\n"); char *range = NULL; UChar32 codePointStart = strtol(col[0], &range, 16); UChar32 codePointEnd = codePointStart; if (range && range[0] == '.' && range[1] == '.') codePointEnd = strtol(range+2, NULL, 16); for (UChar32 c = codePointStart ; c <= codePointEnd ; c++) { //printf("U+%04x ", c); // get current props, if any UCProps props = ucProperties(c); if (!strncmp(col[1], "Alphabetic", 10)) props |= UC_ALPHA | UC_WORDCHAR; else if (!strncmp(col[1], "Default_Ignorable_Code_Point", 28)) props |= UC_IGNORABLE; else if (!strncmp(col[1], "Lowercase", 9)) props |= UC_LOWER | UC_WORDCHAR; else if (!strncmp(col[1], "Uppercase", 9)) props |= UC_UPPER | UC_WORDCHAR; else if (!strncmp(col[1], "Grapheme_Extend", 15)) props |= UC_WORDCHAR; if (props) g_ucProps.setValue(c, &props); // if (c == ' ' && (props&UC_WORDCHAR)) // printf("Yow: line %"INT32"\n", line); // if (c == 0 && props) // printf("!!!\nHey: line %"INT32"!!!\n\n", line); } //printf("\n"); }
void handlePropList(u_int32_t line, char **col, u_int32_t colCount) { //printf("Line %"INT32": ", line); //for (u_int32_t i=0;i<colCount;i++) // printf("'%s' ", col[i]); //printf("\n"); char *range = NULL; UChar32 codePointStart = strtol(col[0], &range, 16); UChar32 codePointEnd = codePointStart; if (range && range[0] == '.' && range[1] == '.') codePointEnd = strtol(range+2, NULL, 16); for (UChar32 c = codePointStart ; c <= codePointEnd ; c++) { //printf("U+%04x ", c); // get current props, if any UCProps props = ucProperties(c); //void *p = g_ucProps.getValue(c); //if (p) props = *(u_char*)p; if (!strncmp(col[1], "Ideographic", 11)) props |= UC_IDEOGRAPH | UC_WORDCHAR; else if (!strncmp(col[1], "Unified_Ideograph", 17)) props |= UC_IDEOGRAPH | UC_WORDCHAR; else if (!strncmp(col[1], "White_Space", 11)) props |= UC_WHITESPACE; if (props) g_ucProps.setValue(c, &props); } //printf("\n"); }
void handleUnicodeData(u_int32_t line, char **col, u_int32_t colCount) { UChar32 codePoint = strtol(col[0], NULL, 16); // if ((colCount < 14) || (codePoint == 0)){ // printf("line %"INT32": no data (%"INT32" cols)\n", line, colCount); // return; // } char *name = col[1]; char *category = col[2]; u_char combiningClass = strtol(col[3], NULL, 10); char *decompStr = col[5]; UChar32 ucMapping = strtol(col[12],NULL, 16); UChar32 lcMapping = strtol(col[13],NULL, 16); // Set general category //g_ucCategory.setValue(codePoint, (void*)category); UCProps props = ucProperties(codePoint); if (category[0] == 'L') props |= UC_ALPHA | UC_WORDCHAR; else if (category[0] == 'N') props |= UC_DIGIT | UC_WORDCHAR; else if (category[0] == 'Z') props |= UC_WHITESPACE; if (props) g_ucProps.setValue(codePoint, &props); if (lcMapping) g_ucLowerMap.setValue(codePoint, (void*)&lcMapping); if (ucMapping) g_ucUpperMap.setValue(codePoint, (void*)&ucMapping); if (combiningClass) g_ucCombiningClass.setValue(codePoint, (void*)&combiningClass); if (decompStr && decompStr[0]){ u_char decompCount = 0; UChar32 decomp[32]; bool kompat = false; // Get decomposition char *p = decompStr; int decompLen = gbstrlen(decompStr); while (p < decompStr+decompLen) { char *pend = p; while (*pend && *pend != ' ') pend++; *pend = '\0'; if (p[0] == '<') kompat = true; else{ decomp[decompCount++] = strtol(p, NULL, 16); } p = pend+1; } // printf ("Code Point U+%04"XINT32", %s: %s (%d chars)\n", // codePoint, name, kompat?"(Kompatable)":"", decompCount); // g_decompCount++; // if (decompStr[0] != '<') bool fullComp=false; if (!kompat && !(props & UC_COMP_EX)) { // set up canonical combining table g_canonicalDecompCount++; // printf("%4x:", codePoint); // for (int i = 0; i<decompCount;i++) // printf(" %4x", decomp[i]); // printf("\n"); fullComp = true; } setKDValue(codePoint, decomp, decompCount, fullComp); // JAB: we now have Kompatible and Canonical decompositions if (!kompat) setCDValue(codePoint, decomp, decompCount); } }
bool Words::addWords(char *s,long nodeLen,bool computeWordIds, long niceness) { long i = 0; long j; //long k = 0; long wlen; //unsigned long e; //long skip; long badCount = 0; bool hadApostrophe = false; UCScript oldScript = ucScriptCommon; UCScript saved; UCProps props; uptop: // bad utf8 can cause a breach if ( i >= nodeLen ) goto done; if ( ! s[i] ) goto done; if ( ! is_alnum_utf8(s+i) ) { // && m_numWords < m_preCount ) { if ( m_numWords >= m_preCount ) goto done; // tag? if ( s[i]=='<' && m_hasTags && isTagStart(s+i) ) { // get the tag id if ( s[i+1]=='/' ) { // skip over / m_tagIds [m_numWords] = ::getTagId(s+i+2); m_tagIds [m_numWords] |= BACKBIT; } else m_tagIds [m_numWords] = ::getTagId(s+i+1); // word start m_words [m_numWords] = s + i; m_wordIds [m_numWords] = 0LL; // skip till end long tagLen = getTagLen(s+i); // ,niceness); m_wordLens [m_numWords] = tagLen; m_numWords++; // advance i += tagLen; goto uptop; } // it is a punct word, find end of it char *start = s+i; //for (;s[i] && ! is_alnum_utf8(s+i);i+=getUtf8CharSize(s+i)); for ( ; s[i] ; i += getUtf8CharSize(s+i)){ // stop on < if we got tags if ( s[i] == '<' && m_hasTags ) break; // breathe QUICKPOLL(niceness); // if we are simple ascii, skip quickly if ( is_ascii(s[i]) ) { // accumulate NON-alnum chars if ( ! is_alnum_a(s[i]) ) continue; // update oldScript = ucScriptCommon; // otherwise, stop we got alnum break; } // if we are utf8 we stop on special props UChar32 c = utf8Decode ( s+i ); // stop if word char if ( ! ucIsWordChar ( c ) ) continue; // update first though oldScript = ucGetScript ( c ); // then stop break; } m_words [ m_numWords ] = start; m_wordLens [ m_numWords ] = s+i - start; m_wordIds [ m_numWords ] = 0LL; if (m_tagIds) m_tagIds[m_numWords] = 0; m_numWords++; goto uptop; } // get an alnum word j = i; again: //for ( ; is_alnum_utf8 (&s[i] ) ; i += getUtf8CharSize(s+i) ); for ( ; s[i] ; i += getUtf8CharSize(s+i) ) { // breathe QUICKPOLL(niceness); // simple ascii? if ( is_ascii(s[i]) ) { // accumulate alnum chars if ( is_alnum_a(s[i]) ) continue; // update oldScript = ucScriptCommon; // otherwise, stop we got punct break; } // get the code point of the utf8 char UChar32 c = utf8Decode ( s+i ); // get props props = ucProperties ( c ); // good stuff? if ( props & (UC_IGNORABLE|UC_EXTEND) ) continue; // stop? if UC_WORCHAR is set, that means its an alnum if ( ! ( props & UC_WORDCHAR ) ) { // reset script between words oldScript = ucScriptCommon; break; } // save it saved = oldScript; // update here oldScript = ucGetScript(c); // treat ucScriptLatin (30) as common so we can have latin1 // like char without breaking the word! if ( oldScript == ucScriptLatin ) oldScript = ucScriptCommon; // stop on this crap too i guess. like japanes chars? if ( props & ( UC_IDEOGRAPH | UC_HIRAGANA | UC_THAI ) ) { // include it i += getUtf8CharSize(s+i); // but stop break; } // script change? if ( saved != oldScript ) break; } // . java++, A++, C++ exception // . A+, C+, exception // . TODO: consider putting in Bits.cpp w/ D_CAN_BE_IN_PHRASE if ( s[i]=='+' ) { if ( s[i+1]=='+' && !is_alnum_utf8(&s[i+2]) ) i += 2; else if ( !is_alnum_utf8(&s[i+1]) ) i++; } // . c#, j#, ... if ( s[i]=='#' && !is_alnum_utf8(&s[i+1]) ) i++; // allow for words like we're dave's and i'm if(s[i]=='\''&&s[i+1]&&is_alnum_utf8(&s[i+1])&&!hadApostrophe){ i++; hadApostrophe = true; goto again; } hadApostrophe = false; // get word length wlen = i - j; if ( m_numWords >= m_preCount ) goto done; m_words [ m_numWords ] = &s[j]; m_wordLens[ m_numWords ] = wlen; // . Lars says it's better to leave the accented chars intact // . google agrees // . but what about "re'sume"? if ( computeWordIds ) { long long h = hash64Lower_utf8(&s[j],wlen); m_wordIds [m_numWords] = h; // until we get an accent removal algo, comment this // out and possibly use the query synonym pipeline // to search without accents. MDW //long long h2 = hash64AsciiLowerE(&s[j],wlen); //if ( h2 != h ) m_stripWordIds [m_numWords] = h2; //else m_stripWordIds [m_numWords] = 0LL; //m_stripWordIds[m_numWords] = 0; } if (m_tagIds) m_tagIds[m_numWords] = 0; m_numWords++; m_numAlnumWords++; // break on \0 or MAX_WORDS //if ( ! s[i] ) goto done; // get a punct word goto uptop; /* j = i; // delineate the "punctuation" word for ( ; s[i] && !is_alnum_utf8(&s[i]);i+=getUtf8CharSize(s+i)); // bad utf8 could cause us to breach the node, so watch out! if ( i > nodeLen ) { badCount++; i = nodeLen; } // get word length wlen = i - j; if ( m_numWords >= m_preCount ) goto done; m_words [m_numWords ] = &s[j]; m_wordLens [m_numWords ] = wlen; m_wordIds [m_numWords ] = 0LL; if (m_tagIds) m_tagIds[m_numWords] = 0; m_numWords++; */ done: // bad programming warning if ( m_numWords > m_preCount ) { log(LOG_LOGIC, "build: words: set: Fix counting routine."); char *xx = NULL; *xx = 0; } // compute total length if ( m_numWords <= 0 ) m_totalLen = 0; else m_totalLen = m_words[m_numWords-1] - s + m_wordLens[m_numWords-1]; if ( badCount ) log("words: had %li bad utf8 chars",badCount); return true; }
bool Words::addWords( char *s, int32_t nodeLen, bool computeWordIds ) { int32_t i = 0; int32_t j; int32_t wlen; bool hadApostrophe = false; UCScript oldScript = ucScriptCommon; UCScript saved; UCProps props; uptop: // bad utf8 can cause a breach if ( i >= nodeLen ) { goto done; } if ( ! s[i] ) { goto done; } if ( !is_alnum_utf8( s + i ) ) { if ( m_numWords >= m_preCount ) { goto done; } // tag? if ( s[i]=='<' && m_hasTags && isTagStart(s+i) ) { // get the tag id if( m_tagIds ) { if ( s[i + 1] == '/' ) { // skip over / m_tagIds[m_numWords] = ::getTagId( s + i + 2 ); m_tagIds[m_numWords] |= BACKBIT; } else { m_tagIds[m_numWords] = ::getTagId( s + i + 1 ); } } m_words[m_numWords] = s + i; m_wordIds[m_numWords] = 0LL; // skip till end int32_t tagLen = getTagLen( s + i ); m_wordLens[m_numWords] = tagLen; m_nodes[m_numWords] = 0; m_numWords++; // advance i += tagLen; goto uptop; } // it is a punct word, find end of it char *start = s+i; for ( ; s[i] ; i += getUtf8CharSize(s+i)) { // stop on < if we got tags if ( s[i] == '<' && m_hasTags ) { break; } // if we are simple ascii, skip quickly if ( is_ascii(s[i]) ) { // accumulate NON-alnum chars if ( ! is_alnum_a(s[i]) ) { continue; } // update oldScript = ucScriptCommon; // otherwise, stop we got alnum break; } // if we are utf8 we stop on special props UChar32 c = utf8Decode ( s+i ); // stop if word char if ( ! ucIsWordChar ( c ) ) { continue; } // update first though oldScript = ucGetScript ( c ); // then stop break; } m_words [ m_numWords ] = start; m_wordLens [ m_numWords ] = s+i - start; m_wordIds [ m_numWords ] = 0LL; m_nodes [ m_numWords ] = 0; if (m_tagIds) { m_tagIds[m_numWords] = 0; } m_numWords++; goto uptop; } // get an alnum word j = i; again: for ( ; s[i] ; i += getUtf8CharSize(s+i) ) { // simple ascii? if ( is_ascii(s[i]) ) { // accumulate alnum chars if ( is_alnum_a(s[i]) ) continue; // update oldScript = ucScriptCommon; // otherwise, stop we got punct break; } // get the code point of the utf8 char UChar32 c = utf8Decode ( s+i ); // get props props = ucProperties ( c ); // good stuff? if ( props & (UC_IGNORABLE|UC_EXTEND) ) continue; // stop? if UC_WORCHAR is set, that means its an alnum if ( ! ( props & UC_WORDCHAR ) ) { // reset script between words oldScript = ucScriptCommon; break; } // save it saved = oldScript; // update here oldScript = ucGetScript(c); // treat ucScriptLatin (30) as common so we can have latin1 // like char without breaking the word! if ( oldScript == ucScriptLatin ) oldScript = ucScriptCommon; // stop on this crap too i guess. like japanes chars? if ( props & ( UC_IDEOGRAPH | UC_HIRAGANA | UC_THAI ) ) { // include it i += getUtf8CharSize(s+i); // but stop break; } // script change? if ( saved != oldScript ) break; } // . java++, A++, C++ exception // . A+, C+, exception // . TODO: consider putting in Bits.cpp w/ D_CAN_BE_IN_PHRASE if ( s[i]=='+' ) { if ( s[i+1]=='+' && !is_alnum_utf8(&s[i+2]) ) i += 2; else if ( !is_alnum_utf8(&s[i+1]) ) i++; } // . c#, j#, ... if ( s[i]=='#' && !is_alnum_utf8(&s[i+1]) ) i++; // comma is ok if like ,ddd!d if ( s[i]==',' && i-j <= 3 && is_digit(s[i-1]) ) { // if word so far is 2 or 3 chars, make sure digits if ( i-j >= 2 && ! is_digit(s[i-2]) ) goto nogo; if ( i-j >= 3 && ! is_digit(s[i-3]) ) goto nogo; // scan forward while ( s[i] == ',' && is_digit(s[i+1]) && is_digit(s[i+2]) && is_digit(s[i+3]) && ! is_digit(s[i+4]) ) { i += 4; } } // decimal point? if ( s[i] == '.' && is_digit(s[i-1]) && is_digit(s[i+1]) ) { // allow the decimal point i++; // skip over string of digits while ( is_digit(s[i]) ) i++; } nogo: // allow for words like we're dave's and i'm if ( s[i] == '\'' && s[i + 1] && is_alnum_utf8( &s[i + 1] ) && !hadApostrophe ) { i++; hadApostrophe = true; goto again; } hadApostrophe = false; // get word length wlen = i - j; if ( m_numWords >= m_preCount ) goto done; m_words [ m_numWords ] = &s[j]; m_wordLens[ m_numWords ] = wlen; if ( computeWordIds ) { int64_t h = hash64Lower_utf8(&s[j],wlen); m_wordIds [m_numWords] = h; } m_nodes[m_numWords] = 0; if (m_tagIds) m_tagIds[m_numWords] = 0; m_numWords++; m_numAlnumWords++; // get a punct word goto uptop; done: // bad programming warning if ( m_numWords > m_preCount ) { log(LOG_LOGIC, "build: words: set: Fix counting routine."); gbshutdownLogicError(); } return true; }