/*
 * Emit the C source of a 256-entry ctype lookup table for one encoding.
 *
 * fp    - stream the generated array definition is written to.
 * einfo - encoding descriptor; einfo->name is spliced into the array name and
 *         einfo->num is handed to every Is*() classifier.
 *
 * For each code 0..255 the BIT_CTYPE_* flags of every classifier that accepts
 * the code are OR-ed together and printed as a 4-digit hex literal, NCOL
 * entries per output row.  Always returns 0.
 */
static int exec(FILE* fp, ENC_INFO* einfo)
{
#define NCOL  8

  int code, flags, enc;

  enc = einfo->num;

  fprintf(fp, "static const unsigned short Enc%s_CtypeTable[256] = {\n",
          einfo->name);

  for (code = 0; code < 256; code++) {
    /* Classifiers are queried in a fixed order; alpha and digit both imply alnum. */
    flags = 0;
    flags |= IsNewline(enc, code) ? BIT_CTYPE_NEWLINE : 0;
    flags |= IsAlpha  (enc, code) ? (BIT_CTYPE_ALPHA | BIT_CTYPE_ALNUM) : 0;
    flags |= IsBlank  (enc, code) ? BIT_CTYPE_BLANK : 0;
    flags |= IsCntrl  (enc, code) ? BIT_CTYPE_CNTRL : 0;
    flags |= IsDigit  (enc, code) ? (BIT_CTYPE_DIGIT | BIT_CTYPE_ALNUM) : 0;
    flags |= IsGraph  (enc, code) ? BIT_CTYPE_GRAPH : 0;
    flags |= IsLower  (enc, code) ? BIT_CTYPE_LOWER : 0;
    flags |= IsPrint  (enc, code) ? BIT_CTYPE_PRINT : 0;
    flags |= IsPunct  (enc, code) ? BIT_CTYPE_PUNCT : 0;
    flags |= IsSpace  (enc, code) ? BIT_CTYPE_SPACE : 0;
    flags |= IsUpper  (enc, code) ? BIT_CTYPE_UPPER : 0;
    flags |= IsXDigit (enc, code) ? BIT_CTYPE_XDIGIT : 0;
    flags |= IsWord   (enc, code) ? BIT_CTYPE_WORD : 0;
    flags |= IsAscii  (enc, code) ? BIT_CTYPE_ASCII : 0;

    /* Row prefix goes in front of the first entry of each row. */
    if (code % NCOL == 0) {
      fputs(" ", fp);
    }

    fprintf(fp, "0x%04x", flags);

    /* Every entry except the very last one is followed by a comma. */
    if (code < 255) {
      fputs(",", fp);
    }

    /* The last entry of a row ends the line; any other entry gets a space. */
    fputs((code != 0 && code % NCOL == (NCOL-1)) ? "\n" : " ", fp);
  }

  fprintf(fp, "};\n");
  return 0;
}
static void Method3 (int k) { int i, keep_num, discard_num, mem, num_trans, recalc_reqd; int keep_to_discard = 0; int discard_to_keep = 0; int recalcs = 0; double freqs_trans[2], total; u_long escape[2]; WordData **keep_heap, **discard_heap; freqs_trans[0] = freqs_trans[1] = 0; num_trans = 0; discard_num = Num[0] + Num[1]; discard_heap = Xmalloc (discard_num * sizeof (WordData *)); keep_num = 0; keep_heap = Xmalloc (discard_num * sizeof (WordData *)); for (i = 0; i < Num[0]; i++) discard_heap[i] = Words[0] + i; for (i = 0; i < Num[1]; i++) discard_heap[i + Num[0]] = Words[1] + i; heap_build (discard_heap, sizeof (*discard_heap), discard_num, DecFreqIncWL); mem = 0; while (k > mem && discard_num) { WordData *word = discard_heap[0]; mem += sizeof (u_char *) + word->word[0]; heap_deletehead (discard_heap, sizeof (word), &discard_num, DecFreqIncWL); keep_heap[keep_num++] = word; freqs_trans[KIND (word)] += word->freq; num_trans++; } CalcBitCost (discard_heap, discard_num, keep_heap, keep_num, freqs_trans, escape, num_trans); heap_build (discard_heap, sizeof (*discard_heap), discard_num, BigSaving); heap_build (keep_heap, sizeof (*keep_heap), keep_num, SmallSaving); total = 0; recalc_reqd = 0; while (keep_num && discard_num) { if ((keep_num && keep_heap[0]->num_trans < num_trans) || (discard_num && discard_heap[0]->num_trans < num_trans)) { if (keep_num && keep_heap[0]->num_trans < num_trans) { WordData *word = keep_heap[0]; int idx = KIND (word); float wbc; #ifdef DEBUG1 fprintf (stderr, "KEEP \"%s\" %.2f ->", word2str (word->word), word->saving); #endif wbc = -log2 (word->freq / (freqs_trans[idx] + escape[idx])) * word->freq; word->saving = (word->char_bit_cost - wbc) / (sizeof (char *) + word->word[0]); #ifdef DEBUG1 fprintf (stderr, " %.2f\n", word->saving); #endif word->num_trans = num_trans; heap_changedhead (keep_heap, sizeof (word), keep_num, SmallSaving); } if (discard_num && discard_heap[0]->num_trans < num_trans) { WordData *word = discard_heap[0]; int idx = 
KIND (word); float wbc; #ifdef DEBUG1 fprintf (stderr, "DISCARD \"%s\" %.2f ->", word2str (word->word), word->saving); #endif wbc = -log2 (word->freq / (freqs_trans[idx] + escape[idx])) * word->freq; word->saving = (word->char_bit_cost - wbc) / (sizeof (char *) + word->word[0]); #ifdef DEBUG1 fprintf (stderr, " %.2f\n", word->saving); #endif word->num_trans = num_trans; heap_changedhead (discard_heap, sizeof (word), discard_num, BigSaving); } } else if (keep_heap[0]->saving < discard_heap[0]->saving) { assert (keep_num && discard_num); if (keep_num && mem + sizeof (char *) + discard_heap[0]->word[0] > k) { /* Transfer the word at the top of the keep heap to the discard heap. */ WordData *word = keep_heap[0]; int idx = KIND (word); heap_deletehead (keep_heap, sizeof (word), &keep_num, SmallSaving); discard_heap[discard_num] = word; heap_additem (discard_heap, sizeof (word), &discard_num, BigSaving); freqs_trans[idx] -= word->freq; mem -= sizeof (u_char *) + word->word[0]; num_trans++; total += word->saving; keep_to_discard++; #ifdef DEBUG fprintf (stderr, "KEEP -> DISCARD %8d :%8d :%8.0f :%8.0f : \"%s\"\n", mem, word->freq, word->char_bit_cost, word->saving, word2str (word->word)); #endif } else { /* Transfer the word at the top of the discard heap to the keep heap. 
*/ WordData *word = discard_heap[0]; int idx = KIND (word); heap_deletehead (discard_heap, sizeof (word), &discard_num, BigSaving); keep_heap[keep_num] = word; heap_additem (keep_heap, sizeof (word), &keep_num, SmallSaving); freqs_trans[idx] += word->freq; mem += sizeof (u_char *) + word->word[0]; num_trans++; total += word->saving; discard_to_keep++; #ifdef DEBUG fprintf (stderr, "DISCARD -> KEEP %8d :%8d :%8.0f :%8.0f : \"%s\"\n", mem, word->freq, word->char_bit_cost, word->saving, word2str (word->word)); #endif } recalc_reqd = 1; } else { if (recalc_reqd == 0) break; #ifdef DEBUG1 fprintf (stderr, "--------------\n"); #endif if (recalcs == MAX_RECALCULATIONS) break; CalcBitCost (discard_heap, discard_num, keep_heap, keep_num, freqs_trans, escape, num_trans); heap_build (discard_heap, sizeof (*discard_heap), discard_num, BigSaving); heap_build (keep_heap, sizeof (keep_heap), keep_num, SmallSaving); recalc_reqd = 0; recalcs++; } } Alloc_keep_discard (); for (i = 0; i < discard_num; i++) { WordData *word = discard_heap[i]; int idx = KIND (word); assert (IsWord (word) || IsNonWord (word)); discard[idx].wd[discard[idx].num_wds++] = word; } for (i = 0; i < keep_num; i++) { WordData *word = keep_heap[i]; int idx = KIND (word); assert (IsWord (word) || IsNonWord (word)); keep[idx].wd[keep[idx].num_wds++] = word; } SortAndCount_DictData (&keep[0]); SortAndCount_DictData (&keep[1]); SortAndCount_DictData (&discard[0]); SortAndCount_DictData (&discard[1]); assert (keep[0].num_wds + discard[0].num_wds == Num[0]); assert (keep[1].num_wds + discard[1].num_wds == Num[1]); Message ("Keep -> Discard : %8d", keep_to_discard); Message ("Discard -> Keep : %8d", discard_to_keep); Message ("Huffman Recalculations : %8d", recalcs); if (recalcs == MAX_RECALCULATIONS) Message ("WARNING: The number of recalculations == %d", MAX_RECALCULATIONS); }
/**
 * Reads a single token starting at mCursor without moving the cursor.
 *
 * @param aToken  Receives the parsed token: EndOfFile when mCursor == mEnd,
 *                otherwise Word, Number (or Error when the accumulated value
 *                overflowed the checked 64-bit integer), NewLine, Whitespace
 *                or Char.
 * @return Iterator just past the consumed input; mEnd when the cursor is
 *         already at the end.
 *
 * The token class is chosen from the first character alone, in this order:
 * word-start, digit, one of mWhitespaces, CR, LF, anything else.
 */
nsACString::const_char_iterator
Tokenizer::Parse(Token& aToken) const
{
  if (mCursor == mEnd) {
    aToken = Token::EndOfFile();
    return mEnd;
  }

  nsACString::const_char_iterator next = mCursor;

  enum State {
    PARSE_INTEGER,
    PARSE_WORD,
    PARSE_CRLF,
    PARSE_LF,
    PARSE_WS,
    PARSE_CHAR,
  } state;

  if (IsWordFirst(*next)) {
    state = PARSE_WORD;
  } else if (IsNumber(*next)) {
    state = PARSE_INTEGER;
  } else if (*next != '\0' && strchr(mWhitespaces, *next)) { // not UTF-8 friendly?
    // strchr() also matches the terminating NUL of the searched string, so an
    // embedded '\0' in the input must be excluded explicitly; it is reported
    // as a plain character token below instead of as whitespace.
    state = PARSE_WS;
  } else if (*next == '\r') {
    state = PARSE_CRLF;
  } else if (*next == '\n') {
    state = PARSE_LF;
  } else {
    state = PARSE_CHAR;
  }

  mozilla::CheckedUint64 resultingNumber = 0;

  while (next < mEnd) {
    switch (state) {
    case PARSE_INTEGER:
      // Keep it simple for now
      resultingNumber *= 10;
      resultingNumber += static_cast<uint64_t>(*next - '0');

      ++next;
      if (IsEnd(next) || !IsNumber(*next)) {
        // Overflow is only inspected once the whole digit run was consumed,
        // so an over-long number yields a single Error token.
        if (!resultingNumber.isValid()) {
          aToken = Token::Error();
        } else {
          aToken = Token::Number(resultingNumber.value());
        }
        return next;
      }
      break;

    case PARSE_WORD:
      ++next;
      if (IsEnd(next) || !IsWord(*next)) {
        aToken = Token::Word(Substring(mCursor, next));
        return next;
      }
      break;

    case PARSE_CRLF:
      ++next;
      if (!IsEnd(next) && *next == '\n') { // LF is optional
        ++next;
      }
      aToken = Token::NewLine();
      return next;

    case PARSE_LF:
      ++next;
      aToken = Token::NewLine();
      return next;

    case PARSE_WS:
      // Each whitespace character is its own token.
      ++next;
      aToken = Token::Whitespace();
      return next;

    case PARSE_CHAR:
      ++next;
      aToken = Token::Char(*mCursor);
      return next;
    } // switch (state)
  } // while (next < end)

  return next;
}
void TNGramBs::GetNGramIdV( const TStr& HtmlStr, TIntV& NGramIdV, TIntPrV& NGramBEChXPrV) const { // create MxNGramLen queues TVec<TIntQ> WIdQV(MxNGramLen); TVec<TIntPrQ> BEChXPrQV(MxNGramLen); for (int NGramLen=1; NGramLen<MxNGramLen; NGramLen++){ WIdQV[NGramLen].Gen(100*NGramLen, NGramLen+1); BEChXPrQV[NGramLen].Gen(100*NGramLen, NGramLen+1); } bool AllWIdQClrP=true; // extract words from text-string PSIn HtmlSIn=TStrIn::New(HtmlStr, false); THtmlLx HtmlLx(HtmlSIn); while (HtmlLx.Sym!=hsyEof){ if ((HtmlLx.Sym==hsyStr)||(HtmlLx.Sym==hsyNum)){ // get word-string & word-id TStr WordStr=HtmlLx.UcChA; int WId; int SymBChX=HtmlLx.SymBChX; int SymEChX=HtmlLx.SymEChX; if ((SwSet.Empty())||(!SwSet->IsIn(WordStr))){ if (!Stemmer.Empty()){ WordStr=Stemmer->GetStem(WordStr);} if (IsWord(WordStr, WId)){ if (!IsSkipWord(WId)){ NGramIdV.Add(0+WId); // add single word NGramBEChXPrV.Add(TIntPr(SymBChX, SymEChX)); // add positions for (int NGramLen=1; NGramLen<MxNGramLen; NGramLen++){ TIntQ& WIdQ=WIdQV[NGramLen]; TIntPrQ& BEChXPrQ=BEChXPrQV[NGramLen]; WIdQ.Push(WId); BEChXPrQ.Push(TIntPr(SymBChX, SymEChX)); AllWIdQClrP=false; // if queue full if (WIdQ.Len()==NGramLen+1){ // create sequence TIntV WIdV; WIdQ.GetSubValVec(0, WIdQ.Len()-1, WIdV); TIntPrV BEChXPrV; BEChXPrQ.GetSubValVec(0, BEChXPrQ.Len()-1, BEChXPrV); // add ngram-id or reset queues int WIdVP; if (WIdVToFqH.IsKey(WIdV, WIdVP)){ // if sequence is frequent int NGramId=GetWords()+WIdVP; // get sequence ngram-id NGramIdV.Add(NGramId); // add sequence ngram-id NGramBEChXPrV.Add(TIntPr(BEChXPrV[0].Val1, BEChXPrV.Last().Val2)); // add positions } } } } } else { // break queue sequences if infrequent word occures if (!AllWIdQClrP){ for (int NGramLen=1; NGramLen<MxNGramLen; NGramLen++){ TIntQ& WIdQ=WIdQV[NGramLen]; TIntPrQ& BEChXPrQ=BEChXPrQV[NGramLen]; if (!WIdQ.Empty()){WIdQ.Clr(); BEChXPrQ.Clr();} } AllWIdQClrP=true; } } } } // get next symbol HtmlLx.GetSym(); } }