示例#1
0
static int exec(FILE* fp, ENC_INFO* einfo)
{
#define NCOL  8

  int c, val, enc;

  enc = einfo->num;

  fprintf(fp, "static const unsigned short Enc%s_CtypeTable[256] = {\n",
	  einfo->name);

  for (c = 0; c < 256; c++) {
    val = 0;
    if (IsNewline(enc, c))  val |= BIT_CTYPE_NEWLINE;
    if (IsAlpha (enc, c))   val |= (BIT_CTYPE_ALPHA | BIT_CTYPE_ALNUM);
    if (IsBlank (enc, c))   val |= BIT_CTYPE_BLANK;
    if (IsCntrl (enc, c))   val |= BIT_CTYPE_CNTRL;
    if (IsDigit (enc, c))   val |= (BIT_CTYPE_DIGIT | BIT_CTYPE_ALNUM);
    if (IsGraph (enc, c))   val |= BIT_CTYPE_GRAPH;
    if (IsLower (enc, c))   val |= BIT_CTYPE_LOWER;
    if (IsPrint (enc, c))   val |= BIT_CTYPE_PRINT;
    if (IsPunct (enc, c))   val |= BIT_CTYPE_PUNCT;
    if (IsSpace (enc, c))   val |= BIT_CTYPE_SPACE;
    if (IsUpper (enc, c))   val |= BIT_CTYPE_UPPER;
    if (IsXDigit(enc, c))   val |= BIT_CTYPE_XDIGIT;
    if (IsWord  (enc, c))   val |= BIT_CTYPE_WORD;
    if (IsAscii (enc, c))   val |= BIT_CTYPE_ASCII;

    if (c % NCOL == 0) fputs("  ", fp);
    fprintf(fp, "0x%04x", val);
    if (c != 255) fputs(",", fp);
    if (c != 0 && c % NCOL == (NCOL-1))
      fputs("\n", fp);
    else
      fputs(" ", fp);
  }
  fprintf(fp, "};\n");
  return 0;
}
示例#2
0
static void 
Method3 (int k)
{
  int i, keep_num, discard_num, mem, num_trans, recalc_reqd;
  int keep_to_discard = 0;
  int discard_to_keep = 0;
  int recalcs = 0;
  double freqs_trans[2], total;
  u_long escape[2];
  WordData **keep_heap, **discard_heap;

  freqs_trans[0] = freqs_trans[1] = 0;
  num_trans = 0;

  discard_num = Num[0] + Num[1];
  discard_heap = Xmalloc (discard_num * sizeof (WordData *));

  keep_num = 0;
  keep_heap = Xmalloc (discard_num * sizeof (WordData *));


  for (i = 0; i < Num[0]; i++)
    discard_heap[i] = Words[0] + i;
  for (i = 0; i < Num[1]; i++)
    discard_heap[i + Num[0]] = Words[1] + i;

  heap_build (discard_heap, sizeof (*discard_heap), discard_num, DecFreqIncWL);

  mem = 0;
  while (k > mem && discard_num)
    {
      WordData *word = discard_heap[0];
      mem += sizeof (u_char *) + word->word[0];
      heap_deletehead (discard_heap, sizeof (word), &discard_num, DecFreqIncWL);
      keep_heap[keep_num++] = word;
      freqs_trans[KIND (word)] += word->freq;
      num_trans++;
    }

  CalcBitCost (discard_heap, discard_num, keep_heap, keep_num,
	       freqs_trans, escape, num_trans);
  heap_build (discard_heap, sizeof (*discard_heap), discard_num, BigSaving);
  heap_build (keep_heap, sizeof (*keep_heap), keep_num, SmallSaving);

  total = 0;
  recalc_reqd = 0;
  while (keep_num && discard_num)
    {
      if ((keep_num && keep_heap[0]->num_trans < num_trans) ||
	  (discard_num && discard_heap[0]->num_trans < num_trans))
	{
	  if (keep_num && keep_heap[0]->num_trans < num_trans)
	    {
	      WordData *word = keep_heap[0];
	      int idx = KIND (word);
	      float wbc;
#ifdef DEBUG1
	      fprintf (stderr, "KEEP \"%s\" %.2f ->", word2str (word->word),
		       word->saving);
#endif
	      wbc = -log2 (word->freq / (freqs_trans[idx] + escape[idx])) *
		word->freq;
	      word->saving = (word->char_bit_cost - wbc) /
		(sizeof (char *) + word->word[0]);
#ifdef DEBUG1
	      fprintf (stderr, " %.2f\n", word->saving);
#endif
	      word->num_trans = num_trans;
	      heap_changedhead (keep_heap, sizeof (word), keep_num, SmallSaving);
	    }

	  if (discard_num && discard_heap[0]->num_trans < num_trans)
	    {
	      WordData *word = discard_heap[0];
	      int idx = KIND (word);
	      float wbc;
#ifdef DEBUG1
	      fprintf (stderr, "DISCARD \"%s\" %.2f ->", word2str (word->word),
		       word->saving);
#endif
	      wbc = -log2 (word->freq / (freqs_trans[idx] + escape[idx])) *
		word->freq;
	      word->saving = (word->char_bit_cost - wbc) /
		(sizeof (char *) + word->word[0]);
#ifdef DEBUG1
	      fprintf (stderr, " %.2f\n", word->saving);
#endif
	      word->num_trans = num_trans;
	      heap_changedhead (discard_heap, sizeof (word),
				discard_num, BigSaving);
	    }
	}
      else if (keep_heap[0]->saving < discard_heap[0]->saving)
	{
	  assert (keep_num && discard_num);
	  if (keep_num && mem + sizeof (char *) + discard_heap[0]->word[0] > k)
	    {
	      /* Transfer the word at the top of the keep heap to the
	         discard heap. */
	      WordData *word = keep_heap[0];
	      int idx = KIND (word);
	      heap_deletehead (keep_heap, sizeof (word), &keep_num, SmallSaving);
	      discard_heap[discard_num] = word;
	      heap_additem (discard_heap, sizeof (word), &discard_num,
			    BigSaving);
	      freqs_trans[idx] -= word->freq;
	      mem -= sizeof (u_char *) + word->word[0];
	      num_trans++;
	      total += word->saving;
	      keep_to_discard++;
#ifdef DEBUG
	      fprintf (stderr,
		       "KEEP -> DISCARD %8d :%8d :%8.0f :%8.0f : \"%s\"\n",
		       mem, word->freq, word->char_bit_cost,
		       word->saving, word2str (word->word));
#endif
	    }
	  else
	    {
	      /* Transfer the word at the top of the discard heap to the
	         keep heap. */
	      WordData *word = discard_heap[0];
	      int idx = KIND (word);
	      heap_deletehead (discard_heap, sizeof (word),
			       &discard_num, BigSaving);
	      keep_heap[keep_num] = word;
	      heap_additem (keep_heap, sizeof (word), &keep_num, SmallSaving);
	      freqs_trans[idx] += word->freq;
	      mem += sizeof (u_char *) + word->word[0];
	      num_trans++;
	      total += word->saving;
	      discard_to_keep++;
#ifdef DEBUG
	      fprintf (stderr,
		       "DISCARD -> KEEP %8d :%8d :%8.0f :%8.0f : \"%s\"\n",
		       mem, word->freq, word->char_bit_cost,
		       word->saving, word2str (word->word));
#endif
	    }

	  recalc_reqd = 1;

	}
      else
	{
	  if (recalc_reqd == 0)
	    break;
#ifdef DEBUG1
	  fprintf (stderr, "--------------\n");
#endif
	  if (recalcs == MAX_RECALCULATIONS)
	    break;
	  CalcBitCost (discard_heap, discard_num, keep_heap, keep_num,
		       freqs_trans, escape, num_trans);
	  heap_build (discard_heap, sizeof (*discard_heap),
		      discard_num, BigSaving);
	  heap_build (keep_heap, sizeof (keep_heap), keep_num, SmallSaving);
	  recalc_reqd = 0;
	  recalcs++;
	}
    }

  Alloc_keep_discard ();

  for (i = 0; i < discard_num; i++)
    {
      WordData *word = discard_heap[i];
      int idx = KIND (word);
      assert (IsWord (word) || IsNonWord (word));
      discard[idx].wd[discard[idx].num_wds++] = word;
    }
  for (i = 0; i < keep_num; i++)
    {
      WordData *word = keep_heap[i];
      int idx = KIND (word);
      assert (IsWord (word) || IsNonWord (word));
      keep[idx].wd[keep[idx].num_wds++] = word;
    }
  SortAndCount_DictData (&keep[0]);
  SortAndCount_DictData (&keep[1]);
  SortAndCount_DictData (&discard[0]);
  SortAndCount_DictData (&discard[1]);

  assert (keep[0].num_wds + discard[0].num_wds == Num[0]);
  assert (keep[1].num_wds + discard[1].num_wds == Num[1]);
  Message ("Keep -> Discard        : %8d", keep_to_discard);
  Message ("Discard -> Keep        : %8d", discard_to_keep);
  Message ("Huffman Recalculations : %8d", recalcs);
  if (recalcs == MAX_RECALCULATIONS)
    Message ("WARNING: The number of recalculations == %d", MAX_RECALCULATIONS);
}
示例#3
0
nsACString::const_char_iterator
Tokenizer::Parse(Token& aToken) const
{
  if (mCursor == mEnd) {
    aToken = Token::EndOfFile();
    return mEnd;
  }

  nsACString::const_char_iterator next = mCursor;

  enum State {
    PARSE_INTEGER,
    PARSE_WORD,
    PARSE_CRLF,
    PARSE_LF,
    PARSE_WS,
    PARSE_CHAR,
  } state;

  if (IsWordFirst(*next)) {
    state = PARSE_WORD;
  } else if (IsNumber(*next)) {
    state = PARSE_INTEGER;
  } else if (strchr(mWhitespaces, *next)) { // not UTF-8 friendly?
    state = PARSE_WS;
  } else if (*next == '\r') {
    state = PARSE_CRLF;
  } else if (*next == '\n') {
    state = PARSE_LF;
  } else {
    state = PARSE_CHAR;
  }

  mozilla::CheckedUint64 resultingNumber = 0;

  while (next < mEnd) {
    switch (state) {
    case PARSE_INTEGER:
      // Keep it simple for now
      resultingNumber *= 10;
      resultingNumber += static_cast<uint64_t>(*next - '0');

      ++next;
      if (IsEnd(next) || !IsNumber(*next)) {
        if (!resultingNumber.isValid()) {
          aToken = Token::Error();
        } else {
          aToken = Token::Number(resultingNumber.value());
        }
        return next;
      }
      break;

    case PARSE_WORD:
      ++next;
      if (IsEnd(next) || !IsWord(*next)) {
        aToken = Token::Word(Substring(mCursor, next));
        return next;
      }
      break;

    case PARSE_CRLF:
      ++next;
      if (!IsEnd(next) && *next == '\n') { // LF is optional
        ++next;
      }
      aToken = Token::NewLine();
      return next;

    case PARSE_LF:
      ++next;
      aToken = Token::NewLine();
      return next;

    case PARSE_WS:
      ++next;
      aToken = Token::Whitespace();
      return next;

    case PARSE_CHAR:
      ++next;
      aToken = Token::Char(*mCursor);
      return next;
    } // switch (state)
  } // while (next < end)

  return next;
}
示例#4
0
void TNGramBs::GetNGramIdV(
 const TStr& HtmlStr, TIntV& NGramIdV, TIntPrV& NGramBEChXPrV) const {
  // create MxNGramLen queues
  TVec<TIntQ> WIdQV(MxNGramLen);
  TVec<TIntPrQ> BEChXPrQV(MxNGramLen);
  for (int NGramLen=1; NGramLen<MxNGramLen; NGramLen++){
    WIdQV[NGramLen].Gen(100*NGramLen, NGramLen+1);
    BEChXPrQV[NGramLen].Gen(100*NGramLen, NGramLen+1);
  }
  bool AllWIdQClrP=true;
  // extract words from text-string
  PSIn HtmlSIn=TStrIn::New(HtmlStr, false);
  THtmlLx HtmlLx(HtmlSIn);
  while (HtmlLx.Sym!=hsyEof){
    if ((HtmlLx.Sym==hsyStr)||(HtmlLx.Sym==hsyNum)){
      // get word-string & word-id
      TStr WordStr=HtmlLx.UcChA;
      int WId; int SymBChX=HtmlLx.SymBChX; int SymEChX=HtmlLx.SymEChX;
      if ((SwSet.Empty())||(!SwSet->IsIn(WordStr))){
        if (!Stemmer.Empty()){
          WordStr=Stemmer->GetStem(WordStr);}
        if (IsWord(WordStr, WId)){
          if (!IsSkipWord(WId)){
            NGramIdV.Add(0+WId); // add single word
            NGramBEChXPrV.Add(TIntPr(SymBChX, SymEChX)); // add positions
            for (int NGramLen=1; NGramLen<MxNGramLen; NGramLen++){
              TIntQ& WIdQ=WIdQV[NGramLen];
              TIntPrQ& BEChXPrQ=BEChXPrQV[NGramLen];
              WIdQ.Push(WId); BEChXPrQ.Push(TIntPr(SymBChX, SymEChX));
              AllWIdQClrP=false;
              // if queue full
              if (WIdQ.Len()==NGramLen+1){
                // create sequence
                TIntV WIdV; WIdQ.GetSubValVec(0, WIdQ.Len()-1, WIdV);
                TIntPrV BEChXPrV; BEChXPrQ.GetSubValVec(0, BEChXPrQ.Len()-1, BEChXPrV);
                // add ngram-id or reset queues
                int WIdVP;
                if (WIdVToFqH.IsKey(WIdV, WIdVP)){ // if sequence is frequent
                  int NGramId=GetWords()+WIdVP; // get sequence ngram-id
                  NGramIdV.Add(NGramId); // add sequence ngram-id
                  NGramBEChXPrV.Add(TIntPr(BEChXPrV[0].Val1, BEChXPrV.Last().Val2)); // add positions
                }
              }
            }
          }
        } else {
          // break queue sequences if infrequent word occures
          if (!AllWIdQClrP){
            for (int NGramLen=1; NGramLen<MxNGramLen; NGramLen++){
              TIntQ& WIdQ=WIdQV[NGramLen];
              TIntPrQ& BEChXPrQ=BEChXPrQV[NGramLen];
              if (!WIdQ.Empty()){WIdQ.Clr(); BEChXPrQ.Clr();}
            }
            AllWIdQClrP=true;
          }
        }
      }
    }
    // get next symbol
    HtmlLx.GetSym();
  }
}