/*
 * Scans the symbol sequence of `rds` once and registers every adjacent
 * pair (seq[pos].code, seq[pos+1].code).
 *
 * A pair that is not yet known is created via createPair(); for a pair
 * that already exists, `pos` is appended behind the pair's current last
 * occurrence (b_pos) through the prev/next fields of the sequence and
 * the pair is passed to incrementPair().  Finally resetPQ(rds, 1) is
 * invoked.
 */
void initRDS(RDS *rds)
{
  SEQ *text = rds->seq;
  const uint text_len = rds->txt_len;
  uint pos;

  for (pos = 0; pos + 1 < text_len; pos++) {
    const CODE left  = text[pos].code;
    const CODE right = text[pos + 1].code;
    PAIR *known = locatePair(rds, left, right);

    if (known == NULL) {
      /* First occurrence of this pair. */
      createPair(rds, left, right, pos);
      continue;
    }

    /* Link this occurrence after the previous last one. */
    text[pos].prev = known->b_pos;
    text[pos].next = DUMMY_POS;
    text[known->b_pos].next = pos;
    known->b_pos = pos;
    incrementPair(rds, known);
  }

  resetPQ(rds, 1);
}
/*
 * Runs one Re-Pair pass over `input`, extending `dict` in place.
 *
 * Steps, in order:
 *   1. Build the working structure with createRDSwithsampling() when
 *      `sample` is non-zero, otherwise with createRDS().
 *   2. Clamp dict->num_rules so that
 *      num_rules - CHAR_SIZE + ut->size does not exceed shared_dictsize.
 *   3. Apply every existing rule (indices CHAR_SIZE .. num_rules-1) to
 *      the text via locatePair()/replacePairs().
 *   4. While getMaxPair() yields a pair and the number of codes in use
 *      (num_rules + ut->size - CHAR_SIZE) is below 2^codewordlength,
 *      add the pair as a new rule and replace its occurrences.
 *   5. Store the compressed sequence with getCompSeq() and release the
 *      working structure.
 *
 * Returns `dict`, or NULL when the working structure could not be
 * created.  Also resets the file-level `pqidx` to 0.
 */
DICT *RunRepair(DICT *dict, FILE *input, int length, unsigned int shared_dictsize, unsigned int codewordlength, USEDCHARTABLE *ut, unsigned int chunk_size, unsigned int sample)
{
  RDS  *rds;
  PAIR *max_pair;
  PAIR *target;
  CODE new_code;
  unsigned long long code_limit;
  uint i;

  if (sample)
    rds = createRDSwithsampling(input, length, chunk_size);
  else
    rds = createRDS(input, length);
  if (!rds) return NULL;
  pqidx = 0;

  /*
   * Number of distinct codewords that fit in `codewordlength` bits.
   * Computed in a 64-bit type: `1U << codewordlength` is undefined once
   * codewordlength reaches the width of unsigned int.  For widths of 64
   * or more the limit saturates, which can never be reached by the
   * unsigned int counter it is compared against.
   */
  code_limit = (codewordlength < 64) ? (1ULL << codewordlength) : ~0ULL;

  if (dict->num_rules - CHAR_SIZE + ut->size > shared_dictsize) {
    dict->num_rules = shared_dictsize + CHAR_SIZE - ut->size;
  }

  /* Rewrite the text using the rules already in the dictionary. */
  for (i = CHAR_SIZE; i < dict->num_rules; i++) {
    target = locatePair(rds, dict->rule[i].left, dict->rule[i].right);
    if (target) {
      replacePairs(rds, target, i);
    }
  }

  /* Keep adding rules for the current maximum pair while codes remain. */
  while ((max_pair = getMaxPair(rds)) != NULL &&
         (unsigned int)(dict->num_rules + ut->size - CHAR_SIZE) < code_limit) {
    new_code = addNewPair(dict, max_pair);
    replacePairs(rds, max_pair, new_code);
  }

  getCompSeq(rds, dict);
  destructRDS(rds);

  return dict;
}
/* Example #3 */
/*
 * Replaces the two-symbol occurrence that starts at `target_pos` with
 * `new_code` and updates the pair bookkeeping around it.
 *
 * With l / c / r / rr denoting the positions returned by leftPos(),
 * target_pos itself, rightPos(target_pos) and rightPos(r_pos):
 *   - the pair (l, c) loses one occurrence and (l, new_code) gains one;
 *   - seq[c] becomes new_code and seq[r] becomes DUMMY_CODE;
 *   - the pair (r, rr) loses one occurrence and, under the condition
 *     checked below, (new_code, rr) gains one.
 * Pairs are looked up per context code (pcode) of their left symbol.
 *
 * Nothing is returned; all effects are on `crds`.
 */
static
void updateBlock(CRDS *crds, CODE new_code, uint target_pos)
{
  SEQ *seq = crds->seq;
  uint l_pos, r_pos, rr_pos, nx_pos;
  CODE c_code, r_code, l_code, rr_code;
  PCODE c_pcode, r_pcode, l_pcode;
  PAIR *l_pair, *c_pair, *r_pair;

  /* Neighbouring positions as reported by leftPos()/rightPos(). */
  l_pos   = leftPos(crds, target_pos);
  r_pos   = rightPos(crds, target_pos);
  rr_pos  = rightPos(crds, r_pos);
  c_code  = seq[target_pos].code;
  c_pcode = seq[target_pos].pcode;
  r_code  = seq[r_pos].code;
  r_pcode = seq[r_pos].pcode;

  /* Remember the following entry of target_pos's list before the links
     are rewritten; if that entry is r_pos itself, take the one after. */
  nx_pos = seq[target_pos].next;
  if (nx_pos == r_pos) {
    nx_pos = seq[nx_pos].next;
  }

  assert(c_code != DUMMY_CODE);
  assert(r_code != DUMMY_CODE);

  /* Left neighbour: move l_pos from pair (l, c) to pair (l, new_code). */
  if (l_pos != DUMMY_POS) {
    l_code = seq[l_pos].code;
    l_pcode = seq[l_pos].pcode;
    assert(seq[l_pos].code != DUMMY_CODE);
    removeLink(crds, l_pos);
    if ((l_pair = locatePair(crds, l_pcode, l_code, c_code)) != NULL) {
      /* Keep the pair's first-occurrence pointer off the removed slot. */
      if (l_pair->f_pos == l_pos) {
	l_pair->f_pos = seq[l_pos].next;
      }
      decrementPair(crds, l_pair);
    }
    if ((l_pair = locatePair(crds, l_pcode, l_code, new_code)) == NULL) {
      seq[l_pos].prev = DUMMY_POS;
      seq[l_pos].next = DUMMY_POS;
      createPair(crds, l_pcode, l_code, new_code, l_pos);
    }
    else {
      /* Append l_pos behind the pair's current last occurrence. */
      seq[l_pos].prev = l_pair->b_pos;
      seq[l_pos].next = DUMMY_POS;
      seq[l_pair->b_pos].next = l_pos;
      l_pair->b_pos = l_pos;
      incrementPair(crds, l_pair);
    }
  }

  /* Perform the substitution itself. */
  removeLink(crds, target_pos);
  removeLink(crds, r_pos);
  seq[target_pos].code = new_code;
  seq[r_pos].code = DUMMY_CODE;
  
  if (rr_pos != DUMMY_POS) {
    rr_code = seq[rr_pos].code;
    assert(rr_code != DUMMY_CODE);
    /* The old pair (r, rr) loses this occurrence. */
    if ((r_pair = locatePair(crds, r_pcode, r_code, rr_code)) != NULL) {
      if (r_pair->f_pos == r_pos) {
	r_pair->f_pos = seq[r_pos].next;
      }
      decrementPair(crds, r_pair);
    }

    /* Record in the cells strictly between target_pos and rr_pos where
       the two ends are: the first cell's prev holds rr_pos, the last
       cell's next holds target_pos (one cell carries both when the gap
       has length 1).
       NOTE(review): presumably leftPos()/rightPos() use these fields to
       jump over runs of DUMMY_CODE cells -- confirm. */
    if (target_pos + 1 == rr_pos - 1) {
      seq[target_pos+1].prev = rr_pos;
      seq[target_pos+1].next = target_pos;
    }
    else {
      seq[target_pos+1].prev = rr_pos;
      seq[target_pos+1].next = DUMMY_POS;
      seq[rr_pos-1].prev = DUMMY_POS;
      seq[rr_pos-1].next = target_pos;
    }
    /* Register (new_code, rr) only when the list entry saved in nx_pos
       lies beyond rr_pos.
       NOTE(review): presumably this skips the case where rr_pos itself
       starts the next occurrence still to be replaced -- confirm. */
    if (nx_pos > rr_pos) {
      if ((c_pair = locatePair(crds, c_pcode, new_code, rr_code)) == NULL) {
	seq[target_pos].prev = seq[target_pos].next = DUMMY_POS;
	createPair(crds, c_pcode, new_code, rr_code, target_pos);
      }
      else {
	seq[target_pos].prev = c_pair->b_pos;
	seq[target_pos].next = DUMMY_POS;
	seq[c_pair->b_pos].next = target_pos;
	c_pair->b_pos = target_pos;
	incrementPair(crds, c_pair);
      }
    }
    else {
      seq[target_pos].next = seq[target_pos].prev = DUMMY_POS;
    }
  }
  /* No symbol to the right of r_pos, and target_pos is not the last cell. */
  else if (target_pos < crds->txt_len - 1) {
    assert(seq[target_pos+1].code == DUMMY_CODE);
    seq[target_pos+1].prev = DUMMY_POS;
    seq[target_pos+1].next = target_pos;
    /* NOTE(review): when r_pos == target_pos + 1 this overwrites the
       .next value stored on the line above -- confirm this is intended. */
    seq[r_pos].prev = seq[r_pos].next = DUMMY_POS;
  }
}
/* Example #4 */
/*
 * Reads the whole of `input` and builds the context-aware working
 * structure (CRDS).
 *
 * Parameters:
 *   input      - readable, seekable stream; it is rewound and read to EOF.
 *   cont_len   - number of preceding symbols that form a context.
 *   mchar_size - requested number of character classes used for the
 *                contexts; lowered to the number of distinct input bytes
 *                when that is smaller.
 *
 * Processing steps:
 *   1. Load the bytes into seq[] and record which byte values occur.
 *   2. Renumber the occurring byte values densely (char_table) and
 *      count their frequencies.
 *   3. Map every dense code to a class in [0, mchar_size): identity
 *      when there are enough classes, otherwise codes are taken in
 *      decreasing frequency and dealt out in a back-and-forth
 *      (0..m-1, m-1..0, ...) order.
 *   4. Compute the context id (pcode) of every position from the
 *      classes of the `cont_len` preceding symbols; positions before
 *      the start of the text contribute HEAD_PCODE.
 *   5. Create one PQUE per context and register all adjacent pairs,
 *      then call deletePQ(crds, 1, ctx) for every context.
 *
 * Returns the new CRDS, or NULL when the stream cannot be measured,
 * is empty, yields zero contexts, or an allocation fails.  On failure
 * everything allocated here is released.
 */
static
CRDS *createCRDS(FILE *input, uint cont_len, uint mchar_size)
{
  uint size_w;
  long file_len;
  SEQ  *seq = NULL;
  uint char_size;
  bool check_table[MAX_CHAR_SIZE];
  uint char_freq[MAX_CHAR_SIZE];
  CODE  *char_table = NULL;
  uchar *mchar_table = NULL;
  /* NOTE(review): computed from the requested mchar_size, i.e. before it
     may be lowered to char_size below -- kept as in the original. */
  uint num_contexts = (uint)pow(mchar_size, cont_len);
  PQUE **p_que = NULL;
  uint num_queues = 0;   /* queues fully or partly built, for cleanup */
  uint p_max;
  uint head_slots;
  CRDS *crds;

  /* p_max below divides by num_contexts. */
  if (num_contexts == 0) {
    return NULL;
  }

  if (fseek(input, 0, SEEK_END) != 0) {
    return NULL;
  }
  file_len = ftell(input);
  if (file_len <= 0) {
    /* ftell failure (-1) or empty input: nothing to build. */
    return NULL;
  }
  rewind(input);
  size_w = (uint)file_len;
  if ((unsigned long)size_w != (unsigned long)file_len) {
    /* Length does not fit the position type. */
    return NULL;
  }

  seq = (SEQ *)malloc(sizeof(SEQ) * size_w);
  char_table  = (CODE *)malloc(sizeof(CODE) * MAX_CHAR_SIZE);
  mchar_table = (uchar *)malloc(sizeof(uchar) * MAX_CHAR_SIZE);
  if (seq == NULL || char_table == NULL || mchar_table == NULL) {
    goto fail;
  }

  {
    uint i;
    for (i = 0; i < MAX_CHAR_SIZE; i++) {
      check_table[i] = false;
      char_table[i]  = DUMMY_CODE;
      char_freq[i]   = 0;
      mchar_table[i] = 0;
    }
  }

  /* Step 1: load the text.  getc() is kept in an int so that EOF is
     distinguishable, and the loop never writes past the allocation. */
  char_size = 0;
  {
    int ch;
    uint n_read = 0;
    while (n_read < size_w && (ch = getc(input)) != EOF) {
      seq[n_read].code = (CODE)ch;
      seq[n_read].next = DUMMY_POS;
      seq[n_read].prev = DUMMY_POS;
      if (check_table[ch] == false) {
        check_table[ch] = true;
        char_size++;
      }
      n_read++;
    }
    /* Only the bytes actually read are initialised. */
    size_w = n_read;
  }
  if (size_w == 0) {
    goto fail;
  }

  if (char_size <= mchar_size) {
    mchar_size = char_size;
  }

  /* Step 2: dense renumbering of the byte values that occur. */
  {
    uint i, j;
    for (i = 0, j = 0; i < MAX_CHAR_SIZE; i++) {
      if (check_table[i] == true) {
        char_table[i] = (CODE)j++;
      }
    }
  }

  {
    uint i;
    for (i = 0; i < size_w; i++) {
      seq[i].code = char_table[seq[i].code];
      char_freq[seq[i].code]++;
    }
  }

  /* Step 3: assign each dense code to a character class. */
  if (mchar_size < char_size) {
    uint i, j;
    int k = 0;
    uchar max_code = 0;
    uint max;
    bool up_flag = true;
    for (i = 0; i < char_size; i++) {
      /* Pick the most frequent code not yet assigned. */
      max = 0;
      for (j = 0; j < char_size; j++) {
        if (char_freq[j] > max) {
          max_code = (uchar)j;
          max = char_freq[j];
        }
      }
      char_freq[max_code] = 0;
      mchar_table[max_code] = k;
      /* Walk the class index up to mchar_size-1, then back down to 0. */
      if (up_flag == true) {
        k++;
        if ((uint)k == mchar_size) {
          k = mchar_size - 1;
          up_flag = false;
        }
      }
      else {
        k--;
        if (k < 0) {
          k = 0;
          up_flag = true;
        }
      }
    }
  }
  else {
    uint i;
    for (i = 0; i < char_size; i++) {
      mchar_table[i] = i;
    }
  }

  /* Step 4: context id of every position. */
  {
    /* At least one element: a zero-length VLA is not allowed. */
    uchar context[cont_len > 0 ? cont_len : 1];
    uint i, k;
    CODE id;

    for (i = 0; i < size_w; i++) {
      for (k = 0; k < cont_len; k++) {
        /* Slot k refers to text position i - (cont_len - k). */
        uint back = cont_len - k;
        if (i < back) {
          context[k] = HEAD_PCODE;
        }
        else {
          context[k] = mchar_table[seq[i - back].code];
        }
      }
      id = getContextID(mchar_size, cont_len, context);
      seq[i].pcode = id;
    }
  }

  p_max = (uint)ceil(sqrt((double)size_w))/num_contexts;
  printf("p_max = %u\n", p_max);
  /* Avoid malloc(0), whose NULL result would look like a failure. */
  head_slots = (p_max > 0) ? p_max : 1;

  /* Step 5a: one priority-queue structure per context. */
  {
    uint i, j;
    p_que = (PQUE **)malloc(sizeof(PQUE *) * num_contexts);
    if (p_que == NULL) {
      goto fail;
    }
    for (i = 0; i < num_contexts; i++) {
      PQUE *q = (PQUE *)malloc(sizeof(PQUE));
      if (q == NULL) {
        goto fail;
      }
      q->h_entry = (PAIR **)malloc(sizeof(PAIR *) * primes[INIT_HASH_NUM]);
      q->p_head  = (PAIR **)malloc(sizeof(PAIR *) * head_slots);
      p_que[i] = q;
      num_queues = i + 1;
      if (q->h_entry == NULL || q->p_head == NULL) {
        goto fail;
      }
      for (j = 0; j < primes[INIT_HASH_NUM]; j++) {
        q->h_entry[j] = NULL;
      }
      for (j = 0; j < head_slots; j++) {
        q->p_head[j] = NULL;
      }
      q->h_num     = INIT_HASH_NUM;
      q->mp_pos    = 0;
      q->p_max     = p_max;
      q->num_pairs = 0;
    }
  }

  crds = (CRDS *)malloc(sizeof(CRDS));
  if (crds == NULL) {
    goto fail;
  }
  crds->txt_len      = size_w;
  crds->char_size    = char_size;
  crds->char_table   = char_table;
  crds->mchar_size   = mchar_size;
  crds->mchar_table  = mchar_table;
  crds->cont_len     = cont_len;
  crds->num_contexts = num_contexts;
  crds->seq          = seq;
  crds->p_que        = p_que;

  /* Step 5b: register every adjacent pair under its context. */
  {
    uint i;
    PCODE P;
    CODE A, B;
    PAIR *pair;

    /* `i + 1 < size_w` cannot wrap, unlike `i < size_w - 1`. */
    for (i = 0; i + 1 < size_w; i++) {
      P = seq[i].pcode;
      A = seq[i].code;
      B = seq[i+1].code;
      if ((pair = locatePair(crds, P, A, B)) == NULL) {
        pair = createPair(crds, P, A, B, i);
      }
      else {
        /* Append i behind the pair's current last occurrence. */
        seq[i].prev = pair->b_pos;
        seq[i].next = DUMMY_POS;
        seq[pair->b_pos].next = i;
        pair->b_pos = i;
        incrementPair(crds, pair);
      }
    }
    for (i = 0; i < num_contexts; i++) {
      deletePQ(crds, 1, i);
    }
  }

  return crds;

fail:
  if (p_que != NULL) {
    uint q;
    for (q = 0; q < num_queues; q++) {
      free(p_que[q]->h_entry);
      free(p_que[q]->p_head);
      free(p_que[q]);
    }
    free(p_que);
  }
  free(mchar_table);
  free(char_table);
  free(seq);
  return NULL;
}