Example #1
0
int main(int argc,char * argv[])
{
	const char * s1="xml.50MB";
	const char * sa="xml.csa";
	const char * s2="xml.txt";

	if(argc==2)
	{
		CSA csa(atoi(argv[1]));
		csa.Compress(s1,sa);
		cout<<"compress is ok"<<endl;
		csa.Decompress(sa,s2);
		cout<<"decompress is ok"<<endl;
	}
	else
	{
		CSA csa;
		csa.Compress(s1,sa);
		cout<<"compress is ok"<<endl;
//		time_t t1=clock();
		csa.Decompress(sa,s2);
//		time_t t2=clock();
//		cout<<(t2-t1)/1000000.0<<endl;
		cout<<"decompress is ok"<<endl;
	}
	return 0;
}
Example #2
0
/*
 * Returns the child of `node` that follows `child`.
 *
 * The candidate rank is child.r+1.  When that rank lies beyond node.r there
 * is nothing left and a sentinel node (depth = -1, l = 1, r = 0) is
 * returned.  Otherwise psi is applied node.depth times to the candidate
 * rank and head() is queried at the resulting position:
 *   - head() == -1: a node of depth node.depth+1 with l = r = node.l;
 *   - otherwise:    the result of cst_child(node, csa->AtoC[head]).
 *
 * The sentinel carries node.csa, so callers never receive a node whose
 * csa pointer is indeterminate.
 */
cst_node cst_nextchild(cst_node node, cst_node child)
{
  int c;
  i64 l;
  i64 i, depth;
  CSA *csa;
  cst_node newnode;

  csa = node.csa;
  l = child.r+1;
  depth = node.depth;

  if (l > node.r) {
    /* No further child: empty interval, negative depth. */
    newnode.csa = csa;
    newnode.depth = -1;
    newnode.l = 1;
    newnode.r = 0;
    return newnode;
  }

  /* Advance the candidate rank node.depth times through psi. */
  for (i=0; i<depth; i++) {
    l = csa->psi(csa, l);
  }

  c = csa->head(csa, l);
  if (c == -1) {
    newnode.csa = csa;
    newnode.depth = depth+1;
    newnode.l = newnode.r = node.l;
    return newnode;
  } else {
    return cst_child(node, csa->AtoC[c]);
  }
}
Example #3
0
/*
 * Ruby binding around sa->searchsub(): takes a character code `oc` and an
 * interval `range`, lets searchsub() update the interval in place and
 * returns the updated interval, or nil when searchsub() reports -1.
 *
 * With USE_RANGE the interval is a Ruby Range (an end-exclusive range is
 * read as ending one earlier); otherwise it must be a two-element array,
 * and anything else yields nil.  The result uses the same representation.
 */
static VALUE
rb_csa_search_l(VALUE self, VALUE oc, VALUE range)
{
    CSA *sa = csa_ptr(self);
    i64 ll,rr;
    int c;
    i64 ret;

    c = FIX2INT(oc);
#if USE_RANGE
    ll = FIX2LONG(range_first(range));
    rr = FIX2LONG(range_last(range));  if (range_exclude_end_p(range) == Qtrue) rr--;
#else
    if (RALEN(range) != 2) {
      return Qnil;
    }
    ll = FIX2LONG(RAPTR(range)[0]);
    rr = FIX2LONG(RAPTR(range)[1]);
#endif
    ret = sa->searchsub(c, sa, &ll, &rr);
    if (ret == -1) return Qnil;

#if USE_RANGE
    /* rb_range_new()'s third argument is a C int "exclude_end" flag.  The
       former Qnil is a non-zero VALUE, which built ll...rr (end-exclusive)
       and lost one rank when such a range was read back above. */
    return rb_range_new(LONG2FIX(ll), LONG2FIX(rr), 0);
#else
    return rb_ary_new3(2, LONG2FIX(ll), LONG2FIX(rr));
#endif
}
Example #4
0
/*
 * Increases node.depth for as long as both ends of the node's interval
 * report the same head() value.
 *
 * Both interval ends (node.l, node.r) are first advanced node.depth times
 * through psi.  The two ranks are then stepped in lockstep: while head()
 * agrees on both, each rank is advanced once more and the depth grows by
 * one; the first disagreement ends the walk.  The result is a copy of the
 * argument in which only `depth` has been replaced.
 */
cst_node cst_canonize(cst_node node)
{
  CSA *sa = node.csa;
  i64 lo = node.l;
  i64 hi = node.r;
  i64 d = node.depth;
  i64 remaining;

  /* Move both ends past the part already covered by the current depth. */
  for (remaining = d; remaining > 0; remaining--) {
    lo = sa->psi(sa, lo);
    hi = sa->psi(sa, hi);
  }

  for (;;) {
    int head_lo = sa->head(sa, lo);
    int head_hi = sa->head(sa, hi);

    if (head_lo != head_hi) {
      node.depth = d;
      return node;
    }

    lo = sa->psi(sa, lo);
    hi = sa->psi(sa, hi);
    d++;
  }
}
Example #5
0
/*
 * Ruby binding around sa->text(): returns the bytes for the inclusive
 * position interval [i, j] as a Ruby String.
 *
 * With USE_RANGE the interval is a Ruby Range (an end-exclusive range is
 * read as ending one earlier); otherwise it must be a two-element array.
 * Returns nil when the argument has the wrong shape or when either bound
 * lies outside 0..sa->n.  An interval whose end precedes its start is
 * empty and yields "" without touching the index; previously that case
 * handed alloca() a zero or negative size.
 */
static VALUE
rb_csa_text(VALUE self, VALUE range)
{
    CSA *sa = csa_ptr(self);
    i64 i,j,n;
    uchar *buf;

#if USE_RANGE
    i = FIX2LONG(range_first(range));
    j = FIX2LONG(range_last(range));  if (range_exclude_end_p(range) == Qtrue) j--;
#else
    if (RALEN(range) != 2) {
      return Qnil;
    }
    i = FIX2LONG(RAPTR(range)[0]);
    j = FIX2LONG(RAPTR(range)[1]);
#endif
    n = sa->n;

    if (i < 0 || i > n || j < 0 || j > n) {    // error
      return Qnil;
    }

    if (j < i) {    // empty interval (e.g. i...i): nothing to extract
      return rb_str_new("", 0);
    }

    /* One spare byte for the terminating NUL written below.
       NOTE(review): the buffer lives on the stack and its size is chosen by
       the caller; very long intervals may exhaust the stack. */
    buf = (uchar *)alloca(j-i+1+1);
    sa->text(buf, sa, i, j);
    buf[j-i+1] = 0;
    return rb_str_new((const char *)buf, j-i+1);
}
Example #6
0
/*
 * Reports every occurrence of pattern[0..length-1] in the index.
 *
 * index   - pointer to a CSA.
 * occ     - out: malloc()ed array holding one csa->lookup() value per rank
 *           of the matching interval; the caller frees it.  Set to NULL
 *           when nothing matches.
 * numocc  - out: number of entries in *occ (0 when nothing matches).
 *
 * Always returns 0.  Terminates the process with exit(1) if the result
 * array cannot be allocated.
 */
int locate(void *index, uchar *pattern, ulong length, ulong **occ,
		   ulong *numocc)
{
  CSA *csa;
  i64 l,r;
  ulong *buf;
  ulong i,oc;
  i64 mlen;

  csa = (CSA *)index;
  mlen = csa->search(pattern, length, csa, &l, &r);
  if (mlen < length) {
    /* Only part of the pattern matched: no occurrences. */
    *numocc = 0;
    *occ = NULL;
    return 0;
  }
  oc = (ulong)(r - l + 1);

  /* Size the buffer from the occurrence count just computed; *numocc is a
     pure output parameter and holds no meaningful value at this point. */
  buf = malloc(oc * sizeof(*buf));
  if (buf == NULL) {
    printf("locate: not enough mem.\n");
    exit(1);
  }

  for (i=0; i<oc; i++) {
    buf[i] = csa->lookup(csa,l + i);
  }

  *numocc = oc;
  *occ = buf;
  return 0;
}
Example #7
0
File: cst.c Project: mrG7/hpg-libs
/*
 * Returns the first child of `node`.
 *
 * node.l is advanced node.depth times through psi and head() is queried
 * at the resulting rank.  Any value other than -1 is mapped through AtoC
 * and passed to cst_child().  A value of -1 produces a node with
 * depth = depth2 = node.depth+1 and l = r = node.l.
 */
cst_node cst_firstchild(cst_node node)
{
  CSA *sa = node.csa;
  i64 d = node.depth;
  i64 rank = node.l;
  i64 step;
  int head;
  cst_node leaf;

  for (step = 0; step < d; step++) {
    rank = sa->psi(sa, rank);
  }

  head = sa->head(sa, rank);
  if (head != -1) {
    return cst_child(node, sa->AtoC[head]);
  }

  leaf.csa = sa;
  leaf.depth = d+1;
  leaf.depth2 = d+1;
  leaf.r = node.l;
  leaf.l = leaf.r;
  return leaf;
}
Example #8
0
/*
 * Builds a temporary node whose interval ends are psi(node.l) and
 * psi(node.r), keeps node.csa and node.depth, and returns cst_parent()
 * of that temporary node.
 */
cst_node cst_suflink(cst_node node)
{
  CSA *sa = node.csa;
  i64 lo = node.l;
  i64 hi = node.r;
  cst_node shifted;

  shifted.csa = sa;
  shifted.l = sa->psi(sa, lo);
  shifted.r = sa->psi(sa, hi);
  shifted.depth = node.depth;

  return cst_parent(shifted);
}
Example #9
0
/*
 * Stores in *numocc the size of the interval that csa->search() finds for
 * pattern[0..length-1], or 0 when fewer than `length` characters matched.
 * `index` is a pointer to a CSA.  Always returns 0.
 */
int count(void *index, uchar *pattern, ulong length, ulong *numocc)
{
  CSA *sa = (CSA *)index;
  i64 lo, hi;
  i64 matched;

  matched = sa->search(pattern, length, sa, &lo, &hi);

  if (matched < length) {
    *numocc = 0;
  } else {
    *numocc = (ulong)(hi - lo + 1);
  }
  return 0;
}
Example #10
0
/*
 * Ruby binding around sa->inverse(): returns sa->inverse(sa, i) as a
 * Fixnum, or nil when i lies outside 0..sa->n.
 */
static VALUE
rb_csa_inverse(VALUE self, VALUE oi)
{
    CSA *sa = csa_ptr(self);
    i64 pos = FIX2LONG(oi);
    i64 limit = sa->n;
    i64 result;

    if (pos >= 0 && pos <= limit) {
      result = sa->inverse(sa, pos);
      return LONG2FIX(result);
    }

    return Qnil;    // out of range
}
Example #11
0
/*
 * Ruby binding that tries to extend an interval with every symbol of the
 * index alphabet (sa->AtoC[0 .. sa->m-1]) via sa->searchsub().  Each symbol
 * for which searchsub() returns 0 produces a pair [symbol, interval]; the
 * pair is yielded when a block is given and appended to an array otherwise.
 *
 * With USE_RANGE the interval is a Ruby Range (an end-exclusive range is
 * read as ending one earlier); otherwise it must be a two-element array,
 * and anything else yields nil.
 *
 * NOTE(review): the end of this function is not visible in this excerpt.
 */
static VALUE
rb_csa_child_l(VALUE self, VALUE range)
{
    CSA *sa = csa_ptr(self);
    i64 l,r,ll,rr;
    int c,i;
    VALUE charset;
    i64 ret;

#if USE_RANGE
    ll = FIX2LONG(range_first(range));
    rr = FIX2LONG(range_last(range));  if (range_exclude_end_p(range) == Qtrue) rr--;
#else
    if (RALEN(range) != 2) {
      return Qnil;
    }
    ll = FIX2LONG(RAPTR(range)[0]);
    rr = FIX2LONG(RAPTR(range)[1]);
#endif
    l = r = -1;

    /* The result array is only created when no block was supplied. */
    if (!rb_block_given_p()) charset = rb_ary_new();
    for (i=0; i<sa->m; i++) {
      c = sa->AtoC[i];
      /* searchsub() updates its interval in place, so restart from the
         caller's interval for every symbol. */
      l = ll;  r = rr;
      ret = sa->searchsub(c, sa, &l, &r);
      if (ret == 0) {
        /* NOTE(review): rb_range_new()'s third parameter is a C int
           exclude_end flag; Qnil is a non-zero VALUE, so the ranges built
           below look end-exclusive - confirm whether 0 was intended. */
        if (rb_block_given_p()) {
            rb_yield(rb_ary_new3(2,INT2FIX(c),
#if USE_RANGE
                     rb_range_new(LONG2FIX(l), LONG2FIX(r), Qnil)));
#else
                     rb_ary_new3(2,LONG2FIX(l),LONG2FIX(r))));
#endif
        } else {
          rb_ary_push(charset,
            rb_ary_new3(2,INT2FIX(c),
#if USE_RANGE
            rb_range_new(LONG2FIX(l), LONG2FIX(r), Qnil)));
#else
            rb_ary_new3(2,LONG2FIX(l),LONG2FIX(r))));
#endif
        }
      }
    }
Example #12
0
/*
 * Ruby binding around sa->T(): returns sa->T(sa, i) as a Fixnum, or nil
 * when i lies outside 0..sa->n.
 */
static VALUE
rb_csa_T(VALUE self, VALUE oi)
{
    CSA *sa = csa_ptr(self);
    i64 pos = FIX2LONG(oi);
    i64 limit = sa->n;
    int symbol;

    if (pos >= 0 && pos <= limit) {
      symbol = sa->T(sa, pos);
      return INT2FIX(symbol);
    }

    return Qnil;    // out of range
}
Example #13
0
/*
 * Ruby binding around sa->LF(): returns sa->LF(sa, i) as a Fixnum, or nil
 * when i lies outside 0..sa->n.
 */
static VALUE
rb_csa_lf(VALUE self, VALUE oi)
{
    CSA *sa = csa_ptr(self);
    i64 pos = FIX2LONG(oi);
    i64 limit = sa->n;
    i64 mapped;

    if (pos >= 0 && pos <= limit) {
      mapped = sa->LF(sa, pos);
      return LONG2FIX(mapped);
    }

    return Qnil;    // out of range
}
Example #14
0
/*
 * Ruby binding around sa->search(): looks up the string `okey` and returns
 * the interval reported by search(), or nil when fewer than the key's
 * length characters matched.
 *
 * The interval is returned as an end-inclusive Range when USE_RANGE is
 * set, and as a two-element array otherwise.
 */
static VALUE
rb_csa_search(VALUE self, VALUE okey)
{
    CSA *sa = csa_ptr(self);
    i64 keylen;
    i64 i[2];
    uchar *key;

    key = (uchar *)StringValueCStr(okey);
    keylen = RSLEN(okey);

    if (sa->search(key, keylen, sa, &i[0], &i[1]) < keylen) return Qnil;

#if USE_RANGE
    /* rb_range_new()'s third argument is a C int "exclude_end" flag.  The
       former Qnil is a non-zero VALUE and therefore produced i[0]...i[1],
       dropping the last rank of the interval. */
    return rb_range_new(LONG2FIX(i[0]), LONG2FIX(i[1]), 0);
#else
    return rb_ary_new3(2, LONG2FIX(i[0]), LONG2FIX(i[1]));
#endif
}
Example #15
0
/*
 * Ruby binding around sa->substring(): asks for up to `olen` bytes for
 * rank `orank` and returns what substring() produced as a Ruby String.
 * Returns nil when the rank lies outside 0..sa->n or the length is
 * negative.
 *
 * NOTE(review): the scratch buffer is obtained with alloca(), so its size
 * is controlled by the caller-supplied length - confirm that callers keep
 * it small.
 */
static VALUE
rb_csa_substring(VALUE self, VALUE orank, VALUE olen)
{
    CSA *sa = csa_ptr(self);
    i64 start = FIX2LONG(orank);
    i64 count = FIX2LONG(olen);
    i64 limit = sa->n;
    uchar *scratch;

    if (count < 0 || start < 0 || start > limit) {    // error
      return Qnil;
    }

    /* One extra byte for the NUL terminator appended below. */
    scratch = (uchar *)alloca(count+1);
    count = sa->substring(scratch, sa, start, count);
    scratch[count] = 0;

    return rb_str_new(scratch, count);
}
Example #16
0
/*
 * Extracts the text between positions `from` and `to` (both inclusive,
 * as given by the caller) into a freshly malloc()ed buffer.
 *
 * index          - pointer to a CSA.
 * snippet        - out: the extracted bytes (not NUL-terminated); the
 *                  caller frees it.  NULL when the range is empty.
 * snippet_length - out: number of bytes in *snippet.
 *
 * Both positions are incremented by one before csa->text() is called, and
 * the end is clamped to csa->n.  Always returns 0.  Terminates the process
 * with exit(1) if the buffer cannot be allocated, the same policy locate()
 * follows.
 */
int extract(void *index, ulong from, ulong to, uchar **snippet,
			ulong *snippet_length)
{
  CSA *csa;
  uchar *text;
  i64 len;

  csa = (CSA *)index;

  from++;  to++;
  if (to > csa->n) to = csa->n;

  if (from > to) {
    /* Start lies behind the (clamped) end: nothing to extract.  Without
       this guard the length below would be zero or negative. */
    *snippet = NULL;
    *snippet_length = 0;
    return 0;
  }

  len = to - from + 1;
  text = malloc(len);
  if (text == NULL) {
    printf("extract: not enough mem.\n");
    exit(1);
  }

  csa->text(text,csa,from,to);

  *snippet = text;
  *snippet_length = (ulong)len;
  return 0;
}
Example #17
0
/*
 * Returns non-zero when head() reports the same value for both ends of
 * the node's interval after each end has been advanced node.depth times
 * through psi, and zero otherwise.
 */
int cst_isunary(cst_node node)
{
  CSA *sa = node.csa;
  i64 lo = node.l;
  i64 hi = node.r;
  i64 remaining;
  int head_lo, head_hi;

  for (remaining = node.depth; remaining > 0; remaining--) {
    lo = sa->psi(sa, lo);
    hi = sa->psi(sa, hi);
  }

  head_lo = sa->head(sa, lo);
  head_hi = sa->head(sa, hi);

  return head_lo == head_hi;
}
Example #18
0
/*
 * Returns a node of depth node.depth-1 whose interval is the one that
 * csa->search() reports for the first node.depth-1 characters of
 * cst_pathlabel(node).  The search starts from node's own interval.
 *
 * Prints "cst_parent: ???" when search() does not return exactly the
 * requested length; the interval it left behind is used regardless.
 * The label obtained from cst_pathlabel() is released with free().
 */
cst_node cst_parent(cst_node node)
{
  CSA *sa = node.csa;
  cst_node up;
  uchar *path;
  i64 lo, hi;

  up.csa = sa;
  up.depth = node.depth-1;

  path = cst_pathlabel(node);

  lo = node.l;
  hi = node.r;
  if (sa->search(path, up.depth, sa, &lo, &hi) != up.depth) {
    printf("cst_parent: ???\n");
  }
  up.l = lo;
  up.r = hi;

  free(path);
  return up;
}
Example #19
0
/*
 * Extends node's interval with character `c` through csa->searchsub().
 *
 * When searchsub() returns 0 the result has depth node.depth+1 and the
 * interval searchsub() produced.  Any other return value gives a sentinel
 * node with depth = -1, l = 1, r = 0.  The result always carries node.csa.
 */
cst_node cst_weiner_link(cst_node node, int c)
{
  CSA *sa = node.csa;
  cst_node linked;
  i64 lo = node.l;
  i64 hi = node.r;

  linked.csa = sa;

  if (sa->searchsub(c, node.csa, &lo, &hi) == 0) {
    linked.depth = node.depth+1;
    linked.l = lo;
    linked.r = hi;
    return linked;
  }

  /* searchsub() failed: report an empty interval. */
  linked.depth = -1;
  linked.l = 1;
  linked.r = 0;
  return linked;
}
Example #20
0
/*
 * Decodes the text of a CSA index into the file "output.dec".
 *
 * The index is loaded with csa_read() from the command-line arguments.
 * Positions 0..csa.n-1 are then fetched through csa.text() in pieces of at
 * most PAGE bytes and appended to the output file; a progress counter and
 * the elapsed time are printed to stderr.
 *
 * Returns 1 when no file argument was given or when the output file
 * cannot be created, 0 otherwise.  `buf` and PAGE are defined outside this
 * function.
 */
int main(int argc, char *argv[])
{
  i64 i,n;
  CSA csa;
  mytimestruct before,after;
  double t;

   if (argc<2) {
      fprintf(stderr, "syntax: suftest file\n");
      return 1;
   }

   csa_read(&csa,argc,argv);
   n = csa.n;

   mygettime(&before);
   {
     int m;
     FILE *out;
     out = fopen("output.dec","w");
     if (out == NULL) {
       /* Without this check fwrite()/fclose() would receive a NULL stream. */
       perror("output.dec");
       return 1;
     }
     i = 0;
     while (i < n) {
       /* Refresh the progress display once every PAGE chunks. */
       if ((i/PAGE) % PAGE == 0) {
         fprintf(stderr,"%ld \r",i/PAGE);  fflush(stderr);
       }
       m = PAGE;
       if (i+m >= n) m = n-i;   /* last, possibly shorter, chunk */
       csa.text(buf,&csa,i,i+m-1);
       fwrite(buf,1,m,out);
       i += m;
     }
     fwrite(buf,1,0,out);
     fclose(out);
   }
   mygettime(&after);
   t = mylaptime(&before,&after);
   fprintf(stderr,"time %f sec\n",t);
   return 0;
}
Example #21
0
/*
 * Command-line driver that builds the index files for one input file.
 *
 * Arguments starting with '-' are options (case-insensitive first letter):
 *   -I...  handed to csa_options(); also enables writing the ".idx" file
 *   -P...  handed to psi_options()
 *   -C...  handed to sigma_options()
 * Any other argument is taken as the input file name; the index file name
 * is that name with ".idx" appended.
 *
 * Steps, in order:
 *   1. allocate the per-symbol tables C, CtoA, AtoC and K;
 *   2. call the *_makeindex() routine selected by (csa.id & 0x3f) and print
 *      the size it reports;
 *   3. derive csa.k from csa.n and fill CtoA / AtoC / K from the counts in
 *      csa.C;
 *   4. if -I was given: fill csa.SA / csa.ISA at multiples of csa.D /
 *      csa.D2 and write version, header, SA and ISA to the ".idx" file.
 *
 * The process is terminated with exit() on an unknown option, a missing
 * input file, an unknown id, D or D2 not smaller than n, or when the index
 * file cannot be opened.
 */
void csa_new_from_bwt(int argc, char *argv[])
{
  i64 i,j,v,m;
  FILE *f2;
  i64 psize,isize;
  i64 n;
  int k;
  char *fname,*fidx;
  char *p;
  int psi_id, idx_id;
  CSA csa;
  int sigma;

  csa.sigma = 256; /* default alphabet size */
  csa.k2 = 1;

//  for (i=0; i<SIGMA+2; i++) csa.C[i] = 0;
//  for (i=0; i<SIGMA; i++) csa.C[i] = 0;

  fname = NULL;  fidx = NULL;
  psi_id = idx_id = -1;
  for (i=1; i<argc; i++) {
    p = argv[i];
    if (p[0] == '-') {
      p++;
      switch (toupper(p[0])) {
      case 'I':
      // -I[n]:[D]:[D2]
        p++;
        idx_id = 0;
        csa_options(&csa, p);
        break;
      case 'P':
      // -P[n]:[L]
        p++;
        psi_id = 0;
        psi_options(&csa, p);
        break;
      case 'C':
      // -C[s]
        p++;
        sigma_options(&csa, p);
        break;
      default:
        printf("??? no such option %s\n",argv[i]);
        exit(1);
      }
    } else {
      /* Input file name; "<name>.idx" needs strlen + 4 characters + NUL.
         NOTE(review): if several file names are given, the buffers
         allocated for the earlier ones are never released. */
      fname = argv[i];
      k = strlen(fname);
      fidx = mymalloc(k+5);
      sprintf(fidx,"%s.idx",fname);
    }
  }
  if (fname == NULL) {
    printf("no input file.\n");
    exit(0);
  }
  printf("sigma = %d k2 = %d\n", csa.sigma, csa.k2);
  sigma = csa.sigma;

  /* K gets two extra slots: entry 0 and the closing entry K[m+1] below. */
  csa.C = mymalloc(sizeof(*csa.C)*sigma); //
  csa.CtoA = mymalloc(sizeof(*csa.CtoA)*sigma); //
  csa.AtoC = mymalloc(sizeof(*csa.AtoC)*sigma); //
  csa.K = mymalloc(sizeof(*csa.K)*(sigma+2)); //
  for (i=0; i<sigma; i++) csa.C[i] = 0;


  /* From here on psi_id is csa.id, replacing the 0 / -1 set while parsing.
     NOTE(review): presumably csa.id is filled in by psi_options(); it is
     not assigned anywhere in this function - confirm it is defined when
     no -P option was given. */
  psi_id = csa.id;
  if (psi_id >= 0) {
    printf("create psi: id=%d\n",psi_id);
  }
  if (idx_id >= 0) {
    printf("create idx: id=%d D=%d D2=%d\n",idx_id,csa.D,csa.D2);
  }

  psize = 0;

  /* Build the main structure; the low six bits of the id pick the builder. */
  if (psi_id >= 0) {
    switch (psi_id & 0x3f) {
    case ID_DIFF_GAMMA:
    case ID_DIFF_GAMMA_RL:
    case ID_DIFF_GAMMA_SPARSE:
    case ID_DIFF_GAMMA_RL_SPARSE:
      psize = psi1_makeindex(&csa, fname);
      printf("n     %ld\n",csa.n);
      printf("Psi   %ld bytes (%1.3f bpc)\n",
              psize,(double)psize*8/csa.n);
      break;
    case ID_DIFF_GAMMA_RR:
      psize = psi12_makeindex(&csa, fname);
      printf("n     %ld\n",csa.n);
      printf("Psi   %ld bytes (%1.3f bpc)\n",
              psize,(double)psize*8/csa.n);
      break;
    case ID_BWT_DNA:
      psize = lf_dna_makeindex(&csa, fname);
      printf("n     %ld\n",csa.n);
      printf("BW    %ld bytes (%1.3f bpc)\n",
              psize,(double)psize*8/csa.n);
      break;
    case ID_BWT_DNA2:
      psize = lf_dna2_makeindex(&csa, fname);
      printf("n     %ld\n",csa.n);
      printf("BW    %ld bytes (%1.3f bpc)\n",
              psize,(double)psize*8/csa.n);
      break;
    case ID_BWT_BIT:
      psize = lf_bit_makeindex(&csa, fname);
      printf("n     %ld\n",csa.n);
      printf("BW    %ld bytes (%1.3f bpc)\n",
              psize,(double)psize*8/csa.n);
      break;
    case ID_BWT_WT:
    case ID_BWT_WT_HUF:
    case ID_BWT_WT_DENSE:
    case ID_BWT_WT_SPARSE4:
    case ID_BWT_WT_RR:
      psize = lf_wt_makeindex(&csa, fname);
      printf("n     %ld\n",csa.n);
      printf("BW    %ld bytes (%1.3f bpc)\n",
              psize,(double)psize*8/csa.n);
      break;
#if 0
    case ID_BWT_HUF:
      psize = lf_bwt_makeindex(&csa, fname);
      printf("n     %ld\n",csa.n);
      printf("BW    %ld bytes (%1.3f bpc)\n",
              psize,(double)psize*8/csa.n);
      break;
#endif
    case ID_SPARSE4:
      psize = psi2_makeindex(&csa, fname);
      printf("n     %ld\n",csa.n);
      printf("Psi   %ld bytes (%1.3f bpc)\n",
              psize,(double)psize*8/csa.n);
      break;
    default:
      printf("psi_id = %d\n",psi_id);
      exit(1);
    }
  }

  /* k is passed to putuint() below as its last argument; presumably the
     number of bytes per stored value - confirm against putuint(). */
  csa.k = (blog(csa.n+1)+1+8-1)/8;

  /* Compact the alphabet: symbols with a non-zero count in C get
     consecutive codes 0..m-1.  CtoA maps symbol -> code (-1 if absent),
     AtoC maps code -> symbol, and K[code+1] holds the running sum of the
     counts of all smaller codes, starting from 1. */
  for (i=0; i<sigma; i++) csa.CtoA[i] = -1;
  csa.K[-1+1] = 1;
  for (m=0,v=1,i=0; i<sigma; i++) {
    if (csa.C[i]>0) {
      csa.AtoC[m] = i;
      csa.CtoA[i] = m;
      csa.K[m+1] = v;
//      printf("i=%ld v = %ld C[i] = %ld\n",i,v,csa.C[i]);
      v += csa.C[i];
      m++;
    }
  }
  csa.K[m+1] = v;
  csa.m = m;

  if (csa.D >= csa.n) {
    printf("D=%d >= n=%ld\n",csa.D,csa.n);
    exit(0);
  }
  if (csa.D2 >= csa.n) {
    printf("D2=%d >= n=%ld\n",csa.D2,csa.n);
    exit(0);
  }

  if (idx_id >= 0) {
    n = csa.n;
    k = csa.k;
////  compute SA and ISA
    if (csa.D > 0) csa.SA = mymalloc(((n-1)/csa.D+1+1)*k);
    if (csa.D2 > 0) csa.ISA = mymalloc(((n-1)/csa.D2+1+1)*k);
    if (csa.D == 0 && csa.D2 == 0) goto brk;

    switch (psi_id & 0x3f) {
    case ID_DIFF_GAMMA:
    case ID_DIFF_GAMMA_RL:
    case ID_DIFF_GAMMA_SPARSE:
    case ID_DIFF_GAMMA_RL_SPARSE:
    case ID_SPARSE4:
    case ID_DIFF_GAMMA_RR:
      /* Follow psi from rank 0; text positions i increase from 0 to n. */
      j = 0;
      for (i=0; i<=n; i++) {
        display_progressbar("making sa ",i,n);
        j = csa.psi(&csa,j);
  //  sa[j] = i;
        if (csa.D > 0 && j % csa.D == 0) {
          putuint(csa.SA,j / csa.D,i,k);
        }
        if (csa.D2 > 0 && i % csa.D2 == 0) {
          putuint(csa.ISA,i / csa.D2,j,k);
        }
      }
//      putuint(csa.SA,0,n,k);
      break;
    /* NOTE(review): ID_BWT_HUF is accepted here although its builder in
       the first switch is disabled with #if 0, so that id already stops
       at the first switch's default case. */
    case ID_BWT_DNA:
    case ID_BWT_DNA2:
    case ID_BWT_BIT:
    case ID_BWT_WT:
    case ID_BWT_WT_HUF:
    case ID_BWT_WT_DENSE:
    case ID_BWT_WT_SPARSE4:
    case ID_BWT_WT_RR:
    case ID_BWT_HUF:
      /* Follow LF from rank 0; text positions i decrease from n-1 to 0. */
      j = 0;
      for (i=n-1; i>=0; i--) {
        display_progressbar("making sa ",i,n);
        v = csa.LF(&csa,j);
//        printf("LF[%ld] = %ld\n",j,v);
        j = v;
        if (csa.D > 0 && j % csa.D == 0) putuint(csa.SA, j/csa.D , i, k);
        if (csa.D2 > 0 && i % csa.D2 == 0) putuint(csa.ISA, i/csa.D2, j, k);
      }
//      putuint(csa.SA,0,n,k);
      /* SA entry 0 is set to n, but only when an SA buffer exists. */
      if (csa.D > 0) putuint(csa.SA,0,n,k); // 2011-12-20
      break;
    default:
      break;
    }
brk:
////      write idx
    f2 = fopen(fidx,"wb"); /* directory */
    if (f2 == NULL) {
      perror("csa2_new1: ");
      exit(1);
    }

    /* isize tracks the number of bytes written to the index file. */
    isize = 0;

    writeint(4,VERSION,f2); /* version */
    isize += 4;

    writeint(1,ID_HEADER,f2); // header ID
    isize += 1;
    isize = write_header(&csa, f2, isize);

    if (csa.D > 0) {
      writeint(1,ID_SA,f2);
      isize += 1;
      isize = write_sa(&csa, f2, isize);
    }

    if (csa.D2 > 0) {
      writeint(1,ID_ISA,f2);
      isize += 1;
      isize = write_isa(&csa, f2, isize);
    }


    fclose(f2);

    if (csa.D > 0) free(csa.SA);
    if (csa.D2 > 0) free(csa.ISA);

    printf("Total %ld bytes (%1.3f bpc)\n",(psize+isize),
                (double)(psize+isize)*8/csa.n);
  }
  free(fidx);
}
Example #22
0
/*
 * Builds the index files for `fname` from an already configured CSA.
 *
 * csa     - configuration, received by value; this function reads
 *           id, D, D2 and C from it and writes k, m, CtoA, AtoC, K, SA and
 *           ISA (the array members are written through the copied
 *           pointers).
 * fname   - input file handed to the *_makeindex() routines.
 * fidx    - name of the index file to write; released with free() before
 *           returning, so the caller must not use it afterwards.
 * psi_id  - ignored: overwritten with csa.id on entry.
 * idx_id  - a value >= 0 enables building SA/ISA and writing `fidx`.
 * coded   - forwarded to lf_dna_makeindex() only.
 *
 * Steps, in order:
 *   1. call the *_makeindex() routine selected by (csa.id & 0x3f) and print
 *      the size it reports;
 *   2. derive csa.k from csa.n and fill CtoA / AtoC / K from the counts in
 *      csa.C;
 *   3. if idx_id >= 0: fill csa.SA / csa.ISA at multiples of csa.D /
 *      csa.D2 and write version, header, SA and ISA to `fidx`.
 *
 * The process is terminated with exit() on an unknown id, D or D2 not
 * smaller than n, or when the index file cannot be opened.
 */
void csa_new_from_bwt(CSA csa, char *fname, char *fidx, int psi_id, int idx_id, bool coded)
{
	int k;
	i64 i,j,v,m;
	FILE *f2;
	i64 psize,isize;
	i64 n;

	psi_id = csa.id;
	if (psi_id >= 0) {
		printf("create psi: id=%d\n",psi_id);
	}
	if (idx_id >= 0) {
		printf("create idx: id=%d D=%d D2=%d\n",idx_id,csa.D,csa.D2);
	}

	psize = 0;

	/* Build the main structure; the low six bits of the id pick the builder. */
	if (psi_id >= 0) {
		switch (psi_id & 0x3f) {
			case ID_DIFF_GAMMA:
			case ID_DIFF_GAMMA_RL:
			case ID_DIFF_GAMMA_SPARSE:
			case ID_DIFF_GAMMA_RL_SPARSE:
				psize = psi1_makeindex(&csa, fname);
				printf("n     %ld\n",csa.n);
				printf("Psi   %ld bytes (%1.3f bpc)\n",
						psize,(double)psize*8/csa.n);
				break;
			case ID_DIFF_GAMMA_RR:
				psize = psi12_makeindex(&csa, fname);
				printf("n     %ld\n",csa.n);
				printf("Psi   %ld bytes (%1.3f bpc)\n",
						psize,(double)psize*8/csa.n);
				break;
			case ID_BWT_DNA:
				psize = lf_dna_makeindex(&csa, fname, coded);
				printf("n     %ld\n",csa.n);
				printf("BW    %ld bytes (%1.3f bpc)\n",
						psize,(double)psize*8/csa.n);
				break;
			case ID_BWT_BIT:
				psize = lf_bit_makeindex(&csa, fname);
				printf("n     %ld\n",csa.n);
				printf("BW    %ld bytes (%1.3f bpc)\n",
						psize,(double)psize*8/csa.n);
				break;
			case ID_BWT_WT:
			case ID_BWT_WT_HUF:
			case ID_BWT_WT_DENSE:
			case ID_BWT_WT_SPARSE4:
			case ID_BWT_WT_RR:
				psize = lf_wt_makeindex(&csa, fname);
				printf("n     %ld\n",csa.n);
				printf("BW    %ld bytes (%1.3f bpc)\n",
						psize,(double)psize*8/csa.n);
				break;
#if 0
			case ID_BWT_HUF:
				psize = lf_bwt_makeindex(&csa, fname);
				printf("n     %ld\n",csa.n);
				printf("BW    %ld bytes (%1.3f bpc)\n",
						psize,(double)psize*8/csa.n);
				break;
#endif
			case ID_SPARSE4:
				psize = psi2_makeindex(&csa, fname);
				printf("n     %ld\n",csa.n);
				printf("Psi   %ld bytes (%1.3f bpc)\n",
						psize,(double)psize*8/csa.n);
				break;
			default:
				printf("psi_id = %d\n",psi_id);
				exit(1);
		}
	}

	/* k is passed to putuint() below as its last argument; presumably the
	   number of bytes per stored value - confirm against putuint(). */
	csa.k = (blog(csa.n+1)+1+8-1)/8;

	/* Compact the alphabet: symbols with a non-zero count in C get
	   consecutive codes 0..m-1.  CtoA maps symbol -> code (-1 if absent),
	   AtoC maps code -> symbol, and K[code+1] holds the running sum of the
	   counts of all smaller codes, starting from 1. */
	for (i=0; i<SIGMA; i++) csa.CtoA[i] = -1;
	//  csa.K[-1+1] = 0;
	csa.K[-1+1] = 1;
	for (m=0,v=1,i=0; i<SIGMA; i++) {
		if (csa.C[i]>0) {
			csa.AtoC[m] = i;
			csa.CtoA[i] = m;
			csa.K[m+1] = v;
			//      printf("i=%ld v = %ld C[i] = %ld\n",i,v,csa.C[i]);
			v += csa.C[i];
			m++;
		}
	}
	csa.K[m+1] = v;
	csa.m = m;

	if (csa.D >= csa.n) {
		printf("D=%d >= n=%ld\n",csa.D,csa.n);
		exit(0);
	}
	if (csa.D2 >= csa.n) {
		printf("D2=%d >= n=%ld\n",csa.D2,csa.n);
		exit(0);
	}

	if (idx_id >= 0) {
		n = csa.n;
		k = csa.k;
		////  compute SA and ISA
		if (csa.D > 0) csa.SA = (uchar *) mymalloc(((n-1)/csa.D+1+1)*k);
		if (csa.D2 > 0) csa.ISA = (uchar *) mymalloc(((n-1)/csa.D2+1+1)*k);
		if (csa.D == 0 && csa.D2 == 0) goto brk;

		switch (psi_id & 0x3f) {
			case ID_DIFF_GAMMA:
			case ID_DIFF_GAMMA_RL:
			case ID_DIFF_GAMMA_SPARSE:
			case ID_DIFF_GAMMA_RL_SPARSE:
			case ID_SPARSE4:
			case ID_DIFF_GAMMA_RR:
				/* Follow psi from rank 0; text positions i increase from 0 to n. */
				j = 0;
				for (i=0; i<=n; i++) {
					display_progressbar("making sa ",i,n);
					j = csa.psi(&csa,j);
					//  sa[j] = i;
					if (csa.D > 0 && j % csa.D == 0) {
						putuint(csa.SA,j / csa.D,i,k);
					}
					if (csa.D2 > 0 && i % csa.D2 == 0) {
						putuint(csa.ISA,i / csa.D2,j,k);
					}
				}
				//      putuint(csa.SA,0,n,k);
				break;
			case ID_BWT_DNA:
			case ID_BWT_BIT:
			case ID_BWT_WT:
			case ID_BWT_WT_HUF:
			case ID_BWT_WT_DENSE:
			case ID_BWT_WT_SPARSE4:
			case ID_BWT_WT_RR:
			case ID_BWT_HUF:
				/* Follow LF from rank 0; text positions i decrease from n-1 to 0. */
				j = 0;
				for (i=n-1; i>=0; i--) {
					display_progressbar("making sa ",i,n);
					v = csa.LF(&csa,j);
					//        printf("LF[%ld] = %ld\n",j,v);
					j = v;
					if (csa.D > 0 && j % csa.D == 0) putuint(csa.SA, j/csa.D , i, k);
					if (csa.D2 > 0 && i % csa.D2 == 0) putuint(csa.ISA, i/csa.D2, j, k);
				}
				/* SA entry 0 is set to n.  csa.SA is only allocated above when
				   D > 0; with D == 0 and D2 > 0 this store must be skipped. */
				if (csa.D > 0) putuint(csa.SA,0,n,k);
				break;
			default:
				break;
		}
brk:
		////      write idx
		f2 = fopen(fidx,"wb"); /* directory */
		if (f2 == NULL) {
			perror("csa2_new1: ");
			exit(1);
		}

		/* isize tracks the number of bytes written to the index file. */
		isize = 0;

		writeint(4,VERSION,f2); /* version */
		isize += 4;

		writeint(1,ID_HEADER,f2); // header ID
		isize += 1;
		isize = write_header(&csa, f2, isize);

		if (csa.D > 0) {
			writeint(1,ID_SA,f2);
			isize += 1;
			isize = write_sa(&csa, f2, isize);
		}

		if (csa.D2 > 0) {
			writeint(1,ID_ISA,f2);
			isize += 1;
			isize = write_isa(&csa, f2, isize);
		}


		fclose(f2);

		if (csa.D > 0) free(csa.SA);
		if (csa.D2 > 0) free(csa.ISA);

		printf("Total %ld bytes (%1.3f bpc)\n",(psize+isize),
				(double)(psize+isize)*8/csa.n);
	}
	free(fidx);
}
Example #23
0
/*
 * Test driver for the character-wise reading helpers.
 *
 * Loads an index with csa_read() from the arguments after the program
 * name, opens it with csa_fdopen(), and then prints 1000 code points
 * obtained from csa_fgetwc() followed by 1000 obtained from csa_fgetwbw().
 * Each line shows the loop counter, the code, its string form produced by
 * unicode_to_string(), and the rank stored in the CSAFILE.
 *
 * Returns 1 with a usage message when fewer than two arguments follow the
 * program name, 0 otherwise.
 */
int main(int argc, char *argv[])
{
  i64 n;
  CSA SA;

  if (argc<3) {
    fprintf(stderr, "syntax: %s {indexfiles}\n", argv[0]);
    return 1;
  }

  csa_read(&SA,argc-1, argv+1);
  n = SA.n;

  /* Disabled variant that drives csa_utf8_T_psi / csa_utf8_BW_LF directly. */
#if 0
{
  int i;
  rank_t x;
  unicode_t code;
  uchar buf[6], *p;
  x = SA.inverse(&SA, 0);
  for (i = 0; i < 1000; i++) {
    if (csa_utf8_T_psi(&SA, &x, &code) == -1) {
      printf("???\n");
    }
    p = &buf[0];
    unicode_to_string(&p, code);
    buf[unicode_len(code)] = 0;
    printf("%d code = %d (%s) rank = %ld\n", i, code, &buf[0], x);
  }
  for (i = 0; i < 1000; i++) {
    if (csa_utf8_BW_LF(&SA, &x, &code) == -1) {
      printf("???\n");
    }
    p = &buf[0];
    unicode_to_string(&p, code);
    buf[unicode_len(code)] = 0;
    printf("%d code = %d (%s) rank = %ld\n", i, code, &buf[0], x);
  }
}
#endif

#if 1
{
  CSAFILE *stream;
  uchar utf8[6], *cursor;
  unicode_t wc;
  rank_t pos;
  int round;

  stream = csa_fdopen(&SA, NULL);

  /* First pass: csa_fgetwc(). */
  for (round = 0; round < 1000; round++) {
    wc = csa_fgetwc(stream);
    pos = stream->rank;
    cursor = &utf8[0];
    unicode_to_string(&cursor, wc);
    utf8[unicode_len(wc)] = 0;
    printf("%d code = %d (%s) rank = %ld\n", round, wc, &utf8[0], pos);
  }

  /* Second pass: csa_fgetwbw(). */
  for (round = 0; round < 1000; round++) {
    wc = csa_fgetwbw(stream);
    pos = stream->rank;
    cursor = &utf8[0];
    unicode_to_string(&cursor, wc);
    utf8[unicode_len(wc)] = 0;
    printf("%d code = %d (%s) rank = %ld\n", round, wc, &utf8[0], pos);
  }
}
#endif

  return 0;
}