/*
 * csa_batchlookup2 - look up the values for every index in [l, r] in one pass.
 *
 * For each index j the function follows csa_psi() until it either reaches an
 * index that is a multiple of SA->two (where SA->SA holds a stored sample) or
 * lands on another index inside [l, r].  In the second case the value of j is
 * derived later from the value of the index it landed on, so each chain of
 * psi steps is walked only once.
 *
 * Parameters:
 *   SA   - the index structure (fields two and SA are read here).
 *   l, r - inclusive index range to resolve.
 *
 * Returns a malloc'ed array of (r - l + 1) values, where element j-l belongs
 * to index j and every value has been decremented by one before returning
 * (presumably converting 1-based text positions to 0-based - confirm against
 * how SA->SA is built).  The caller owns the array and must free() it.
 * Returns NULL if memory could not be allocated.  An empty range (r < l)
 * yields a valid, freeable buffer with no meaningful elements.
 */
unsigned long *csa_batchlookup2(CSA *SA, int l, int r)
{
    unsigned long *I;   /* I[j-l]: value for index j; 0 means "not resolved yet" */
    int *V;             /* V[j-l]: psi steps taken from j to the in-range index it hit */
    int *J;             /* J[i-l]: index whose walk ended at i, or -1 if none */
    size_t count, slots;
    int two;
    int *sa;
    int i, j, v, q;
    int linked;

    two = SA->two;
    sa = SA->SA;

    count = (r >= l) ? (size_t)(r - l) + 1 : 0;
    slots = (count > 0) ? count : 1;    /* avoid the malloc(0) ambiguity */

    I = (unsigned long *)malloc(slots * sizeof(*I));
    V = (int *)malloc(slots * sizeof(*V));
    J = (int *)malloc(slots * sizeof(*J));
    if (I == NULL || V == NULL || J == NULL) {
        free(I);
        free(V);
        free(J);
        return NULL;
    }

    for (j = l; j <= r; j++) {
        J[j-l] = -1;
        I[j-l] = 0;
    }

    /* Phase 1: walk psi from every index until a sample or an in-range index. */
    for (j = l; j <= r; j++) {
        linked = 0;
        i = j;
        v = 0;
        while (i % two != 0) {
            i = csa_psi(SA, i);
            v++;
            if (l <= i && i <= r) {
                /* j depends on i: remember the distance and the back-link. */
                V[j-l] = v;
                J[i-l] = j;
                linked = 1;
                break;
            }
        }
        if (!linked) {
            /* Reached a sampled index: v psi steps were taken to get here. */
            I[j-l] = sa[i / two] - v;
        }
    }

    /* Phase 2: propagate resolved values backwards along the recorded links. */
    for (j = l; j <= r; j++) {
        if (I[j-l] == 0)
            continue;
        q = j;
        while (J[q-l] != -1) {
            i = J[q-l];
            I[i-l] = I[q-l] - V[i-l];
            J[q-l] = -1;    /* consume the link so it is followed only once */
            q = i;
        }
    }

    for (j = l; j <= r; j++)
        I[j-l]--;

    free(V);
    free(J);
    return I;
}
/*////////////////////////////
//Accessing the indexed text//
////////////////////////////*/

/*
 * display - return a text snippet around every occurrence of a pattern.
 *
 * Parameters:
 *   index           - opaque handle, cast to CSA * here.
 *   pattern, length - pattern bytes and their count, passed to csa_bsearch().
 *   numc            - number of context characters wanted on each side.
 *   numocc          - out: number of occurrences (size of the found range).
 *   snippet_text    - out: malloc'ed buffer holding one fixed-size slot of
 *                     (length + 2*numc) bytes per occurrence.
 *   snippet_lengths - out: malloc'ed array with the number of valid bytes in
 *                     each slot.
 *
 * Returns 0 on success and 1 when memory could not be allocated.  On failure
 * both output pointers are NULL and nothing is leaked.  On success the caller
 * owns both buffers and must free() them; they are non-NULL even when there
 * are no occurrences.
 */
int display(void *index, UCHAR *pattern, ULONG length, ULONG numc, ULONG *numocc,
            UCHAR **snippet_text, ULONG **snippet_lengths)
{
    int l, r;
    int pos;
    ULONG *occ, i, j, from, to, len, x;
    ULONG stride, text_bytes;
    UCHAR *text_aux;
    CSA *SA = (CSA *) index;

    *snippet_text = NULL;
    *snippet_lengths = NULL;

    csa_bsearch(pattern, length, SA, &l, &r);
    /* An empty range is reported with l > r; never let that wrap to a huge count. */
    *numocc = (r >= l) ? (ULONG)(r - l + 1) : 0;

    occ = csa_batchlookup2(SA, l, r);
    if (!occ && *numocc > 0)
        return 1;

    stride = length + 2*numc;   /* size of one snippet slot */

    /* Request at least one element/byte so that "no occurrences" does not
       depend on what malloc(0) returns on this platform. */
    *snippet_lengths = (ULONG *) malloc((*numocc ? *numocc : 1) * sizeof(ULONG));
    if (!(*snippet_lengths)) {
        free(occ);
        return 1;
    }

    text_bytes = (*numocc) * stride * sizeof(UCHAR);
    *snippet_text = (UCHAR *) malloc(text_bytes ? text_bytes : 1);
    if (!(*snippet_text)) {
        free(*snippet_lengths);
        *snippet_lengths = NULL;
        free(occ);
        return 1;
    }

    text_aux = *snippet_text;
    for (i = 0; i < (*numocc); i++) {
        x = occ[i];
        /* Clamp the snippet window to [0, SA->n - 1]. */
        if (x > numc) from = x - numc;
        else from = 0;
        to = ((int)(x+length+numc-1) < (int)(SA->n-1) ? (x+length+numc-1) : (SA->n-1));
        len = to - from + 1;

        /* csa_inverse() is given from+1 (it appears to take 1-based
           positions); the text is then read by following csa_psi(). */
        pos = csa_inverse(SA, from+1);
        for (j = 0; (int)j < (int)len; j++) {
            text_aux[j] = csa_T(SA, pos);
            pos = csa_psi(SA, pos);
        }
        text_aux += stride;
        (*snippet_lengths)[i] = len;
    }

    free(occ);
    return 0;
}
/*
 * csa_decode1line - extract the line of text around position suf.
 *
 * Decodes 2*maxlen characters starting maxlen positions before suf (or at
 * position 1 when suf <= maxlen), then trims the window at the nearest
 * newline (0x0a) on either side of suf.  At most maxlen characters are
 * copied to p, followed by a terminating 0 byte.
 *
 * Parameters:
 *   p      - output buffer; must hold at least maxlen+1 bytes.
 *   SA     - index structure used by csa_inverse/csa_T/csa_psi.
 *   suf    - text position of interest (appears to be 1-based).
 *   maxlen - maximum number of characters wanted.
 *
 * Exits the process via perror()/exit(1) if the scratch buffer cannot be
 * allocated.
 */
void csa_decode1line(unsigned char *p,CSA *SA,int suf,int maxlen)
{
    int i, k, m, pos;
    unsigned char *tmp;

    m = maxlen*2;
    tmp = (unsigned char *)malloc(m+1);
    if (tmp==NULL) {perror("csa_decode1line"); exit(1);}

    /* k is the first text position decoded into tmp[0]. */
    k = suf - maxlen;
    if (k <= 0) k = 1;
    pos = csa_inverse(SA,k);

    for (i = 0; i < m; i++) {
        tmp[i] = csa_T(SA,pos);
        pos = csa_psi(SA,pos);
    }

    /* Scan forward from suf for the end of the line. */
    for (i = suf-k; i < m; i++) {
        if (tmp[i] == 0x0a) {i--; break;}
    }
    /* NOTE(review): with the i-- above, the character just before the
       newline is excluded from the copy below - confirm this is intended. */
    m = i;

    /* Scan backward from suf for the start of the line. */
    for (i = suf-k; i >= 0; i--) {
        if (tmp[i] == 0x0a) {i++; break;}
    }
    /* No newline before suf: the loop leaves i at -1.  Start at the beginning
       of the buffer instead of reading tmp[-1]. */
    if (i < 0) i = 0;

    if (m-i > maxlen) i = m-maxlen;
    while (i < m) *p++ = tmp[i++];
    *p = 0;

    free(tmp);
}
/*
 * csa_decode2 - write len characters to p, starting at index pos.
 *
 * pos is used directly as the starting index (no csa_inverse() call); each
 * character comes from csa_T() and the next index from csa_psi().  No
 * terminator is appended.
 */
void csa_decode2(unsigned char *p,CSA *SA,int pos,int len)
{
    int k;

    for (k = 0; k < len; k++) {
        p[k] = csa_T(SA, pos);
        pos = csa_psi(SA, pos);
    }
}
/*
 * csa_lookup - compute the value for a single index i.
 *
 * Follows csa_psi() until the index is a multiple of SA->two, reads the
 * stored entry SA->SA[index / two], and subtracts the number of psi steps
 * that were needed to get there.
 */
int csa_lookup(CSA *SA, int i)
{
    int stride = SA->two;
    int hops;

    for (hops = 0; i % stride != 0; hops++)
        i = csa_psi(SA, i);

    return SA->SA[i / stride] - hops;
}
/*
 * csa_decode - write len characters of the text, starting at text position
 * suf, into p.
 *
 * The text position is first mapped to an index with csa_inverse(); the
 * characters are then produced by csa_T() while csa_psi() advances the
 * index.  No terminator is appended.
 */
void csa_decode(unsigned char *p,CSA *SA,int suf,int len)
{
    unsigned char *out = p;
    int rank = csa_inverse(SA, suf);
    int remaining;

    for (remaining = len; remaining > 0; remaining--) {
        *out++ = csa_T(SA, rank);
        rank = csa_psi(SA, rank);
    }
}
/*
 * csa_inverse - map text position suf to its index.
 *
 * SA->ISA holds one entry per block of SA->two2 positions.  The entry for
 * the block containing suf is read, then csa_psi() is applied once for every
 * position between the start of that block and suf.
 */
int csa_inverse(CSA *SA, int suf)
{
    int step = SA->two2;
    int block = (suf - 1) / step;
    int rank = SA->ISA[block];
    int at;

    /* block*step + 1 is the first text position covered by this ISA entry. */
    for (at = block * step + 1; at < suf; at++)
        rank = csa_psi(SA, rank);

    return rank;
}
/*
 * extract - copy the text range [from, to] into a newly allocated buffer.
 *
 * Parameters:
 *   index          - opaque handle, cast to CSA * here.
 *   from, to       - inclusive range; to is clamped to SA->n - 1.
 *   snippet        - out: malloc'ed buffer with the characters, or NULL when
 *                    the (clamped) range is empty.
 *   snippet_length - out: number of characters stored.
 *
 * Returns 0 on success and 1 if the buffer could not be allocated (in which
 * case *snippet is NULL and *snippet_length is left untouched).
 */
int extract(void *index, ULONG from, ULONG to, UCHAR **snippet, ULONG *snippet_length)
{
    CSA *SA = (CSA *) index;
    ULONG n = SA->n;
    ULONG count, k;
    int pos;

    if (to >= n)
        to = n - 1;

    /* Empty range: report it as a NULL snippet of length 0. */
    if (from > to) {
        *snippet = NULL;
        *snippet_length = 0;
        return 0;
    }

    count = to - from + 1;
    *snippet = (UCHAR *) malloc(count * sizeof(UCHAR));
    if (*snippet == NULL)
        return 1;

    /* csa_inverse() is given from+1; the text is then read via csa_psi(). */
    pos = csa_inverse(SA, from + 1);
    for (k = 0; k < count; k++) {
        (*snippet)[k] = csa_T(SA, pos);
        pos = csa_psi(SA, pos);
    }

    *snippet_length = count;
    return 0;
}
/*
 * extract - copy the text range [from, to] into a newly allocated buffer.
 *
 * to is clamped to SA->n - 1.  When the clamped range is empty, *snippet is
 * set to NULL and *snippet_length to 0.  Otherwise *snippet receives a
 * malloc'ed buffer of (to - from + 1) characters that the caller must free.
 *
 * Returns 0 on success, 1 if allocation failed (*snippet is then NULL and
 * *snippet_length is not written).
 */
int extract(void *index, ulong from, ulong to, uchar **snippet, ulong *snippet_length)
{
    CSA *SA = (CSA *) index;
    ulong n = SA->n;
    int pos;

    if (to >= n) {
        to = n - 1;
    }

    if (from <= to) {
        ulong len = to - from + 1;
        ulong done = 0;
        uchar *buf = (uchar *) malloc(len * sizeof(uchar));

        *snippet = buf;
        if (!buf) {
            return 1;
        }

        /* Map the (1-based, as passed here) start position to an index,
           then emit one character per csa_psi() step. */
        pos = csa_inverse(SA, from + 1);
        while (done < len) {
            buf[done++] = csa_T(SA, pos);
            pos = csa_psi(SA, pos);
        }
        *snippet_length = len;
    } else {
        *snippet = NULL;
        *snippet_length = 0;
    }

    return 0;
}
// Returns csa_psi() for index i, shifting the argument up by one and the
// result down by one (this wrapper appears to expose a 0-based interface
// over a 1-based csa_psi - confirm against csa_psi's definition).
size_t TextIndexCSA::getPsi(size_t i) const
{
    const size_t shifted = i + 1;
    return csa_psi(csa, shifted) - 1;
}
/*
 * csa_bsearch - backward search.
 *
 * Processes key from its last character to its first.  For each character
 * the candidate range is reset to that character's bucket (taken from
 * SA->C) and then narrowed against the range found for the previous step,
 * using the sampled entries in SA->R and the delta-coded stream SA->B read
 * through DECODENUM.
 *
 * Parameters:
 *   key, keylen - pattern bytes and their count (keylen must be >= 1, since
 *                 key[keylen-1] is read unconditionally).
 *   SA          - index structure (fields C, R, B, l and n are read).
 *   li, ri      - out: final range [*li, *ri]; it is empty when *li > *ri.
 *
 * Returns the number of trailing characters of key for which a non-empty
 * range was found (keylen when the whole pattern occurs).
 */
int csa_bsearch(unsigned char *key,int keylen,CSA *SA,int *li,int *ri)
{
    int ch, h;
    int lo, hi;                 /* current range */
    int prev_lo, prev_hi;       /* range from the previous (longer-suffix) step */
    int blk_lo, blk_hi, mid;    /* binary search over the samples in R */
    int blk, cur, val, bitpos, delta;
    int width, limit;
    int *samples;
    unsigned short *stream;
    int matched = 0;

    ch = key[keylen-1];
    hi = SA->C[ch];
    lo = (ch > 0) ? SA->C[ch-1] + 1 : 1;

    if (lo <= hi) {
        matched = 1;

        for (h = keylen-2; h >= 0; h--) {
            prev_lo = lo;
            prev_hi = hi;

            ch = key[h];
            hi = SA->C[ch];
            lo = (ch > 0) ? SA->C[ch-1] + 1 : 1;
            if (lo > hi) break;

            samples = SA->R;
            stream = SA->B;
            width = SA->l;
            limit = SA->n;

            /*
             * Upper bound: shrink hi to the last index whose decoded value
             * is <= prev_hi.  First locate the sample block by binary
             * search, then decode sequentially inside it.
             */
            blk_lo = lo / width + 1;
            blk_hi = hi / width;
            while (blk_lo <= blk_hi) {
                mid = (blk_lo + blk_hi) / 2;
                if (samples[mid*2] <= prev_hi) blk_lo = mid + 1;
                else blk_hi = mid - 1;
            }
            blk = blk_lo - 1;
            cur = blk * width;
            val = samples[blk*2];
            bitpos = samples[blk*2+1];

            /* Skip forward to lo.  A decoded value above n resets val to -1
               and does not advance the index (looks like an in-stream
               marker - confirm against the encoder). */
            while (cur < lo) {
                bitpos += DECODENUM(stream, bitpos, &delta);
                val += delta;
                if (val > limit) val = -1;
                else cur++;
            }
            while (val <= prev_hi && cur <= hi) {
                bitpos += DECODENUM(stream, bitpos, &delta);
                val += delta;
                cur++;
            }
            hi = cur - 1;

            /*
             * Lower bound: raise lo to the first index whose decoded value
             * is >= prev_lo, using the same sample-then-scan approach.
             */
            blk_lo = lo / width;
            blk_hi = hi / width;
            while (blk_lo <= blk_hi) {
                mid = (blk_lo + blk_hi) / 2;
                if (samples[mid*2] >= prev_lo) blk_hi = mid - 1;
                else blk_lo = mid + 1;
            }
            blk = blk_hi;
            cur = blk * width;
            val = samples[blk*2];
            bitpos = samples[blk*2+1];

            while (cur < lo) {
                bitpos += DECODENUM(stream, bitpos, &delta);
                val += delta;
                if (val > limit) val = -1;
                else cur++;
            }
            while (val < prev_lo && cur <= hi) {
                bitpos += DECODENUM(stream, bitpos, &delta);
                val += delta;
                cur++;
            }
            lo = cur;

            if (lo > hi) break;
            matched++;
        }
    }

    *li = lo;
    *ri = hi;
    return matched;
}
/*
 * csa_batchlookup3 - look up the values for every index in [l, r] and return
 * them sorted.
 *
 * Indices that are multiples of SA->two are answered directly from SA->SA.
 * All others are advanced together, one psi step per round, for at most len
 * rounds; whatever is still unresolved after that is finished individually
 * with csa_lookup().  The first psi step is obtained by decoding SA->B
 * sequentially (starting from the sample in SA->R) instead of calling
 * csa_psi() per index.
 *
 * Parameters:
 *   SA   - index structure (fields n, B, two, SA, l and R are read).
 *   l, r - inclusive index range.
 *   len  - maximum number of batched psi rounds.
 *
 * Returns a malloc'ed array I where I[0] is the number of results and
 * I[1..I[0]] are the values ordered by qsort() with intcompare.  The caller
 * must free() it.  Returns NULL if memory could not be allocated.
 */
int *csa_batchlookup3(CSA *SA,int l, int r,int len)
{
    int *I;             /* results; slot 0 is reserved for the count */
    int *P;             /* indices still being advanced */
    int v;              /* psi steps applied so far to the entries of P */
    int m;              /* number of results written */
    int q;              /* number of live entries in P */
    int i, j, k;
    int two, n, w;
    int b, d, x;
    int count;
    int *sa;
    unsigned short *B;

    n = SA->n;
    B = SA->B;
    two = SA->two;
    sa = SA->SA;
    w = SA->l;

    /* Treat an inverted range as empty instead of computing a negative size. */
    count = (r >= l) ? r - l + 1 : 0;

    I = (int *)malloc(((size_t)count + 1) * sizeof(*I));
    P = (int *)malloc(((size_t)count + 1) * sizeof(*P));
    if (I == NULL || P == NULL) {
        free(I);
        free(P);
        return NULL;
    }

    /* Position the decoder on index l: start at the block sample and decode
       l % w entries.  A decoded value above n resets x to -1 and is not
       counted as an entry. */
    x = SA->R[(l / w)*2];
    b = SA->R[(l / w)*2+1];
    j = l % w;
    k = 0;
    while (k < j) {
        b += DECODENUM(B,b,&d);
        x += d;
        if (x > n) x = -1;
        else k++;
    }

    /* First round: x is the decoded value for index i, i.e. one psi step
       has already been applied to everything placed in P. */
    m = 0;
    q = 0;
    for (i = l; i <= r; i++) {
        if (i % two == 0) {
            I[1+m] = sa[i / two];
            m++;
        } else {
            P[q++] = x;
        }
        b += DECODENUM(B,b,&d);
        x += d;
        if (x > n) {
            x = -1;
            b += DECODENUM(B,b,&d);
            x += d;
        }
    }
    v = 1;

    /* Batched rounds: resolve entries that hit a sample, advance the rest,
       compacting P in place. */
    while (q > 0 && v <= len) {
        k = 0;
        for (j = 0; j < q; j++) {
            i = P[j];
            if (i % two == 0) {
                I[1+m] = sa[i / two] - v;
                m++;
            } else {
                P[k++] = csa_psi(SA,i);
            }
        }
        q = k;
        v++;
    }

    /* Anything left after len rounds is resolved one by one. */
    for (j = 0; j < q; j++) {
        I[1+m] = csa_lookup(SA,P[j]) - v;
        m++;
    }

    qsort(I+1, count, sizeof(int), intcompare);
    I[0] = count;

    free(P);
    return I;
}