/*
 * csa_load - reconstruct a CSA from a binary input stream.
 *
 * Allocates a CSA, runs csa_init() on it, initialises the rank and decode
 * tables, and then reads the fields back from `fp` in a fixed order:
 * m, two, two2, l, n, followed by SIGMA+2 entries of K, SIGMA+1 entries of C
 * and SIGMA+1 entries of C2.  When USE_MMAP is defined the size-prefixed
 * arrays B, R, SA and ISA (plus i_size) are read as well, and the mapp/mapi
 * handles are cleared to NULL.
 *
 * fp: stream positioned at the start of a serialised CSA; it is advanced
 *     past everything that is read.
 *
 * Returns the newly allocated CSA.  The allocation result is not checked.
 */
CSA *csa_load(ifstream & fp)
{
    CSA *csa = (CSA *) malloc(sizeof(CSA));
    csa_init(csa);

    initranktables();
    mkdecodetable();

    /* Scalar header; the read order is the on-disk layout. */
    csa->m    = loadValue<int>(fp);
    csa->two  = loadValue<int>(fp);
    csa->two2 = loadValue<int>(fp);
    csa->l    = loadValue<int>(fp);
    csa->n    = loadValue<int>(fp);

    /* Fixed-size tables, one int per entry. */
    int pos = 0;
    while (pos < (SIGMA + 2)) {
        csa->K[pos] = loadValue<int>(fp);
        ++pos;
    }

    pos = 0;
    while (pos < (SIGMA + 1)) {
        csa->C[pos] = loadValue<int>(fp);
        ++pos;
    }

    pos = 0;
    while (pos < (SIGMA + 1)) {
        csa->C2[pos] = loadValue<int>(fp);
        ++pos;
    }

#ifdef USE_MMAP
    /* Each array is preceded by its element count. */
    csa->p_size = loadValue<int>(fp);
    csa->B = loadValue<unsigned short>(fp, csa->p_size);

    csa->i_size = loadValue<int>(fp);

    csa->r_size = loadValue<int>(fp);
    csa->R = loadValue<int>(fp, csa->r_size);

    csa->sa_size = loadValue<int>(fp);
    csa->SA = loadValue<int>(fp, csa->sa_size);

    csa->isa_size = loadValue<int>(fp);
    csa->ISA = loadValue<int>(fp, csa->isa_size);

    /* Nothing is memory-mapped on this path. */
    csa->mapp = NULL;
    csa->mapi = NULL;
#endif

    return csa;
}
/*
 * load_index - load a compressed suffix array from its on-disk files.
 *
 * Builds the names "<filename>.psi" and "<filename>.idx", allocates a CSA,
 * initialises the rank and decode tables and passes both names to csa_read().
 *
 * The file names are built in heap buffers sized from strlen(filename), so
 * arbitrarily long paths are handled (the previous fixed 128-byte stack
 * buffers overflowed for paths longer than 123 characters).
 *
 * filename: base path of the index, without extension.
 * index:    out-parameter; receives the allocated CSA on success, or NULL
 *           when an allocation fails.
 *
 * Returns 0 on success and a non-zero value if memory could not be
 * allocated.
 * NOTE(review): the specific non-zero code (1) is arbitrary -- map it to the
 * project's error table if one exists.
 */
int load_index(char *filename, void **index)
{
    /* Both suffixes are four characters; sizeof counts the trailing NUL. */
    size_t name_len = strlen(filename) + sizeof(".psi");
    char *fname1 = (char *) malloc(name_len);
    char *fname2 = (char *) malloc(name_len);
    CSA *SA = (CSA *) malloc(sizeof(CSA));

    if (fname1 == NULL || fname2 == NULL || SA == NULL) {
        free(fname1);
        free(fname2);
        free(SA);
        (*index) = NULL;
        return 1;
    }

    initranktables();
    mkdecodetable();

    snprintf(fname1, name_len, "%s.psi", filename);
    snprintf(fname2, name_len, "%s.idx", filename);

    csa_read(SA, fname1, fname2);

    /*
     * The original passed stack buffers, so csa_read() cannot rely on the
     * names outliving this call; releasing them here is equivalent.
     */
    free(fname1);
    free(fname2);

    (*index) = SA;
    return 0;
}
/*
 * psi12_makeindex - build the on-disk psi index (".pxi" / ".pxd") for a CSA.
 *
 * Steps, in order:
 *   1. Derive the file names <fname>.bw, <fname>.lst and, for the
 *      ID_DIFF_GAMMA_RR format, <fname>.pxi (encoded bit stream) and
 *      <fname>.pxd (header and per-block directory).
 *   2. Convert the BW data to psi values in a temporary file via bw_to_psi()
 *      and open it as a diskbuf.
 *   3. Write the header to the .pxd file: ID_PSI (1 byte), k (1 byte),
 *      n (k bytes), L (k bytes), id (1 byte).
 *   4. For every block of L psi values, record the block's first value and
 *      the current word offset `b` (either into two SPARSEARRAYs when
 *      ID_COMPPTR is set, or as raw k-byte integers in the .pxd file), then
 *      encode the remaining values of the block into Btmp as differences,
 *      with runs of difference 1 collapsed to a run length, and append the
 *      resulting 16-bit words to the .pxi file.
 *   5. Close the files and reload the index through psi1_read().
 *
 * csa:   CSA whose psi_struc (a psi1) supplies id and L and receives last/k.
 * fname: base file name, without extension.
 *
 * Returns the byte count accumulated in psize1 + psize2 (the SPARSEARRAY
 * data written under ID_COMPPTR is not added to it).
 *
 * Terminates the process on an unsupported psi id, when L >= n, or when an
 * output file cannot be opened or closed.
 * NOTE(review): the L >= n case exits with status 0, as in the original;
 * confirm whether a failure status is wanted there.
 */
i64 psi12_makeindex(CSA *csa, char *fname)
{
    i64 psize,psize1,psize2;
    i64 b, b2;
    i64 i,j,x,xx;
    i64 y,d,w;
    int k;
    FILE *f1,*f2;
    char *fpsi, *fpsd;
    i64 runlen;
    i64 n,L;
    psi1 *ps;
    int id,id2;
    FILE *out;
    diskbuf *psi;
    char *fbw, *flst;
    SPARSEARRAY sx, sb;
    int mm;

    ps = (psi1 *)csa->psi_struc;
    id = ps->id;
    id2 = id & 0x3f;

    /* The longest suffix is four characters, plus the terminating NUL. */
    k = strlen(fname);
    fbw = mymalloc(k+5);
    flst = mymalloc(k+5);
    fpsi = mymalloc(k+5);
    fpsd = mymalloc(k+5);

    sprintf(fbw,"%s.bw",fname);
    sprintf(flst,"%s.lst",fname);

    switch (id2) {
    case ID_DIFF_GAMMA_RR:
        sprintf(fpsi,"%s.pxi",fname);
        sprintf(fpsd,"%s.pxd",fname);
        break;
    default:
        /* Without this, fpsi/fpsd would reach fopen() uninitialised. */
        printf("psi12_makeindex: ID %d is not supported.\n",id);
        exit(1);
    }

    /* bw_to_psi() stores through &k; that value is then given to open_diskbuf(). */
    out = create_tmp(0);
    bw_to_psi(out,csa,fbw,flst,&k);
    psi = open_diskbuf(out,k);
    ps->last = getint_diskbuf(psi,0);
    printf("last = %ld\n",ps->last);

    n = csa->n;
    L = ps->L;
    if (L >= n) {
        printf("L=%ld >= n=%ld\n",L,n);
        exit(0);
    }

    mkdecodetable();

    f1 = fopen(fpsi,"wb");
    if (f1 == NULL) {
        perror(fpsi);
        exit(1);
    }
    psize1 = 0;

    f2 = fopen(fpsd,"wb");
    if (f2 == NULL) {
        perror(fpsd);
        exit(1);
    }
    psize2 = 0;

    /* From here on k is the integer width in bytes used in the header. */
    ps->k = k = (blog(n+1)+1+8-1)/8;

    writeint(1,ID_PSI,f2);
    writeint(1,k,f2); /* #bytes of integer */
    writeint(k,n,f2);
    writeint(k,L,f2);
    psize2 += 1+1+2*k;

    writeint(1,id,f2);
    psize2 += 1;

    if (id & ID_COMPPTR) {
        /* mm counts the symbols whose C[] entry is positive. */
        mm = 0;
        for (i=0; i<SIGMA; i++) {
            if (csa->C[i] > 0) mm++;
        }
        SPARSEARRAY_construct_init(&sx, (mm+1)*(n+1), n/L+1);
        SPARSEARRAY_construct_init(&sb, n, n/L+1);
    }

    b = b2 = 0;
    mm = 0;
    xx = 0;
    for (j=0; j<=n/L; j++) {
        if (j % 100000 == 0) {
            printf("%ld %1.3f bpc\r",j,(double)psize2*8/(j+1)/L);
            fflush(stdout);
        }

        /* First value of the block: stored in the directory, not encoded. */
        y = getint_diskbuf(psi,j*L);
        if (id & ID_COMPPTR) {
            /* A non-increasing sample bumps mm so the stored keys keep growing. */
            if (y <= xx) {
                mm++;
            }
            SPARSEARRAY_construct_set(&sx, j, mm*(n+1) + y);
            SPARSEARRAY_construct_set(&sb, j, b);
            xx = y;
        } else {
            writeint(k,y,f2);
            writeint(k,b,f2);
            psize2 += 2*k;
        }

        x = y;
        runlen = 0;
        b2 = 0;
        for (i=j*L+1; i<(j+1)*L && i <= n; i++) { /* psi[j*L] are not encoded */
            y = getint_diskbuf(psi,i);
            d = y - x;
            if (d <= 0) {
                d += n+1;
            }
            if (i == j*L+1) {
                /* Leading flag bit: 1 if the block starts inside a run of d == 1. */
                if (d == 1) {setbit(Btmp,b2+1,1); runlen = 1;}
                else {setbit(Btmp,b2+1,0); runlen = 0;}
                b2++;
            }
            if (d > 1) {
                /* Flush the pending run first, then emit the gap d-1. */
                if (runlen>0) {
                    w = ENCODENUM(Btmp,b2,runlen);
                    b2 += w;
                    runlen = 0;
                }
                w = ENCODENUM(Btmp,b2,d-1);
                b2 += w;
                runlen = 1;
            } else {
                runlen++;
            }
            x = y;
        }
        if (runlen>0) {
            w = ENCODENUM(Btmp,b2,runlen);
            b2 += w;
            runlen = 0;
        }

        /* Round the block up to whole 16-bit words; b is the running word offset. */
        fwrite(Btmp,(b2+15) / 16,sizeof(short),f1);
        psize1 += (b2+15)/16*sizeof(short);
        b += (b2+15) / 16;
        b2 = 0;
    }
    /*
     * b2 is 0 on every path that reaches this point (it is initialised to 0
     * and reset at the end of each iteration); kept for parity with the
     * original code.
     */
    if (b2 > 0) {
        fwrite(Btmp,(b2+15) / 16,sizeof(short),f1);
        psize1 += (b2+15)/16*sizeof(short);
    }
    fwrite(Btmp,1,sizeof(short),f1); /* padding: getbitD reads one extra word */
    psize1 += 1*sizeof(short);

    if (id & ID_COMPPTR) {
        SPARSEARRAY_construct_end(&sx, SDARRAY_SELECT1);
        SPARSEARRAY_construct_end(&sb, SDARRAY_SELECT1);
        SPARSEARRAY_write(&sx, f2);
        SPARSEARRAY_write(&sb, f2);
    }

    psize = psize1 + psize2;
    printf("size %ld (%1.3f bpc)\n",psize,(double)psize*8 / n);

    /* psi1_read() maps these files next, so a failed flush must not pass silently. */
    if (fclose(f1) != 0) {
        perror(fpsi);
        exit(1);
    }
    if (fclose(f2) != 0) {
        perror(fpsd);
        exit(1);
    }

    close_diskbuf(psi);
    fclose(out);
    remove_tmp(0);

    psi1_read(csa, fpsd);

    free(fpsi);
    free(fpsd);
    free(fbw);
    free(flst);

    return psize;
}
/*
 * psi1_read - map a psi index from disk and attach it to a CSA.
 *
 * `fname` names the directory file; its last four characters are treated as
 * the extension and stripped to obtain the base name.  The directory file is
 * mapped with mymmap() and its header is parsed in this order: ID_PSI
 * (1 byte), k (1 byte), n (k bytes), L (k bytes), psi id (1 byte).  The low
 * six bits of the psi id select the encoding and thereby the extension of the
 * bit-stream file (".psi", ".pri" or ".pxi"), which is mapped as well.
 *
 * When ID_COMPPTR is set in the id, two SPARSEARRAYs are read from the data
 * following the header; otherwise ps->R points at that data directly.
 *
 * Finally the psi accessors and the default CSA operations are installed on
 * `csa`.
 *
 * csa:   CSA to populate; csa->psi_struc is replaced with a new psi1.
 * fname: path of the directory file, including its four-character extension.
 *
 * Returns the sum of the lengths of the two mappings.
 *
 * Terminates the process if the name is shorter than four characters, if
 * either file cannot be mapped, if the file does not start with ID_PSI, or
 * if the encoding id is not one of the supported formats.
 */
i64 psi1_read(CSA *csa, char *fname)
{
    i64 psize1,psize2;
    i64 n;
    int k,l,id,id2;
    int len;
    char *fpsi, *fpsd, *fname2;
    psi1 *ps;
    uchar *p;

    csa->psi_struc = ps = mymalloc(sizeof(psi1));

    len = strlen(fname);
    if (len < 4) {
        /* Stripping the extension would need a negative length. */
        printf("psi1_read: file name %s is too short.\n",fname);
        exit(1);
    }

    /* fname2 = fname without its four-character extension. */
    fname2 = mymalloc(len-4+1);
    strncpy(fname2,fname,len-4);
    fname2[len-4] = 0;

    initranktables();
    mkdecodetable();

    /* Base name plus a four-character extension plus NUL. */
    fpsi = mymalloc(len+1);
    fpsd = fname;

    ps->mappsd = mymmap(fpsd);
    if (ps->mappsd == NULL || ps->mappsd->addr==NULL) {
        perror("psi1_read: mmap2\n");
        exit(1);
    }
    p = (uchar *)ps->mappsd->addr;
    psize1 = ps->mappsd->len;

    id = getuint(p,0,1);
    p += 1;
    if (id != ID_PSI) {
        printf("read_psi: id = %d is not supported.\n",id);
        exit(1);
    }
    ps->k = k = getuint(p,0,1);
    p += 1;
    ps->n = n = getuint(p,0,k);
    p += k;
    ps->L = l = getuint(p,0,k);
    p += k;

    id = getuint(p,0,1);
    p += 1;
    csa->id = ps->id = id;
    id2 = id & 0x3f;

    switch (id2) {
    case ID_DIFF_GAMMA:
        printf("#psi format = GAMMA L=%d C=%d\n",l,(id>>7));
        sprintf(fpsi,"%s.psi",fname2);
        break;
    case ID_DIFF_GAMMA_RL:
        printf("#psi format = GAMMA_RL L=%d C=%d\n",l,(id>>7));
        sprintf(fpsi,"%s.pri",fname2);
        break;
    case ID_DIFF_GAMMA_SPARSE:
        printf("#psi format = GAMMA_SPARSE L=%d C=%d\n",l,(id>>7));
        sprintf(fpsi,"%s.psi",fname2);
        break;
    case ID_DIFF_GAMMA_RL_SPARSE:
        printf("#psi format = GAMMA_RL_SPARSE L=%d C=%d\n",l,(id>>7));
        sprintf(fpsi,"%s.pri",fname2);
        break;
    case ID_DIFF_GAMMA_RR:
        printf("#psi format = GAMMA_RR L=%d C=%d\n",l,(id>>7));
        sprintf(fpsi,"%s.pxi",fname2);
        break;
    default:
        /* fpsi has no defined content here; stop instead of mapping garbage. */
        printf("read_csa: ID %d is not supported.\n",id);
        exit(1);
    }

    if (id & ID_COMPPTR) {
        printf("COMPPTR\n");
        ps->sx = mymalloc(sizeof(SPARSEARRAY));
        ps->sb = mymalloc(sizeof(SPARSEARRAY));
        SPARSEARRAY_read(ps->sx, &p);
        SPARSEARRAY_read(ps->sb, &p);
    } else {
        ps->R = p;
    }

    /* Map the encoded bit stream. */
    ps->mappsi = mymmap(fpsi);
    if (ps->mappsi == NULL || ps->mappsi->addr==NULL) {
        perror("psi1_read: mmap1\n");
        exit(1);
    }
    ps->B = (unsigned short *)ps->mappsi->addr;
    psize2 = ps->mappsi->len;

    ps->psize = psize1 + psize2;

    free(fpsi);
    free(fname2);

    /* Format-specific accessors. */
    csa->psi = psi1_psi;
    if (id2 == ID_DIFF_GAMMA_RR) {
        csa->psi = psi12_psi;
        csa->psi_pred = csa_psi_pred_naive;
        csa->psi_succ = csa_psi_succ_naive;
    } else if (id & ID_COMPPTR) {
        csa->psi_pred = csa_psi_pred_naive;
        csa->psi_succ = csa_psi_succ_naive;
    } else {
        csa->psi_succ = psi1_succ_tmp;
        csa->psi_pred = psi1_pred_tmp;
    }

    /* Default operations. */
    csa->LF = csa_LF_by_psi;
    csa->lookup = csa_lookup;
    csa->inverse = csa_inverse;
    csa->text = csa_text;
    csa->substring = csa_substring;
    csa->T = csa_T;
    csa->head = csa_head_rank;
    csa->search = csa_search;
    csa->searchsub = csa_searchsub;

    return psize1 + psize2;
}