/*////////////////////
//Building the Index//
////////////////////*/
/*
 * Build a plain suffix-array index over text[0..length-1].
 *
 * build_options is an optional list of flags separated by ' ', '=' or ';':
 *   copy_text  the index keeps a private copy of the text instead of
 *              pointing at the caller's buffer;
 *   free_text  the caller's buffer is free()d once it has been copied
 *              (only accepted together with copy_text).
 *
 * On success the new TSA_Un is stored in *index and 0 is returned.
 * Error returns: 1 = out of memory, 5 = free_text given without copy_text.
 * On every error path the memory allocated here is released and *index is
 * left untouched.  With free_text the caller's buffer is released before the
 * suffix array is built, so it is gone even if a later allocation fails.
 */
int build_index(uchar *text, ulong length, char *build_options, void **index)
{
    ulong i;
    ulong *sa = NULL;          /* suffix array, handed over to the index on success */
    uchar *work = NULL;        /* padded copy of the text consumed by ds_ssort */
    long overshoot;
    TSA_Un *idx;
    char delimiters[] = " =;";
    int j, num_parameters;
    char **parameters;
    int copy_text = false;     /* don't copy text by default */
    int free_text = false;     /* don't free text by default */

    idx = (TSA_Un *) malloc(sizeof(TSA_Un));
    if (!idx)
        return 1;

    if (build_options != NULL) {
        parse_parameters(build_options, &num_parameters, &parameters, delimiters);
        for (j = 0; j < num_parameters; j++) {
            if (strcmp(parameters[j], "copy_text") == 0)
                copy_text = true;
            else if (strcmp(parameters[j], "free_text") == 0)
                free_text = true;
        }
        free_parameters(num_parameters, &parameters);
    }

    /* Consistency of parameters: freeing a text we still point at is invalid. */
    if (!copy_text && free_text) {
        free(idx);
        return 5;
    }

    if (!copy_text) {
        idx->text = text;
        idx->own = false;
    } else {
        idx->text = (uchar *) malloc(sizeof(uchar) * length);
        if (!idx->text) {
            free(idx);
            return 1;
        }
        for (i = 0; i < length; i++)
            idx->text[i] = text[i];
        idx->own = true;
    }
    if (free_text)
        free(text);
    idx->n = length;

    /* Make suffix array.  ds_ssort works on a buffer with `overshoot` extra
     * bytes after the text, hence the padded working copy. */
    overshoot = init_ds_ssort(500, 2000);
    sa = (ulong *) malloc(sizeof(ulong) * length);
    if (!sa)
        goto out_of_memory;
    work = (uchar *) malloc(sizeof(uchar) * (length + overshoot));
    if (!work)
        goto out_of_memory;
    for (i = 0; i < length; i++)
        work[i] = idx->text[i];
    ds_ssort(work, sa, idx->n);
    free(work);

    idx->pos = sa;
    (*index) = idx;
    return 0;

out_of_memory:
    free(sa);
    if (copy_text)
        free(idx->text);       /* only the private copy belongs to us */
    free(idx);
    return 1;
}
/*
 * Compute the suffix array of seq->seq with the deep-shallow sorter.
 *
 * The returned array holds seq->len + 1 entries and is obtained from
 * emalloc(); ds_ssort() is handed the array starting at entry 1, so entry 0
 * is not written by this function.  As a side effect seq->seq is
 * erealloc()ed to seq->len + overshoot bytes, the size used for the buffer
 * passed to ds_ssort().
 */
Int64 *getSuffixArray(Sequence *seq)
{
    Int64 *suffixes;
    Int64 pad;

    /* initialise the ds suffix sort routine (cf. DeepShallow/testlcp.c) */
    pad = init_ds_ssort(500, 2000);
    if (pad == 0) {
        eprintf("ERROR: ds initialization failed.\n");
    }

    suffixes = (Int64 *)emalloc((size_t)(seq->len + 1) * sizeof(Int64));

    /* grow the text in place so that it carries the overshoot padding */
    seq->seq = (char *)erealloc(seq->seq, (size_t)(seq->len + pad) * sizeof(char));

    ds_ssort((UChar *)seq->seq, suffixes + 1, seq->len);
    return suffixes;
}
/* *************************************************************** 1. Read Infile and store it to text[]. 2. Compute the bwt and the lcp arrays 3. Compute an optimal paritioning and compress *************************************************************** */ void compress_file(void) { double bwt_partition1(bwt_data *b, int *lcp); double pseudo_compr(uint8 *t, int size); uint8 *text; int *sa, n, overshoot, i, k, extra_bytes; int occ[ALPHA_SIZE], *lcp=NULL; bwt_data b; double start, end, estimate, apost_est=0, apost_est_tot=0; int bloque1,bloque2; // ----- init ds suffix sort routine ----- overshoot=init_ds_ssort(500,2000); if(overshoot==0) fatal_error("ds initialization failed! (compress_file)\n"); // ----- allocate text and suffix array ----- n = Infile_size; // length of input text sa=malloc((n+1)*sizeof *sa); // suffix array text=malloc((n+overshoot)*sizeof *text); // text if (! sa || ! text) out_of_mem("compress_file"); // ----- read text and build suffix array ------ rewind(Infile); i=fread(text, (size_t) 1, (size_t) n, Infile); if(i!=n) fatal_error("Error reading the input file!"); fprintf(stdout,"File size: %d bytes\n",n); // ----- build suffix array ---------------- start = getTime(); ds_ssort(text,sa+1,n); // sort suffixes end=getTime(); fprintf(stdout,"Suffix array construction: %.2f seconds\n",end-start); // ---- compute lcp using 6n algorithm --------- start = getTime(); for(i=0;i<ALPHA_SIZE;i++) occ[i]=0; for(i=0;i<n;i++) occ[text[i]]++; if( (b.bwt = (uint8 *) malloc(n+1)) == NULL) out_of_mem("bwtopt1_file"); _bw_sa2bwt(text, n, sa, &b); extra_bytes = _lcp_sa2lcp_6n(text,&b,sa,occ); lcp = sa; end=getTime(); fprintf(stdout,"lcp6 construction: %.2f seconds\n",end-start); fprintf(stdout,"Total memory for lcp6: %.2fn bytes\n", 6+(4.0*extra_bytes)/n); // ---- compute the optimal partition --------- start = getTime(); estimate = bwt_partition1(&b,lcp); end=getTime(); bloque1 = 0; bloque2 = lcp[0]; while ( bloque2 != n+1 ) { printf("[%7d,%7d]\n", bloque1, 
bloque2-1); for (i=bloque1; i<=bloque2-1; i++) printf("%d ", b.bwt[i]); printf("\n"); bloque1 = bloque2; bloque2 = lcp[bloque2]; } printf("[%7d,%7d]\n", bloque1,bloque2-1); fprintf(stdout,"Optimal partition computation: %.2f seconds\n",end-start); // ---- compress -------- for(k=i=0;i<=n; ) { assert(lcp[i]>i); // bwt[i] -> bwt[lcp[i]-1] is a segment if(Verbose>0) { apost_est_tot += apost_est = pseudo_compr(b.bwt+i,lcp[i]-i); if(Verbose>1) fprintf(stderr,"%d) %d <-> %d: %f bits\n",k,i,lcp[i]-1,apost_est); } i = lcp[i]; // starting point of next segment k++; // increase # of segment } assert(i==n+1); fprintf(stdout, "Number of partitions: %d\n",k); fprintf(stdout, "Estimated compressed size: %lf (not reliable)\n",estimate); if(Verbose>0) { fprintf(stdout, "A posteriori estimate: %lf ",apost_est_tot); fprintf(stdout, "Delta %lf (should be zero)\n",apost_est_tot-estimate); } free(b.bwt); free(text); free(sa); }
int main(int argc, char *argv[]) { void write_sa(char *filename, int *p, int n); void write_lcp(char *filename, UChar *x,int *p, int n); void write_bwt(char *filename, UChar *x,int *p, int n); void check_sa_ordering(UChar *x,int *p, int n, int); void print_sa_onscreen(UChar *x,int *p, int n, int); int c, *p, n; int print_sa, check_sa, num_opt,overshoot; UChar *x; clock_t end,start, end_real, start_real; struct tms r; double tot_time = 0.0; double tot_time_real = 0.0; extern char *optarg; extern int optind, opterr, optopt; char *fnam, *sa_filename; char *lcp_filename,*bwt_filename; // names for (optional) lcp and bwt files FILE *f; /* ------------ set default values ------------- */ set_global_variables(); print_sa=check_sa=0; sa_filename = NULL; lcp_filename = NULL; bwt_filename = NULL; /* ------------- read options from command line ----------- */ num_opt = opterr = 0; while ((c=getopt(argc, argv, "b:d:l:p:r:w:cvux:f:T:W:B:")) != -1) { switch (c) { case 'b': bwt_filename = optarg; break; case 'c': check_sa++; break; case 'd': Anchor_dist = atoi(optarg); break; case 'l': Shallow_limit = atoi(optarg); break; case 'p': print_sa = atoi(optarg); break; case 'x': _ds_Word_size = atoi(optarg); break; case 'v': _ds_Verbose++; break; case 'w': sa_filename = optarg; break; case 'f': Max_pseudo_anchor_offset = atoi(optarg); break; case 'r': B2g_ratio = atoi(optarg); break; case 'u': Update_anchor_ranks = 1; break; case 'T': Mk_qs_thresh = atoi(optarg); break; case 'W': lcp_filename = optarg; break; case 'B': Blind_sort_ratio = atoi(optarg); break; case '?': fprintf(stderr,"Unknown option: %c -main-\n", optopt); exit(1); } num_opt++; } if(optind<argc) fnam=argv[optind]; else { fprintf(stderr, "Usage:\n\t%s [-b bwtfile][-cuv][-d dist]",argv[0]); fprintf(stderr, "[-l len][-p num][-f maxoff][-r ratio]\n"); fprintf(stderr, "\t [-T thresh][-w safile][-W lcpfile][-x wsize][-B ratio]"); fprintf(stderr, " file\n\n"); fprintf(stderr,"\t-b bwtfile write bwt to bwtfile\n"); 
fprintf(stderr, "\t-B ratio blind_sort ratio [def. %d]\n",Blind_sort_ratio); fprintf(stderr,"\t-c check the sa (could be very slow)\n"); fprintf(stderr,"\t-d dist anchor distance [def. %d]\n",Anchor_dist); fprintf(stderr,"\t-f maxoff Maximum offset for forward "); fprintf(stderr,"pseudo-anchors [def. %d]\n",Max_pseudo_anchor_offset); fprintf(stderr, "\t-l len shallow sort limit [def. %d]\n",Shallow_limit); fprintf(stderr, "\t-r ratio bucket to group max ratio [def. %d]\n",B2g_ratio); fprintf(stderr,"\t-p num print num char of each suffix [def. 0]\n"); fprintf(stderr, "\t-T thresh Threshold for mk-qs [def. %d]\n", Mk_qs_thresh); fprintf(stderr,"\t-u updates anchor ranks in get_rank()\n"); fprintf(stderr,"\t-v produces a verbose output\n"); fprintf(stderr,"\t-w safile write sa to safile\n"); fprintf(stderr, "\t-W lcpfile check sa and write lcp to lcpfile (very slow)\n"); fprintf(stderr, "\t-x wsize word size in mkqs (default %d)\n\n",_ds_Word_size); return 0; } if(_ds_Verbose) { fprintf(stderr,"Command line: "); for(c=0;c<argc;c++) fprintf(stderr,"%s ",argv[c]); fprintf(stderr,"\n"); } /* -------- check parameters ------------- */ if(check_global_variables()) { exit(1); } /* ---------- open file and read text ----------- */ if (! (f=fopen(fnam, "rb"))) { perror(fnam); return 1; } if (fseek(f, 0L, SEEK_END)) { perror(fnam); return 1; } n=ftell(f); if (n==0) { fprintf(stderr, "%s: file empty\n", fnam); return 0; } // ------ allocate memory for text and sa ------- overshoot = compute_overshoot(); p=malloc((n)*sizeof *p); // sa x=malloc((n+overshoot)*sizeof *x); // text if (! p || ! 
x) { fprintf(stderr, "malloc failed\n"); return 1; } // ------------ read input text --------------- rewind(f); c=fread(x, (size_t) 1, (size_t) n, f); // lseek(fileno(f),0,SEEK_SET); // c=read(fileno(f), x, (size_t) n); if(c!=n) { fprintf(stderr,"Error in read() (%d vs %d) (main)\n",c,n); perror(fnam); return 1; } fclose(f); /* --------- start measuring time ------------- */ if(_ds_Verbose) fprintf(stderr,"Starting sa construction ... \n"); start_real = times(&r); start = (r.tms_utime+r.tms_stime); /* user + system */ ds_ssort(x, p, n); end_real = times(&r); end = (r.tms_utime+r.tms_stime); /* user + system */ // tot_time = ((double) (end-start))/CLK_TCK; //tot_time_real = ((double) (end_real-start_real))/CLK_TCK; printf("Elapsed time: %.2f seconds (user+sys). Total real time: %.2f.\n", tot_time, tot_time_real); // --------------- write bwt to a file if(bwt_filename!=NULL) write_bwt(bwt_filename,x,p,n); // --------------- write sa to a file if(sa_filename!=NULL) write_sa(sa_filename,p,n); // --------------- write lcp to a file if(lcp_filename!=NULL) write_lcp(lcp_filename,x,p,n); // ------------ check sa -------- if(check_sa) check_sa_ordering(x,p,n,check_sa); // ----- display sa ------- if(print_sa) print_sa_onscreen(x,p,n,print_sa); // deallocate and exit free(x); free(p); return 0; }
ESA build_ESA(char *pStr, int size, char *pAlphabet, char *pIgnore, int free_pStr) { // Check if the string includes a zero termination if(pStr[size] != '\0') { setError("The string MUST include a zero termination within the size\n"); if(free_pStr) free(pStr); return NULL; } initTimer(); int overshoot; ESA esa = malloc(sizeof(*esa)); if(!esa) { setError("Couldn't allocate memory for ESA.\n"); if(free_pStr) { free(pStr); freeTimer(); } return NULL; } unsigned char *text; int n = size + 1; // Include the zeroterninatin in the string // Calculate the overshoot overshoot=init_ds_ssort(500,2000); text = malloc((n + overshoot)*sizeof *text); if(!text) { setError("Couldn't allocate memory for translated text.\n"); free(esa); if(free_pStr) { free(pStr); freeTimer(); } return NULL; } // Translate the text and stop if it fails if(! translate(text, pStr, n-1, pAlphabet, pIgnore) ) { free(text); free(esa); if(free_pStr) free(pStr); freeTimer(); return NULL; } // Free pStr if possible if(free_pStr) free(pStr); // Save the text, alphabet and size in the esa structure setStr(esa, text); setSize(esa, n); setAlphabetSize(esa, strlen(pAlphabet)); setIgnoreAlphabetSize(esa, strlen(pIgnore)); setAlphabet(esa, pAlphabet); setIgnoreAlphabet(esa, pIgnore); addTimer("Initializing"); // Do the sorting, calc. lcp and calc. 
skip esa->suf = malloc(sizeof(int) * n); if(!esa->suf) { free(text); free(esa); freeTimer(); setError("Couldn't allocate memory for suffix column in suffix array.\n"); return NULL; } ds_ssort(esa->pStr, esa->suf, n, MAXPSSMSIZE); addTimer("DS-Sort"); esa->lcp = malloc(sizeof(unsigned char) * n); if(!esa->lcp) { setError("Couldn't allocate memory for LCP column in suffix array.\n"); free(esa->suf); free(text); free(esa); freeTimer(); return NULL; } calcLcpNaiv(esa); addTimer("Calc Lcp"); // The line below can be commented in to verify that there are "errors" in the suffix array // it will scan the array for errors and report the minimum depth at which an error was found // the last parameter specifies the max depth to search to). // As a side effect it calculates lcp (when used for this purpose the depth parameter should equa // that used when calling ds_ssort). // verifyNaively(esa, n, MAX_DEPTH); esa->skip = malloc(sizeof(int) * n); if(!esa->skip) { setError("Couldn't allocate memory for SKIP column in suffix array.\n"); free(esa->lcp); free(esa->suf); free(text); free(esa); freeTimer(); return NULL; } if(calcSkip(esa) == 0) { free(esa->skip); free(esa->lcp); free(esa->suf); free(text); free(esa); freeTimer(); return NULL; } addTimer("Calc Skip"); printTimer(); freeTimer(); return esa; }
/*//////////////////// //Building the Index// ////////////////////*/ int build_index(uchar *text, ulong length, char *build_options, void **index) { /*if (text[length-1]!='\0') return 2;*/ ulong i, *p, *sa_diff; long overshoot; TSA_Un *_index= (TSA_Un *) malloc(sizeof(TSA_Un)); uchar *x; FILE *f; char fnamext[1024]; char fnameaux[1024]; char delimiters[] = " =;"; int j,num_parameters; char ** parameters; int copy_text=false; /* don't copy text by default */ int free_text=false; /* don't free text by default */ int withload=false; /* don't load SA and BPE by default */ int samplerate=64; /* samplerate for bpe */ int max_phrase=256; bool verbose=false; double cutoff=100.0; bool SA_treap=true,SA_psi=false; if (!_index) return 1; if (build_options != NULL) { parse_parameters(build_options,&num_parameters, ¶meters, delimiters); for (j=0; j<num_parameters; j++) { if (strcmp(parameters[j], "copy_text") == 0 ) copy_text=true; else if (strcmp(parameters[j], "withload") == 0 ) withload=true; else if (strcmp(parameters[j], "filename") == 0 ) { strcpy(fnamext,parameters[j+1]); j++; } else if ((strcmp(parameters[j], "samplerate") == 0 ) && (j < num_parameters-1) ) { samplerate=atoi(parameters[j+1]); j++; } else if ((strcmp(parameters[j], "max_phrase") == 0 ) && (j < num_parameters-1) ) { max_phrase=atoi(parameters[j+1]); j++; } else if ((strcmp(parameters[j], "cutoff") == 0 ) && (j < num_parameters-1) ) { cutoff=atof(parameters[j+1]); j++; } else if (strcmp(parameters[j], "free_text") == 0 ) free_text=true; else if (strcmp(parameters[j], "verbose") == 0 ) verbose=true; else if (strcmp(parameters[j], "SA_treap") == 0 ) { SA_treap=true; SA_psi=false; } else if (strcmp(parameters[j], "SA_psi") == 0 ) { SA_treap=false; SA_psi=true; } } free_parameters(num_parameters, ¶meters); } //printf("samplerate = %lu\n",samplerate); /* Consistence of parameters */ if ((!copy_text) && (free_text)) return 5; /* */ if ( !copy_text ) { _index->text = text; _index->own=false; } else { _index->text 
= (uchar *) malloc(sizeof(uchar)*length); if (!_index->text) return 1; for (i=0; i<length; i++) _index->text[i]=text[i]; _index->own=true; } if ( free_text ) free(text); _index->n=length; /* Make suffix array */ if (withload) { ulong filename_len; p= (ulong *) malloc (sizeof(ulong)*(length)); if (!p) return 1; sprintf (fnameaux,"%s.sa",fnamext); f = fopen (fnameaux,"r"); if (fread (&filename_len,sizeof(ulong),1,f) != 1) return 25; assert(filename_len==_index->n); if (fread (p,sizeof(ulong),filename_len,f) != filename_len) return 25; if (fclose(f) != 0) return 28; } else { overshoot = init_ds_ssort(500, 2000); p= (ulong *) malloc (sizeof(ulong)*(length)); if (!p) return 1; x= (uchar *) malloc (sizeof(uchar)*(length+overshoot)); if (!x) return 1; for (i=0; i<length; i++) x[i]=_index->text[i]; ds_ssort( x, p, _index->n); free(x); } /* Make bpe */ if (withload && false ) { int error; sprintf (fnameaux,"%s.bpe",fnamext); f = fopen (fnameaux,"r"); _index->bpe = new BPE(f,&error); if (error !=0) return error; if (fclose(f) != 0) return 28; } else { if (SA_treap) { sa_diff= (ulong *) malloc (sizeof(ulong)*(length+3)); if (!sa_diff) return 1; for (i=0; i<length-1; i++) { assert(p[i+1]-p[i]+length>0); sa_diff[i+1]=p[i+1]-p[i]+length; } free(p); ulong maximo=0; for (i=0; i<length-1; i++) { if (maximo < sa_diff[i+1]) maximo=sa_diff[i+1]; } sa_diff[0]=maximo+1; sa_diff[length+1]=maximo+2; sa_diff[length+2]=maximo+3; _index->bpe = new BPE(sa_diff,length-1+3, max_phrase, cutoff, verbose); } if (SA_psi) { ulong *ip= (ulong *) malloc (sizeof(ulong)*(length)); for (i=0; i<length; i++) ip[p[i]] = i; for (i=0; i<length; i++) assert(ip[p[i]] == i); ulong *Psi= (ulong *) malloc (sizeof(ulong)*(length)); for (i=0; i<length; i++) if (p[i] == length-1) Psi[i] = ip[0]; else Psi[i] = ip[p[i]+1]; ulong ini=ip[0]; free(ip); sa_diff= (ulong *) malloc (sizeof(ulong)*length); if (!sa_diff) return 1; for (i=0; i<length-1; i++) { assert(p[i+1]-p[i]+length>0); sa_diff[i]=p[i+1]-p[i]+length; } 
free(p); _index->bpe = new BPE(sa_diff,Psi,ini,length-1,verbose); } } /* Make suffix array again */ if (withload) { ulong filename_len; p= (ulong *) malloc (sizeof(ulong)*(length)); if (!p) return 1; sprintf (fnameaux,"%s.sa",fnamext); f = fopen (fnameaux,"r"); if (fread (&filename_len,sizeof(ulong),1,f) != 1) return 25; assert(filename_len==_index->n); if (fread (p,sizeof(ulong),filename_len,f) != filename_len) return 25; if (fclose(f) != 0) return 28; } else { overshoot = init_ds_ssort(500, 2000); p= (ulong *) malloc (sizeof(ulong)*(length)); if (!p) return 1; x= (uchar *) malloc (sizeof(uchar)*(length+overshoot)); if (!x) return 1; for (i=0; i<length; i++) x[i]=_index->text[i]; ds_ssort( x, p, _index->n); free(x); } /* //////////////////////////////////////////////////// sa_diff= (ulong *) malloc (sizeof(ulong)*length); for (i=0;i<length-1;i++){ assert(p[i+1]-p[i]+length>0); sa_diff[i]=p[i+1]-p[i]+length; } ulong *z2; z2=_index->bpe->dispairall(); printf("Check SA_diff todo\n"); for (i=0;i<length-1;i++){ if (z2[i]-sa_diff[i] !=0) {printf("%lu, %lu %lu,%lu\n",i, z2[i]-sa_diff[i],z2[i],sa_diff[i]);fflush(stdout);} } printf("End Check SA_diff todo\n"); free(z2); printf("End Check SA_diff 2\n"); for (ulong mmm=1; mmm < 5000; mmm++) { printf("Check SA_diff %lu ",mmm); for (i=0;i<length-1-(mmm-1);i++){ z2=_index->bpe->dispair(i,mmm); //if (i % (n/10) == 0) {printf("C2 %lu\n",i);fflush(stdout);} for (ulong mm =1 ; mm <= mmm; mm++) if (z2[mm]-sa_diff[i+mm-1] !=0) {printf("T%lu %lu, %lu %lu,%lu\n",mm,i, z2[mm]-sa_diff[i+mm-1],z2[mm],sa_diff[i+mm-1]);fflush(stdout);} free(z2); } printf("End Check SA_diff %lu\n",mmm);fflush(stdout); } free(sa_diff); ///////////////////////////////////////////////////////// */ /* Make samplerate */ _index->samplerate = samplerate; _index->ns = (length-1)/samplerate+1; if (((length-1) % samplerate) != 0) _index->ns++; _index->pos = (ulong *) malloc (sizeof(ulong)*_index->ns); //_index->pos[0]=p[0]; j=0; for (i=0; i < length ; i+=samplerate) 
{ if (i != length-1) { //if (p[_index->bpe->BR->prev(i)] != _index->pos[j-1]) { _index->pos[j]=p[_index->bpe->BR->prev(i)]; j++; // } } else { _index->pos[j]=p[i]; j++; } } if (((length-1) % samplerate) != 0) _index->pos[j]=p[length-1]; _index->ns=j+1; /* _index->samplerate = samplerate; _index->ns = (length-1)/samplerate+1; if (((length-1) % samplerate) != 0) _index->ns++; _index->pos = (ulong *) malloc (sizeof(ulong)*_index->ns); j=0; for (i=0; i < length ; i+=samplerate) { _index->pos[j]=p[i]; j++; } if (((length-1) % samplerate) != 0) _index->pos[j]=p[length-1]; assert(j+1==_index->ns); */ free(p); (*index) = _index; return 0; }