/*////////////////////
//Building the Index//
////////////////////*/
/*
 * Builds a suffix-array index over text[0..length-1] and hands it back
 * through *index.
 *
 * build_options is an optional string of tokens separated by ' ', '=' or ';'.
 * Recognised tokens:
 *   copy_text  - the index keeps its own copy of the text
 *   free_text  - text is free()d once it has been copied (requires copy_text)
 *
 * Returns 0 on success, 1 when an allocation fails and 5 when free_text is
 * requested without copy_text.  On failure *index is left untouched and
 * everything allocated by this call is released.
 */
int build_index(uchar *text, ulong length, char *build_options, void **index)
{
    /*if (text[length-1]!='\0') return 2;*/
    TSA_Un *_index = (TSA_Un *) malloc(sizeof(TSA_Un));
    ulong *p = NULL;
    uchar *x = NULL;
    long overshoot;
    char delimiters[] = " =;";
    int j, num_parameters;
    char **parameters;
    int copy_text = false; /* don't copy text by default */
    int free_text = false; /* don't free text by default */
    int status = 1;        /* allocation failure unless stated otherwise */

    if (!_index)
        return 1;
    /* Give the cleanup path a well-defined state to look at. */
    _index->text = NULL;
    _index->own = false;

    if (build_options != NULL) {
        parse_parameters(build_options, &num_parameters, &parameters, delimiters);
        for (j = 0; j < num_parameters; j++) {
            if (strcmp(parameters[j], "copy_text") == 0)
                copy_text = true;
            else if (strcmp(parameters[j], "free_text") == 0)
                free_text = true;
        }
        free_parameters(num_parameters, &parameters);
    }

    /* Consistence of parameters */
    if ((!copy_text) && (free_text)) {
        status = 5;
        goto fail;
    }

    if (!copy_text) {
        _index->text = text;
        _index->own = false;
    } else {
        _index->text = (uchar *) malloc(sizeof(uchar) * length);
        if (!_index->text)
            goto fail;
        memcpy(_index->text, text, length);
        _index->own = true;
    }
    if (free_text)
        free(text);
    _index->n = length;

    /* Make suffix array */
    /* NOTE(review): other callers of init_ds_ssort in this code base treat a
       return value of 0 as an initialisation failure; it is not checked here
       because no error code for that case is visible -- confirm. */
    overshoot = init_ds_ssort(500, 2000);
    p = (ulong *) malloc(sizeof(ulong) * (length));
    if (!p)
        goto fail;
    /* The sorter works on a scratch copy padded with `overshoot` bytes. */
    x = (uchar *) malloc(sizeof(uchar) * (length + overshoot));
    if (!x)
        goto fail;
    memcpy(x, _index->text, length);
    ds_ssort(x, p, _index->n);
    free(x);

    _index->pos = p;
    (*index) = _index;
    return 0;

fail:
    free(x);
    free(p);
    if (_index->own)
        free(_index->text); /* only the private copy, never the caller's buffer */
    free(_index);
    return status;
}
/*
 * Computes a suffix array for seq->seq using the deep-shallow sorter.
 *
 * The sequence buffer is re-allocated to seq->len + overshoot bytes, where
 * overshoot is the value returned by init_ds_ssort (cf.
 * DeepShallow/testlcp.c).  The returned array has seq->len + 1 slots; the
 * sorter is handed the array starting at slot 1, so slot 0 is not passed to
 * it by this function.  The caller receives ownership of the returned array.
 */
Int64 *getSuffixArray(Sequence *seq)
{
    Int64 *suffixes;
    Int64 padding;

    /* init ds suffix sort routine */
    padding = init_ds_ssort(500, 2000);
    if (!padding)
        eprintf("ERROR: ds initialization failed.\n");

    suffixes = (Int64 *)emalloc((size_t)(seq->len + 1) * sizeof(Int64));

    /* grow the text so the sorter has its padding past the last symbol */
    seq->seq = (char *)erealloc(seq->seq, (size_t)(seq->len + padding) * sizeof(char));

    ds_ssort((UChar *)seq->seq, &suffixes[1], seq->len);
    return suffixes;
}
/* *************************************************************** 1. Read Infile and store it to text[]. 2. Compute the bwt and the lcp arrays 3. Compute an optimal paritioning and compress *************************************************************** */ void compress_file(void) { double bwt_partition1(bwt_data *b, int *lcp); double pseudo_compr(uint8 *t, int size); uint8 *text; int *sa, n, overshoot, i, k, extra_bytes; int occ[ALPHA_SIZE], *lcp=NULL; bwt_data b; double start, end, estimate, apost_est=0, apost_est_tot=0; int bloque1,bloque2; // ----- init ds suffix sort routine ----- overshoot=init_ds_ssort(500,2000); if(overshoot==0) fatal_error("ds initialization failed! (compress_file)\n"); // ----- allocate text and suffix array ----- n = Infile_size; // length of input text sa=malloc((n+1)*sizeof *sa); // suffix array text=malloc((n+overshoot)*sizeof *text); // text if (! sa || ! text) out_of_mem("compress_file"); // ----- read text and build suffix array ------ rewind(Infile); i=fread(text, (size_t) 1, (size_t) n, Infile); if(i!=n) fatal_error("Error reading the input file!"); fprintf(stdout,"File size: %d bytes\n",n); // ----- build suffix array ---------------- start = getTime(); ds_ssort(text,sa+1,n); // sort suffixes end=getTime(); fprintf(stdout,"Suffix array construction: %.2f seconds\n",end-start); // ---- compute lcp using 6n algorithm --------- start = getTime(); for(i=0;i<ALPHA_SIZE;i++) occ[i]=0; for(i=0;i<n;i++) occ[text[i]]++; if( (b.bwt = (uint8 *) malloc(n+1)) == NULL) out_of_mem("bwtopt1_file"); _bw_sa2bwt(text, n, sa, &b); extra_bytes = _lcp_sa2lcp_6n(text,&b,sa,occ); lcp = sa; end=getTime(); fprintf(stdout,"lcp6 construction: %.2f seconds\n",end-start); fprintf(stdout,"Total memory for lcp6: %.2fn bytes\n", 6+(4.0*extra_bytes)/n); // ---- compute the optimal partition --------- start = getTime(); estimate = bwt_partition1(&b,lcp); end=getTime(); bloque1 = 0; bloque2 = lcp[0]; while ( bloque2 != n+1 ) { printf("[%7d,%7d]\n", bloque1, 
bloque2-1); for (i=bloque1; i<=bloque2-1; i++) printf("%d ", b.bwt[i]); printf("\n"); bloque1 = bloque2; bloque2 = lcp[bloque2]; } printf("[%7d,%7d]\n", bloque1,bloque2-1); fprintf(stdout,"Optimal partition computation: %.2f seconds\n",end-start); // ---- compress -------- for(k=i=0;i<=n; ) { assert(lcp[i]>i); // bwt[i] -> bwt[lcp[i]-1] is a segment if(Verbose>0) { apost_est_tot += apost_est = pseudo_compr(b.bwt+i,lcp[i]-i); if(Verbose>1) fprintf(stderr,"%d) %d <-> %d: %f bits\n",k,i,lcp[i]-1,apost_est); } i = lcp[i]; // starting point of next segment k++; // increase # of segment } assert(i==n+1); fprintf(stdout, "Number of partitions: %d\n",k); fprintf(stdout, "Estimated compressed size: %lf (not reliable)\n",estimate); if(Verbose>0) { fprintf(stdout, "A posteriori estimate: %lf ",apost_est_tot); fprintf(stdout, "Delta %lf (should be zero)\n",apost_est_tot-estimate); } free(b.bwt); free(text); free(sa); }
ESA read_ESA_from_file(char *pFileName, unsigned char **ppExtraData, int *pDataRead) { struct ESAFileFormat ff; int overshoot; ESA esa = malloc(sizeof(*esa)); if(!esa) { setError("Couldn't allocate memory for ESA.\n"); return NULL; } unsigned char *text; FILE *f; int n; f = fopen(pFileName, "r"); if(!f) { char st[512]; sprintf(st, "Could not open '%s' for reading.", pFileName); setError(st); free(esa); return NULL; } if(fread(&ff, sizeof(struct ESAFileFormat), 1, f) != 1) { setError("An error occurred reading the file."); free(esa); fclose(f); return NULL; } if(strncmp(ff.ID, HEADERNAME, HEADERLENGTH) != 0) { setError("Header name mismatch in ESA structure file."); free(esa); fclose(f); return NULL; } if(ff.major != MAJOR_VERSION) { setError("Incompatible version of the ESA structure file."); free(esa); fclose(f); return NULL; } esa->alphabetSize = ff.alphabetSize; esa->alphabet = malloc( (esa->alphabetSize + 1) * sizeof(char)); if(!esa->alphabet) { setError("Couldn't allocate space for alphabet."); free(esa); fclose(f); return NULL; } strncpy(esa->alphabet, ff.alphabet, ff.alphabetSize); esa->alphabet[esa->alphabetSize] = '\0'; esa->ignoreAlphabetSize = ff.ignoreAlphabetSize; esa->ignoreAlphabet = malloc( (esa->ignoreAlphabetSize + 1) * sizeof(char)); if(!esa->ignoreAlphabet) { setError("Couldn't allocate space for ignore alphabet."); free(esa->alphabet); free(esa); fclose(f); return NULL; } strncpy(esa->ignoreAlphabet, ff.ignoreAlphabet, ff.ignoreAlphabetSize); esa->ignoreAlphabet[esa->ignoreAlphabetSize] = '\0'; n = esa->size = ff.size; overshoot=init_ds_ssort(500,2000); text=malloc((n + overshoot)*sizeof *text); if(!text) { setError("Couldn't allocate space for text."); free(esa->alphabet); free(esa->ignoreAlphabet); free(esa); fclose(f); return NULL; } if(fread(text, sizeof(unsigned char), ff.size+1, f) != ff.size+1) { setError("Couldn't read text from file."); free(text); free(esa->alphabet); free(esa->ignoreAlphabet); free(esa); fclose(f); return NULL; } 
esa->pStr = text; esa->suf = malloc(sizeof(int) * n); if(!esa->suf) { setError("Couldn't allocate memory for suf column in suffix array."); free(text); free(esa->alphabet); free(esa->ignoreAlphabet); free(esa); fclose(f); return NULL; } if(fread(esa->suf, sizeof(int), ff.size, f) != ff.size) { setError("Couldn't read suffix column from file."); free(esa->suf); free(text); free(esa->alphabet); free(esa->ignoreAlphabet); free(esa); fclose(f); return NULL; } esa->lcp = malloc(sizeof(unsigned char) * n); if(!esa->lcp) { setError("Couldn't allocate memory for lcp column in suffix array."); free(esa->suf); free(text); free(esa->alphabet); free(esa->ignoreAlphabet); free(esa); fclose(f); return NULL; } if(fread(esa->lcp, sizeof(unsigned char), ff.size, f) != ff.size) { setError("Couldn't read lcp from file."); free(esa->lcp); free(esa->suf); free(text); free(esa->alphabet); free(esa->ignoreAlphabet); free(esa); fclose(f); return NULL; } esa->skip = malloc(sizeof(int) * n); if(!esa->skip) { setError("Couldn't allocate space for skip column in suffix array."); free(esa->lcp); free(esa->suf); free(text); free(esa->alphabet); free(esa->ignoreAlphabet); free(esa); fclose(f); return NULL; } if(calcSkip(esa) == 0) { free(esa->skip); free(esa->lcp); free(esa->suf); free(text); free(esa->alphabet); free(esa->ignoreAlphabet); free(esa); fclose(f); return NULL; } // Read extra data if it was asked for and if it is present. 
if(ppExtraData != NULL) { *pDataRead = ff.nExtraData; if(*pDataRead != -1) { *ppExtraData = malloc(sizeof(unsigned char) * ff.nExtraData); if(*ppExtraData == NULL) { setError("Error - couldn't allocate space for extra data.\n"); free(esa->skip); free(esa->lcp); free(esa->suf); free(text); free(esa->alphabet); free(esa->ignoreAlphabet); free(esa); fclose(f); return NULL; } if(fread(*ppExtraData, sizeof(unsigned char), ff.nExtraData, f) != (unsigned int)ff.nExtraData) { setError("Error reading extra data.\n"); free(*ppExtraData); free(esa->skip); free(esa->lcp); free(esa->suf); free(text); free(esa->alphabet); free(esa->ignoreAlphabet); free(esa); fclose(f); return NULL; } } else { *ppExtraData = NULL; } } fclose(f); return esa; }
ESA build_ESA(char *pStr, int size, char *pAlphabet, char *pIgnore, int free_pStr) { // Check if the string includes a zero termination if(pStr[size] != '\0') { setError("The string MUST include a zero termination within the size\n"); if(free_pStr) free(pStr); return NULL; } initTimer(); int overshoot; ESA esa = malloc(sizeof(*esa)); if(!esa) { setError("Couldn't allocate memory for ESA.\n"); if(free_pStr) { free(pStr); freeTimer(); } return NULL; } unsigned char *text; int n = size + 1; // Include the zeroterninatin in the string // Calculate the overshoot overshoot=init_ds_ssort(500,2000); text = malloc((n + overshoot)*sizeof *text); if(!text) { setError("Couldn't allocate memory for translated text.\n"); free(esa); if(free_pStr) { free(pStr); freeTimer(); } return NULL; } // Translate the text and stop if it fails if(! translate(text, pStr, n-1, pAlphabet, pIgnore) ) { free(text); free(esa); if(free_pStr) free(pStr); freeTimer(); return NULL; } // Free pStr if possible if(free_pStr) free(pStr); // Save the text, alphabet and size in the esa structure setStr(esa, text); setSize(esa, n); setAlphabetSize(esa, strlen(pAlphabet)); setIgnoreAlphabetSize(esa, strlen(pIgnore)); setAlphabet(esa, pAlphabet); setIgnoreAlphabet(esa, pIgnore); addTimer("Initializing"); // Do the sorting, calc. lcp and calc. 
skip esa->suf = malloc(sizeof(int) * n); if(!esa->suf) { free(text); free(esa); freeTimer(); setError("Couldn't allocate memory for suffix column in suffix array.\n"); return NULL; } ds_ssort(esa->pStr, esa->suf, n, MAXPSSMSIZE); addTimer("DS-Sort"); esa->lcp = malloc(sizeof(unsigned char) * n); if(!esa->lcp) { setError("Couldn't allocate memory for LCP column in suffix array.\n"); free(esa->suf); free(text); free(esa); freeTimer(); return NULL; } calcLcpNaiv(esa); addTimer("Calc Lcp"); // The line below can be commented in to verify that there are "errors" in the suffix array // it will scan the array for errors and report the minimum depth at which an error was found // the last parameter specifies the max depth to search to). // As a side effect it calculates lcp (when used for this purpose the depth parameter should equa // that used when calling ds_ssort). // verifyNaively(esa, n, MAX_DEPTH); esa->skip = malloc(sizeof(int) * n); if(!esa->skip) { setError("Couldn't allocate memory for SKIP column in suffix array.\n"); free(esa->lcp); free(esa->suf); free(text); free(esa); freeTimer(); return NULL; } if(calcSkip(esa) == 0) { free(esa->skip); free(esa->lcp); free(esa->suf); free(text); free(esa); freeTimer(); return NULL; } addTimer("Calc Skip"); printTimer(); freeTimer(); return esa; }
int main(int argc, char* argv[]) { char datafname[FNAMEBUFSIZE]; char offsetfname[FNAMEBUFSIZE]; int i = 0; int dataLen; int numSamples; int overshoot; unsigned char* data; int* offsets; TokenVectorHndl tvHead, tvTemp; if( argc < 4 ){ fprintf( stderr, "Error: Usage 'token <datadir> <min_token_len> <min_occ_ratio>'\n" ); exit(-1); } snprintf(datafname,FNAMEBUFSIZE,"%s/data",argv[1]); snprintf(offsetfname,FNAMEBUFSIZE,"%s/offsets",argv[1]); unsigned int min_token_len=atoi(argv[2]); double min_occ_ratio=atof(argv[3]); dataLen = get_filesize(datafname); numSamples = (int)(get_filesize(offsetfname) / sizeof(int)) ; overshoot = init_ds_ssort( 500, 2000 ); data = (unsigned char*)calloc( dataLen+overshoot, sizeof(unsigned char) ); offsets = (int*)calloc( numSamples + 1, sizeof(int) ); if( !loadSamplesDataIntoMemory( numSamples, data, offsets, datafname, offsetfname ) ){ fprintf(stderr, "%s: Error loading samples into memory.\n", __FUNCTION__); return -1; } offsets[numSamples] = dataLen; // fprintf(stderr, "DONE!\n"); tvHead = TokenExtraction( data, dataLen, offsets, numSamples, min_token_len, (unsigned int)(numSamples*min_occ_ratio)); tvTemp=tvHead; while( tvTemp != NULL ){ //printf("TOKEN:'"); printf("'"); print_str( tvTemp->string, tvTemp->strLen ); printf("'\t"); //printf("TOKENVECTOR "); //for( i = 0; i < numSamples; i++ ) //{ // if( tvTemp->occuranceVector[i] != 0 ){ // printf("%d:%d ", i, tvTemp->occuranceVector[i] ); // } //} //printf("\n"); tvTemp = tvTemp->next; } DeleteTokenVectorList(tvHead); return 0; }
/*//////////////////// //Building the Index// ////////////////////*/ int build_index(uchar *text, ulong length, char *build_options, void **index) { /*if (text[length-1]!='\0') return 2;*/ ulong i, *p, *sa_diff; long overshoot; TSA_Un *_index= (TSA_Un *) malloc(sizeof(TSA_Un)); uchar *x; FILE *f; char fnamext[1024]; char fnameaux[1024]; char delimiters[] = " =;"; int j,num_parameters; char ** parameters; int copy_text=false; /* don't copy text by default */ int free_text=false; /* don't free text by default */ int withload=false; /* don't load SA and BPE by default */ int samplerate=64; /* samplerate for bpe */ int max_phrase=256; bool verbose=false; double cutoff=100.0; bool SA_treap=true,SA_psi=false; if (!_index) return 1; if (build_options != NULL) { parse_parameters(build_options,&num_parameters, ¶meters, delimiters); for (j=0; j<num_parameters; j++) { if (strcmp(parameters[j], "copy_text") == 0 ) copy_text=true; else if (strcmp(parameters[j], "withload") == 0 ) withload=true; else if (strcmp(parameters[j], "filename") == 0 ) { strcpy(fnamext,parameters[j+1]); j++; } else if ((strcmp(parameters[j], "samplerate") == 0 ) && (j < num_parameters-1) ) { samplerate=atoi(parameters[j+1]); j++; } else if ((strcmp(parameters[j], "max_phrase") == 0 ) && (j < num_parameters-1) ) { max_phrase=atoi(parameters[j+1]); j++; } else if ((strcmp(parameters[j], "cutoff") == 0 ) && (j < num_parameters-1) ) { cutoff=atof(parameters[j+1]); j++; } else if (strcmp(parameters[j], "free_text") == 0 ) free_text=true; else if (strcmp(parameters[j], "verbose") == 0 ) verbose=true; else if (strcmp(parameters[j], "SA_treap") == 0 ) { SA_treap=true; SA_psi=false; } else if (strcmp(parameters[j], "SA_psi") == 0 ) { SA_treap=false; SA_psi=true; } } free_parameters(num_parameters, ¶meters); } //printf("samplerate = %lu\n",samplerate); /* Consistence of parameters */ if ((!copy_text) && (free_text)) return 5; /* */ if ( !copy_text ) { _index->text = text; _index->own=false; } else { _index->text 
= (uchar *) malloc(sizeof(uchar)*length); if (!_index->text) return 1; for (i=0; i<length; i++) _index->text[i]=text[i]; _index->own=true; } if ( free_text ) free(text); _index->n=length; /* Make suffix array */ if (withload) { ulong filename_len; p= (ulong *) malloc (sizeof(ulong)*(length)); if (!p) return 1; sprintf (fnameaux,"%s.sa",fnamext); f = fopen (fnameaux,"r"); if (fread (&filename_len,sizeof(ulong),1,f) != 1) return 25; assert(filename_len==_index->n); if (fread (p,sizeof(ulong),filename_len,f) != filename_len) return 25; if (fclose(f) != 0) return 28; } else { overshoot = init_ds_ssort(500, 2000); p= (ulong *) malloc (sizeof(ulong)*(length)); if (!p) return 1; x= (uchar *) malloc (sizeof(uchar)*(length+overshoot)); if (!x) return 1; for (i=0; i<length; i++) x[i]=_index->text[i]; ds_ssort( x, p, _index->n); free(x); } /* Make bpe */ if (withload && false ) { int error; sprintf (fnameaux,"%s.bpe",fnamext); f = fopen (fnameaux,"r"); _index->bpe = new BPE(f,&error); if (error !=0) return error; if (fclose(f) != 0) return 28; } else { if (SA_treap) { sa_diff= (ulong *) malloc (sizeof(ulong)*(length+3)); if (!sa_diff) return 1; for (i=0; i<length-1; i++) { assert(p[i+1]-p[i]+length>0); sa_diff[i+1]=p[i+1]-p[i]+length; } free(p); ulong maximo=0; for (i=0; i<length-1; i++) { if (maximo < sa_diff[i+1]) maximo=sa_diff[i+1]; } sa_diff[0]=maximo+1; sa_diff[length+1]=maximo+2; sa_diff[length+2]=maximo+3; _index->bpe = new BPE(sa_diff,length-1+3, max_phrase, cutoff, verbose); } if (SA_psi) { ulong *ip= (ulong *) malloc (sizeof(ulong)*(length)); for (i=0; i<length; i++) ip[p[i]] = i; for (i=0; i<length; i++) assert(ip[p[i]] == i); ulong *Psi= (ulong *) malloc (sizeof(ulong)*(length)); for (i=0; i<length; i++) if (p[i] == length-1) Psi[i] = ip[0]; else Psi[i] = ip[p[i]+1]; ulong ini=ip[0]; free(ip); sa_diff= (ulong *) malloc (sizeof(ulong)*length); if (!sa_diff) return 1; for (i=0; i<length-1; i++) { assert(p[i+1]-p[i]+length>0); sa_diff[i]=p[i+1]-p[i]+length; } 
free(p); _index->bpe = new BPE(sa_diff,Psi,ini,length-1,verbose); } } /* Make suffix array again */ if (withload) { ulong filename_len; p= (ulong *) malloc (sizeof(ulong)*(length)); if (!p) return 1; sprintf (fnameaux,"%s.sa",fnamext); f = fopen (fnameaux,"r"); if (fread (&filename_len,sizeof(ulong),1,f) != 1) return 25; assert(filename_len==_index->n); if (fread (p,sizeof(ulong),filename_len,f) != filename_len) return 25; if (fclose(f) != 0) return 28; } else { overshoot = init_ds_ssort(500, 2000); p= (ulong *) malloc (sizeof(ulong)*(length)); if (!p) return 1; x= (uchar *) malloc (sizeof(uchar)*(length+overshoot)); if (!x) return 1; for (i=0; i<length; i++) x[i]=_index->text[i]; ds_ssort( x, p, _index->n); free(x); } /* //////////////////////////////////////////////////// sa_diff= (ulong *) malloc (sizeof(ulong)*length); for (i=0;i<length-1;i++){ assert(p[i+1]-p[i]+length>0); sa_diff[i]=p[i+1]-p[i]+length; } ulong *z2; z2=_index->bpe->dispairall(); printf("Check SA_diff todo\n"); for (i=0;i<length-1;i++){ if (z2[i]-sa_diff[i] !=0) {printf("%lu, %lu %lu,%lu\n",i, z2[i]-sa_diff[i],z2[i],sa_diff[i]);fflush(stdout);} } printf("End Check SA_diff todo\n"); free(z2); printf("End Check SA_diff 2\n"); for (ulong mmm=1; mmm < 5000; mmm++) { printf("Check SA_diff %lu ",mmm); for (i=0;i<length-1-(mmm-1);i++){ z2=_index->bpe->dispair(i,mmm); //if (i % (n/10) == 0) {printf("C2 %lu\n",i);fflush(stdout);} for (ulong mm =1 ; mm <= mmm; mm++) if (z2[mm]-sa_diff[i+mm-1] !=0) {printf("T%lu %lu, %lu %lu,%lu\n",mm,i, z2[mm]-sa_diff[i+mm-1],z2[mm],sa_diff[i+mm-1]);fflush(stdout);} free(z2); } printf("End Check SA_diff %lu\n",mmm);fflush(stdout); } free(sa_diff); ///////////////////////////////////////////////////////// */ /* Make samplerate */ _index->samplerate = samplerate; _index->ns = (length-1)/samplerate+1; if (((length-1) % samplerate) != 0) _index->ns++; _index->pos = (ulong *) malloc (sizeof(ulong)*_index->ns); //_index->pos[0]=p[0]; j=0; for (i=0; i < length ; i+=samplerate) 
{ if (i != length-1) { //if (p[_index->bpe->BR->prev(i)] != _index->pos[j-1]) { _index->pos[j]=p[_index->bpe->BR->prev(i)]; j++; // } } else { _index->pos[j]=p[i]; j++; } } if (((length-1) % samplerate) != 0) _index->pos[j]=p[length-1]; _index->ns=j+1; /* _index->samplerate = samplerate; _index->ns = (length-1)/samplerate+1; if (((length-1) % samplerate) != 0) _index->ns++; _index->pos = (ulong *) malloc (sizeof(ulong)*_index->ns); j=0; for (i=0; i < length ; i+=samplerate) { _index->pos[j]=p[i]; j++; } if (((length-1) % samplerate) != 0) _index->pos[j]=p[length-1]; assert(j+1==_index->ns); */ free(p); (*index) = _index; return 0; }