lzma_index_stream_size(const lzma_index *i) { // Stream Header + Blocks + Index + Stream Footer return LZMA_STREAM_HEADER_SIZE + i->total_size + index_size(i->count, i->index_list_size) + LZMA_STREAM_HEADER_SIZE; }
/*
 * Rewrite the encrypted index stored at `path` so that it reflects `entry`
 * being associated with `id`.
 *
 * Every existing term of `idx` is compared against the attribute values of
 * `entry`:
 *   - a term matching an attribute value is written with `append == true`
 *     and that attribute is marked SKIP so it is not written a second time;
 *   - a non-matching term whose single id equals `id` is dropped and the
 *     stored term count is decremented;
 *   - any other term is written with `append == false`.
 * Attributes still flagged WRITE afterwards are emitted as new terms.
 * The term count is stored in network byte order, then the data box is
 * encrypted with the index key and the key box with `kek`.
 *
 * Returns false if scan_attrs() or mmfile() fail, otherwise the result of
 * mmsync().
 */
bool update_index(char *path, idx *idx, uint8_t *kek, kdfp *kdfp, uint8_t *id, entry *entry) {
    size_t size = index_size(idx);

    /* scan_attrs() may adjust `size`; it must run before the file is mapped. */
    uint8_t *flags = scan_attrs(entry, &size);
    if (!flags) return false;   /* fail before touching the file at all */

    /* Advance to the next 1024-byte boundary (adds a full 1024 if already aligned). */
    size += 1024 - (size % 1024);

    void *addr = mmfile(path, &size);
    if (!addr) {
        free(flags);            /* previously leaked on this path */
        return false;
    }

    box     *kbox = BOX_PTR(addr, KDFP_LEN);
    box     *data = BOX_PTR(kbox, BOX_LEN(KEY_LEN));
    uint8_t *key  = BOX_DATA(kbox);

    uint32_t *count  = (uint32_t *) BOX_DATA(data);
    uint8_t  *cursor = BOX_DATA(data) + sizeof(uint32_t);

    *count = idx->count;

    /* Pass 1: carry over the existing terms. */
    for (uint32_t i = 0; i < idx->count; i++) {
        term *term  = &idx->terms[i];
        bool append = false;

        for (uint32_t j = 0; !append && j < entry->count; j++) {
            string *val = &entry->attrs[j].val;
            if (flags[j] == SKIP) continue;
            if (val->len == term->len && !memcmp(val->str, term->str, val->len)) {
                flags[j] = SKIP;
                append   = true;
                break;
            }
        }

        /* Term referenced only by `id` and no longer present in `entry`: drop it. */
        if (!append && term->count == 1 && !memcmp(term->ids, id, ID_LEN)) {
            (*count)--;
            continue;
        }

        cursor = write_term(cursor, term, id, append);
    }

    /* Pass 2: attributes that matched no existing term become new terms. */
    for (uint32_t i = 0; i < entry->count; i++) {
        string *val = &entry->attrs[i].val;
        if (flags[i] == WRITE) {
            term term = { .len = val->len, .str = val->str };
            cursor = write_term(cursor, &term, id, true);
            (*count)++;
        }
    }

    *count = htonl(*count);
    free(flags);

    write_kdfp(addr, kdfp);
    memcpy(key, idx->key, KEY_LEN);

    encrypt_box(key, data, INDEX_LEN(size));
    encrypt_box(kek, kbox, KEY_LEN);

    return mmsync(path, addr, size);
}
int main(int argc, char *argv[]) { char *infile, *outfile; uchar *text; char *params = NULL; ulong text_len; void *index; int error, i; double start, end; if (argc < 3) print_usage(argv[0]); if (argc > 3) { int nchars, len; nchars = argc-3; for(i=2;i<argc;i++) nchars += strlen(argv[i]); params = (char *) malloc((nchars+1)*sizeof(char)); params[nchars] = '\0'; nchars = 0; for(i=3;i<argc;i++) { len = strlen(argv[i]); strncpy(params+nchars,argv[i],len); params[nchars+len] = ' '; nchars += len+1; } params[nchars-1] = '\0'; } infile = argv[1]; outfile = argv[2]; start = getTime(); error = read_file(infile, &text, &text_len); IFERROR(error); error = build_index(text, text_len, params, &index); IFERROR(error); error = save_index(index, outfile); IFERROR(error); end = getTime(); fprintf(stderr, "Building time: %.3f secs\n", end-start ); ulong index_len; index_size(index, &index_len); fprintf(stdout,"Input: %lu bytes --> Output %lu bytes.\n", text_len, index_len); fprintf(stdout,"Overall compression --> %.2f%% (%.2f bits per char).\n\n", (100.0*index_len)/text_len, (index_len*8.0)/text_len); error = free_index(index); IFERROR(error); exit(0); }
lzma_index_file_size(const lzma_index *i) { // If multiple Streams are concatenated, the Stream Header, Index, // and Stream Footer fields of all but the last Stream are already // included in old.streams_size. Thus, we need to calculate only the // size of the last Index, not all Indexes. return i->old.streams_size + LZMA_STREAM_HEADER_SIZE + i->total_size + index_size(i->count - i->old.count, i->index_list_size - i->old.index_list_size) + LZMA_STREAM_HEADER_SIZE; }
/** returns detailled info of the data structures of the index **/ int index_info(void *index, char msg[]) { twcsa *wcsa=(twcsa *)index; char msgt[1000]; int error; uint sizeil; error = size_il(wcsa->ils, &sizeil); long sizeL = (long) size_il; if (sizeL <0){ printf("\n\n [WARNING!! size is being reported badly (%ld bytes!) due to IL_postings_list_technique\n",sizeL); } IFERRORIL(error); ulong indexs; index_size(index,&indexs); ulong Text_length; get_length(index, &Text_length); uint totalVoc=0; //Vocabulary of words. totalVoc += ((((wcsa->nwords+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint)); //the pointers totalVoc += wcsa->wordsData.wordsZoneMem.size * sizeof(byte); //the characters of the words. uint sizerepres; //Compressed representation of the source text size_representation(wcsa->ct, &sizerepres); sprintf(msg,"\n\tSpace statistics for II-words-DocOriented\n"); sprintf (msgt,"\t tindex structure = %lu bytes.\n",(ulong)sizeof(twcsa) ); strcat(msg,msgt); sprintf (msgt,"\t number of diff words= %u\n",wcsa->nwords ); strcat(msg,msgt); sprintf (msgt,"\t number of docs (ndocs)= %u\n",wcsa->ndocs ); strcat(msg,msgt); sprintf (msgt,"\t vocabulary of words = %u bytes.\n", totalVoc ); strcat(msg,msgt); sprintf (msgt,"\t Inverted list-structure = %u bytes.\n",sizeil ); strcat(msg,msgt); sprintf (msgt,"\t Text (compressed representation) = %u bytes.\n", sizerepres); strcat(msg,msgt); sprintf (msgt,"\t Whole index = %lu bytes ", indexs); strcat(msg,msgt); sprintf (msgt," ** Ratio = %2.3f %% **\n ", 100.0*indexs / Text_length); strcat(msg,msgt); fprintf(stderr,"\n\tSpace statistics for II-words-DocOriented\n %s",msg); return 0; }
/** returns detailled info of the data structures of the index **/ int index_info(void *index, char msg[]) { twcsa *wcsa=(twcsa *)index; char msgt[1000]; uint sizeil; int error = size_il(wcsa->ils, &sizeil); IFERRORIL(error); ulong indexs; index_size(index,&indexs); ulong Text_length; get_length(index, &Text_length); uint totalVoc=0; //Vocabulary of words. totalVoc += ((((wcsa->nwords+1)* (wcsa->wordsData.elemSize))+WI32-1) /WI32) * (sizeof(uint)); //the pointers totalVoc += wcsa->wordsData.wordsZoneMem.size * sizeof(byte); //the characters of the words. uint size_doc_offsets_sids = sizeof (uint) * (wcsa->ndocs+1); uint sizerepres; //Compressed representation of the source text size_representation(wcsa->ct, &sizerepres); sprintf(msg,"\n\tSpace statistics for II-words-DocOriented\n"); sprintf (msgt,"\t tindex structure = %lu bytes.\n",(ulong)sizeof(twcsa) ); strcat(msg,msgt); sprintf (msgt,"\t number of diff words= %u\n",wcsa->nwords ); strcat(msg,msgt); sprintf (msgt,"\t number of docs (ndocs)= %u\n",wcsa->ndocs ); strcat(msg,msgt); sprintf (msgt,"\t vocabulary of words = %u bytes.\n", totalVoc ); strcat(msg,msgt); sprintf (msgt,"\t Mapping Docs-2-wordposition = %u bytes.\n", size_doc_offsets_sids); strcat(msg,msgt); sprintf (msgt,"\t Inverted list-structure = %u bytes.\n",sizeil ); strcat(msg,msgt); sprintf (msgt,"\t Text (compressed representation) = %u bytes.\n", sizerepres); strcat(msg,msgt); sprintf (msgt,"\t Whole index = %lu bytes ", indexs); strcat(msg,msgt); sprintf (msgt," ** Ratio = %2.3f %% **\n ", 100.0*indexs / Text_length); strcat(msg,msgt); fprintf(stderr,"\n\tSpace statistics for II-words-DocOriented\n %s",msg); return 0; }
/// Computes the size of the file up to and including the Stream described
/// by the arguments:
///
///     earlier Streams and Stream Paddings (compressed_base)
///     + Stream Header + Blocks + Index + Stream Footer + Stream Padding
///
/// Returns LZMA_VLI_UNKNOWN if the running total exceeds LZMA_VLI_MAX at
/// either step; this can happen with a too big unpadded_sum when the
/// function is used from lzma_index_append().
static lzma_vli
index_file_size(lzma_vli compressed_base, lzma_vli unpadded_sum,
		lzma_vli record_count, lzma_vli index_list_size,
		lzma_vli stream_padding)
{
	// Everything except the Index field.
	lzma_vli total = compressed_base + 2 * LZMA_STREAM_HEADER_SIZE
			+ stream_padding + vli_ceil4(unpadded_sum);

	if (total <= LZMA_VLI_MAX) {
		// Now add the Index field and re-check the limit.
		total += index_size(record_count, index_list_size);
		if (total <= LZMA_VLI_MAX)
			return total;
	}

	return LZMA_VLI_UNKNOWN;
}
/* Prints the combined size of the encoded-sequence index files of
   <indexname> as a percentage of the combined size of the input files. */
static void show_encoded_statistics(GtStrArray *infiles,
                                    const char *indexname)
{
  /* suffixes of the files that make up the encoded sequence, in the order
     in which they are sized */
  const char *const suffixes[] = {
    GT_ALPHABETFILESUFFIX,
    GT_ENCSEQFILESUFFIX,
    GT_SSPTABFILESUFFIX,
    GT_DESTABFILESUFFIX,
    GT_SDSTABFILESUFFIX,
    GT_OISTABFILESUFFIX
  };
  off_t original_bytes = 0,
        encoded_bytes = 0;
  double ratio;
  unsigned int k;
  int i;

  gt_assert(infiles);

  for (i = 0; i < gt_str_array_size(infiles); i++)
    original_bytes += gt_file_size(gt_str_array_get(infiles, i));

  for (k = 0; k < sizeof suffixes / sizeof suffixes[0]; k++)
    encoded_bytes += index_size(indexname, suffixes[k]);

  ratio = (double) encoded_bytes / original_bytes;
  printf("encoded sequence file(s) are %.1f%% of original file size\n",
         ratio * 100.0);
}
lzma_index_size(const lzma_index *i) { return index_size(i->count, i->index_list_size); }
int main(int argc, char *argv[]) { char *infile, *outfile; uchar *text; ulong text_len; void *index; int error; if (argc < 3) print_usage(argv[0]); infile = argv[1]; // input file outfile = argv[2]; // output file error = read_file(infile, &text, &text_len); IFERROR(error); /* Possible options: "-a x": indicates the behaviour of FM-index with the pointer 'text'. x == 0 FM-index uses 'text' directly to build the suffix array. This means that you are responsable to allocate 'length+overshoot' bytes for the text instead of 'length' bytes. You must include ds_ssort.h. See function read_file(); x == 1 FM-index frees the allocated memory for the 'text'. overshoot and ds_ssort.h are not necessary. x == 2 FM-index makes its internal copy of 'text'. After the call, 'text' is available. overshoot and ds_ssort.h are not necessary. -B Bsize: where Bsize is the size in Kbytes of level 1 buckets. -b bsize: where bsize is the size in bytes of level 2 buckets. bsize must divide Bsize*1024; -f frequency: where frequency is a number from 0 to 1 that indicates the frequency of the marked characters. default "-b 512 -B 16 -f 0.02 -a 1" Example of some call to build_index(): - build_index(text, text_len, NULL, &index); uses the default parameters. - build_index(text, text_len, "-a 1 -f 0.1", &index); tries to mark 10% of the positions instead of 2% but I cannot reuse 'text' after this call. 
*/ fprintf(stdout, "Building\n"); error = build_index(text, text_len, "-a 1", &index); IFERROR(error); ulong index_len; index_size(index, &index_len); fprintf(stdout,"Input: %lu bytes --> Output %lu bytes.\n", text_len, index_len); fprintf(stdout,"Overall compression --> %.2f%% (%.2f bits per char).\n\n", (100.0*index_len)/text_len, (index_len*8.0)/text_len); uchar *snippet; ulong i, readen, from = 11, to = 100, numocc, *occ; error = extract(index, from, to, &snippet, &readen); IFERROR(error); fprintf(stdout, "try extract\n\n"); for(i=0;i<readen;i++) printf("%c", snippet[i]); printf("\n"); uchar *pattern = snippet; fprintf(stdout, "try count\n\n"); error = count (index, pattern, 5, &numocc); printf("pattern: "); fwrite(pattern, sizeof(uchar), 5, stdout); printf(" # occs %lu\n\n",numocc); fprintf(stdout, "try locate\n\n"); error = locate (index, pattern, 5, &occ, &numocc); IFERROR (error); for(i=0;i<numocc;i++) printf("pos %lu\n", occ[i]); printf("\n"); free(snippet); if(numocc) free(occ); error = save_index(index, outfile); IFERROR(error); error = free_index(index); IFERROR(error); exit(0); }
size_t TextIndexCSA::getSize() const{ ulong size; index_size(csa, &size); return (size_t)size; }
/** *********************************************************************************** CONSTRUCTION OF THE INDEX, from a given text file "inbasename". ***********************************************************************************/ int build_WordIndex_from_postings (char *inbasename, char *build_options, void **index){ twcsa *wcsa; wcsa = (twcsa *) malloc (sizeof (twcsa) * 1); *index = wcsa; void *Index = *index; wcsa->text = NULL; double t0, t1; t0 = getSYSTimeBF(); /** processing the parameters of the index:: blockSize, and q-gram-len (q) */ { char delimiters[] = " =;"; int j,num_parameters; char ** parameters; if (build_options != NULL) { parse_parameters_II(build_options,&num_parameters, ¶meters, delimiters); for (j=0; j<num_parameters;j++) { if ((strcmp(parameters[j], "blocksize") == 0 ) && (j < num_parameters-1) ) { //wcsa->blockSize = atoi(parameters[j+1]) * BLOCK_MULTIPLIER; j++; } else if ((strcmp(parameters[j], "qgram") == 0 ) && (j < num_parameters-1) ) { //wcsa->q =atoi(parameters[j+1]); j++; } else if ((strcmp(parameters[j], "path2repaircompressor") == 0 ) && (j < num_parameters-1) ) { //strcpy(path2repaircompressor,parameters[j+1]); j++; } } free_parameters_II(num_parameters, ¶meters); } //fprintf(stderr,"\n Parameters of II-blocks:: *basename = %s, blocksize = %d, q= %d",inbasename,wcsa->blockSize,wcsa->q); //fprintf(stderr,"\n \t path2repaircompressor= %s\n",path2repaircompressor); } wcsa->freqs=NULL; /** 0 ** Inicializes the arrays used to detect if a char is valid or not. **/ StartValid(); /** 1 ** loads the vocabulary of words. zonewords, words vector, nwords */ t1 = getSYSTimeBF(); loadVocabulary (Index, inbasename); { uint totaltmp=0; //words totaltmp += ((((wcsa->nwords+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint)); //the pointers totaltmp += wcsa->wordsData.wordsZoneMem.size * sizeof(byte); //the characters of the words. 
fprintf(stderr,"\n\t*Loaded Vocabulary: %u words, %d bytes", wcsa->nwords, totaltmp); } fprintf(stderr,"\n\t... Done: %2.2f seconds (sys+usr t)\n", getSYSTimeBF() -t1); /** 2 ** Loads some configuration constants: sourceTextSize, maxNumOccs */ loadIndexConstants(Index, inbasename); fprintf(stderr,"\n\t*Loaded configuration constants: %lu bytes\n", (ulong) (2 * sizeof(uint ) + sizeof(ulong)) ); /* //shows the words parsed... { int i; fprintf(stderr,"\n\n Despues de sorting ...."); fflush(stderr); unsigned char *str; uint len; // for (i = 0; i<100; i++) { for (i = 0; ((uint)i)<wcsa->nwords; i++) { if ((i<10) || (((uint)i) >wcsa->nwords-5)) { getWord(wcsa,i,&str,&len); fprintf(stderr,"\n freq[%6d]=%6u ",i, wcsa->freqs[i]); fprintf(stderr,", words[%6d] = ",i); printWord(str,len); } } } t1 = getSYSTimeBF(); fprintf(stderr,"\n %u words have been loaded", wcsa->nwords); fprintf(stderr,"\n... Done: %2.2f seconds (sys+usr time)\n", getSYSTimeBF() -t1); */ #ifdef FREQ_VECTOR_AVAILABLE /** 3 ** Loading freq vector */ {uint size; //the size of the vocabulary in #ofwords =>> already init in "loadvocabulary" loadFreqVector(&(wcsa->freqs), &size, (char *)inbasename); fprintf(stderr,"\t*Loaded freq vector: %d bytes\n", wcsa->nwords * sizeof(uint) ); } #endif /** 5 ** Loading the Representation of the source text */ load_representation( &wcsa->ct,inbasename); { uint size; size_representation(wcsa->ct, &size); fprintf(stderr,"\n\t*Loaded (compressed) representation of the source Text: %u bytes\n", size); } /** 4 ** Loading the uncompressed posting lists previously created by the indexer. */ //Preparing a "list of occurrences" that will be later indexed through build_il() ** uint *source_il, sourcelen_il; uint maxPost ; //just to check it loads OK ;) ulong source_il_ulong; t1 = getSYSTimeBF(); fprintf(stderr,"\n... 
Loading the posting lists from disk \n"); fflush(stderr); load_posting_lists_from_file(&maxPost, &source_il_ulong, &source_il, inbasename); /** FOR CIKM ILISTS_DO NOT STILL SUPPORT an ULONG HERE **/ sourcelen_il = (uint)source_il_ulong; fprintf(stderr,"\n there are %lu uints in all the lists of occurrences: size [uint32] = %lu bytes.\n ", (ulong)sourcelen_il - wcsa->nwords -2, (ulong) sizeof(uint)*(sourcelen_il - wcsa->nwords -2)); fprintf(stderr,"\n MAXPOST loaded = %u, source_il_len = %u \n\n",maxPost,sourcelen_il); fprintf(stderr,"\n NLISTS loaded = %u, MAXPOSTS_sET \n\n",source_il[0],source_il[1]); t1 = getSYSTimeBF(); fprintf(stderr,"\n**... entering BUILD INVERTED LIST REPRESENTATION************!! \n\n"); fflush(stderr); /* { char fileposts[2048]; sprintf(fileposts,"%s.%s.%u","postinglistsXX","posts", getpid()); FILE *ff = fopen(fileposts,"w"); fwrite(source_il, sizeof(uint), sourcelen_il,ff); fclose(ff); } */ //compressing the lists of occurrences and setting wcsa->ils int error = build_il(source_il, sourcelen_il, build_options, &(wcsa->ils)); //source_il is freed inside!. IFERRORIL(error); fprintf(stderr,"\n... Done: %2.2f seconds (sys+usr time)\n", getSYSTimeBF() -t1); #ifndef FREQ_VECTOR_AVAILABLE //<----- not needed in advance, only during construction free(wcsa->freqs); #endif ulong sizeI; index_size(*index, &sizeI); fflush(stderr); fflush(stdout); fprintf(stderr,"\n ---------------------------------------------"); fprintf(stderr,"\n The index has already been built: %lu bytes!!\n", sizeI); fprintf(stderr,"\n... Done: OVERALL TIME = %2.2f seconds (sys+usr time)", getSYSTimeBF() -t0); fprintf(stderr,"\n ---------------------------------------------\n\n\n"); fflush(stderr); fflush(stdout); return 0; }
/** *********************************************************************************** CONSTRUCTION OF THE INDEX, from a given text file "inbasename". ***********************************************************************************/ int build_WordIndex (char *inbasename, char *build_options, void **index){ twcsa *wcsa; wcsa = (twcsa *) malloc (sizeof (twcsa) * 1); *index = wcsa; wcsa->text = NULL; double t0, t1; t0 = getSYSTimeBF(); //char path2repaircompressor[1000]="./src/repair64bit/repairCompressor"; //wcsa->blockSize = DEFAULT_BLOCK_SIZE; //wcsa->q = DEFAULT_QGRAM_LEN; /** processing the parameters of the index:: blockSize, and q-gram-len (q) */ { char delimiters[] = " =;"; int j,num_parameters; char ** parameters; if (build_options != NULL) { parse_parameters_II(build_options,&num_parameters, ¶meters, delimiters); for (j=0; j<num_parameters;j++) { if ((strcmp(parameters[j], "blocksize") == 0 ) && (j < num_parameters-1) ) { //wcsa->blockSize = atoi(parameters[j+1]) * BLOCK_MULTIPLIER; j++; } else if ((strcmp(parameters[j], "qgram") == 0 ) && (j < num_parameters-1) ) { //wcsa->q =atoi(parameters[j+1]); j++; } else if ((strcmp(parameters[j], "path2repaircompressor") == 0 ) && (j < num_parameters-1) ) { //strcpy(path2repaircompressor,parameters[j+1]); j++; } } free_parameters_II(num_parameters, ¶meters); } //fprintf(stderr,"\n Parameters of II-blocks:: *basename = %s, blocksize = %d, q= %d",inbasename,wcsa->blockSize,wcsa->q); //fprintf(stderr,"\n \t path2repaircompressor= %s\n",path2repaircompressor); } /** 0 ** Inicializes the arrays used to detect if a char is valid or not. **/ StartValid(); /** 1 ** Loads the compressed text into memory. */ t1 = getSYSTimeBF(); fprintf(stderr,"\n... Entering LoadTextInmem:%s\n",inbasename); loadTextInMem(&(wcsa->text), &(wcsa->sourceTextSize),(char *)inbasename); fprintf(stderr,"... Loaded Source Sequence: %lu bytes\n", wcsa->sourceTextSize); fprintf(stderr,"... 
Done: %2.2f seconds (sys+usr time)\n\n", getSYSTimeBF() -t1); fflush(stderr); /** 2 ** loads the array of document boundaries */ uint ndocs; ulong *docboundaries; loadDocBeginngins(&docboundaries, &ndocs,(char *)inbasename); wcsa->ndocs = ndocs; //just for statistics. /** 3 ** Parses the sequence and gathers the vocabulary of words (sorted alphanumerically) the frecuency of such words: obtains "words", "nwords", and "wordsZone" Sets also wcsa->freqs (freq of each word) Sets also wcsa->maxNumOccs (needed for malloc during extraction) */ fprintf(stderr,"\n... Entering CreateVocabularyOfWords (1st pass) \n"); fflush(stderr); CreateVocabularyOfWords(*index, docboundaries, ndocs); //shows the words parsed... { int i; fprintf(stderr,"\n\n Despues de sorting ...."); fflush(stderr); unsigned char *str; uint len; // for (i = 0; i<100; i++) { for (i = 0; ((uint)i)<wcsa->nwords; i++) { if ((i<10) || (((uint)i) >wcsa->nwords-5)) { getWord(wcsa,i,&str,&len); fprintf(stderr,"\n freq[%6d]=%6u ",i, wcsa->freqs[i]); fprintf(stderr,", words[%6d] = ",i); printWord(str,len); } } } t1 = getSYSTimeBF(); fprintf(stderr,"\n %u words have been parsed", wcsa->nwords); fprintf(stderr,"\n... Done: %2.2f seconds (sys+usr time)\n", getSYSTimeBF() -t1); /** 4 ** creates a temporal list of occurrences of each word (block-oriented). gives also the len of each list */ { //decompression of the source text and creation of occList[][] and lenList[] uint **occList; uint *lenList; t1 = getSYSTimeBF(); fprintf(stderr,"\n... Entering createListsOfOccurrences (2nd pass) \n"); fflush(stderr); createListsOfOccurrences (*index, &occList, &lenList, docboundaries, ndocs); fprintf(stderr,"\n %u lists of occurrences were created.", wcsa->nwords);fflush(stderr); fprintf(stderr,"\n... 
Done: %2.2f seconds (sys+usr time)\n", getSYSTimeBF() -t1); #ifdef CIKM2011_HURRY free(wcsa->text); wcsa->text = NULL; #endif //Preparing a "list of occurrences" that will be later indexed through build_il() ** uint *source_il, sourcelen_il; uint maxPost = ndocs; uint nwords = wcsa->nwords; ulong source_il_ulong; t1 = getSYSTimeBF(); fprintf(stderr,"\n... Entering prepareSourceFormatForIListBuilder \n"); fflush(stderr); prepareSourceFormatForIListBuilder(nwords,maxPost,lenList, occList, &source_il, &source_il_ulong); /** FOR CIKM ILISTS_DO NOT STILL SUPPORT an ULONG HERE **/ sourcelen_il = (uint)source_il_ulong; fprintf(stderr,"\n there are %lu uints in all the lists of occurrences: size [uint32] = %lu bytes.\n ", (ulong)sourcelen_il - nwords -2, (ulong) sizeof(uint)*(sourcelen_il - nwords -2)); /* char fileuintpostings[256] = "postingsSequence.uint32"; output_posting_lists_concatenated_DEBUGGING_ONLY (nwords, maxPost, lenList, occList,fileuintpostings); */ { char fileposts[2048]; sprintf(fileposts,"%s.%s.%u","postinglists","posts", getpid()); FILE *ff = fopen(fileposts,"w"); fwrite(source_il, sizeof(uint), sourcelen_il,ff); fclose(ff); } /* FILE *ff = fopen ("2gbnopositional.posts.uint32","w"); fwrite (source_il, sizeof(uint), sourcelen_il, ff); fclose(ff); */ fprintf(stderr,"\n the lists of occurrences were formatted for build_il.");fflush(stderr); fprintf(stderr,"\n...Done: %2.2f seconds (sys+usr time)\n", getSYSTimeBF() -t1); t1 = getSYSTimeBF(); fprintf(stderr,"\n**... entering BUILD INVERTED LIST REPRESENTATION!! \n"); fflush(stderr); //compressing the lists of occurrences and setting wcsa->ils int error = build_il(source_il, sourcelen_il, build_options, &(wcsa->ils)); //source_il is freed inside!. IFERRORIL(error); fprintf(stderr,"\n... 
Done: %2.2f seconds (sys+usr time)\n", getSYSTimeBF() -t1); { //frees memory for the posting lists uint i; for (i=0;i<wcsa->nwords;i++) free(occList[i]); free(occList); free(lenList); } /** 5 ** compressed representation of the source text */ { #ifdef CIKM2011_HURRY uchar text_null[109] = "NULL-TEXT"; uint text_len_null=1; uint docbounds_null[2]= {0,10}; uint ndoc_null = 1; build_representation (text_null, text_len_null, docbounds_null, ndoc_null, build_options, &wcsa->ct); #endif #ifndef CIKM2011_HURRY build_representation (wcsa->text, wcsa->sourceTextSize, docboundaries, ndocs, build_options, &wcsa->ct); #endif unsigned char *document; uint doclen; extract_doc_representation (wcsa->ct, 0, &document, &doclen); fprintf(stderr,"\n =================== DOC 0 ======================"); fprintf(stderr,"\n%s",document); fprintf(stderr,"\n =================== ***** ======================\n"); free(document); //free(docbegsUL); } } #ifndef FREQ_VECTOR_AVAILABLE //<----- not needed in advance, only during construction free(wcsa->freqs); #endif free(docboundaries); ulong sizeI; index_size(*index, &sizeI); fflush(stderr); fflush(stdout); fprintf(stderr,"\n The index has already been built: %lu bytes!!\n", sizeI); fprintf(stderr,"\n... Done: OVERALL TIME = %2.2f seconds (sys+usr time)\n\n\n", getSYSTimeBF() -t0); fflush(stderr); fflush(stdout); return 0; }
lzma_index_append(lzma_index *i, const lzma_allocator *allocator,
		lzma_vli unpadded_size, lzma_vli uncompressed_size)
{
	// Argument validation.
	if (i == NULL)
		return LZMA_PROG_ERROR;

	if (unpadded_size < UNPADDED_SIZE_MIN
			|| unpadded_size > UNPADDED_SIZE_MAX
			|| uncompressed_size > LZMA_VLI_MAX)
		return LZMA_PROG_ERROR;

	// The Record is appended to the last group of the last Stream.
	index_stream *stream = (index_stream *)(i->streams.rightmost);
	index_group *group = (index_group *)(stream->groups.rightmost);

	// Running sums up to the end of the current last Record; both are
	// zero when the Stream has no groups yet.
	lzma_vli compressed_base = 0;
	lzma_vli uncompressed_base = 0;
	if (group != NULL) {
		compressed_base = vli_ceil4(
				group->records[group->last].unpadded_sum);
		uncompressed_base
				= group->records[group->last].uncompressed_sum;
	}

	// Bytes this Record adds to the encoded list of Records.
	const uint32_t list_growth = lzma_vli_size(unpadded_size)
			+ lzma_vli_size(uncompressed_size);

	// The file size has to stay within limits.
	const lzma_vli new_file_size = index_file_size(
			stream->node.compressed_base,
			compressed_base + unpadded_size,
			stream->record_count + 1,
			stream->index_list_size + list_growth,
			stream->stream_padding);
	if (new_file_size == LZMA_VLI_UNKNOWN)
		return LZMA_DATA_ERROR;

	// The Index field has to fit in what the Backward Size field of the
	// Stream Footer can express.
	if (index_size(i->record_count + 1,
			i->index_list_size + list_growth)
			> LZMA_BACKWARD_SIZE_MAX)
		return LZMA_DATA_ERROR;

	if (group == NULL || group->last + 1 >= group->allocated) {
		// No group yet, or the last one is full: allocate a new one.
		group = lzma_alloc(sizeof(index_group)
				+ i->prealloc * sizeof(index_record),
				allocator);
		if (group == NULL)
			return LZMA_MEM_ERROR;

		group->last = 0;
		group->allocated = i->prealloc;

		// Go back to the default so that later allocations stay at
		// a sane size if the application keeps adding Records.
		i->prealloc = INDEX_GROUP_SIZE;

		// Start offsets of the new group.
		group->node.uncompressed_base = uncompressed_base;
		group->node.compressed_base = compressed_base;
		group->number_base = stream->record_count + 1;

		// Link the group into the Stream.
		index_tree_append(&stream->groups, &group->node);
	} else {
		// The last group still has room for at least one Record.
		++group->last;
	}

	// Store the new Record.
	group->records[group->last].uncompressed_sum
			= uncompressed_base + uncompressed_size;
	group->records[group->last].unpadded_sum
			= compressed_base + unpadded_size;

	// Update the per-Stream and the global totals.
	++stream->record_count;
	stream->index_list_size += list_growth;

	i->total_size += vli_ceil4(unpadded_size);
	i->uncompressed_size += uncompressed_size;
	++i->record_count;
	i->index_list_size += list_growth;

	return LZMA_OK;
}
//符号表是全局变量 struct InterCodes* translate_Exp(struct TreeNode* Exp, Operand place){ if (strcmp(Exp->children->name, "INT")){ printf("Exp - INT\n"); if(place != NULL){ FieldList item = (FieldList)malloc(sizeof(struct FieldList_)); lookupTable(Exp->children->value_str,&item,0); place->kind = CONSTANT; place->u.value = Exp->children->value_int; struct InterCodes* temp = (struct InterCodes*)malloc(sizeof(struct InterCodes)); temp->code.kind = NONE_; return temp; } else{ struct InterCodes* temp = (struct InterCodes*)malloc(sizeof(struct InterCodes)); temp->code.kind = NONE_; return temp; } } if (strcmp(Exp->children->name, "ID")){ if(Exp->children->neighbours == NULL){ printf("Exp - ID\n"); if(place != NULL){ FieldList item = (FieldList)malloc(sizeof(struct FieldList_)); lookupTable(Exp->children->value_str,&item,0); place->kind = VARIABLE; strcpy(place->u.ID, item->inter_name); struct InterCodes* temp = (struct InterCodes*)malloc(sizeof(struct InterCodes)); temp->code.kind = NONE_; return temp; } else{ struct InterCodes* temp = (struct InterCodes*)malloc(sizeof(struct InterCodes)); temp->code.kind = NONE_; return temp; } } if(strcmp(Exp->children->neighbours->name, "DOT")){ printf("Exp - Exp DOT ID\n"); //Operand t1 = new_temp(); //struct InterCodes* code1 = translate_Exp(Exp->children->neighbours->neighbours, t1); FieldList item = (FieldList)malloc(sizeof(struct FieldList_)); int i = lookupTable(Exp->children->neighbours->neighbours->value_str,&item,0); switch(i){ case 0:{ Operand t1 = new_temp(); struct InterCodes* code1 = (struct InterCodes*)malloc(sizeof(struct InterCodes)); code1->code.kind = ADD_; code1->code.u.binop.result = t1; code1->code.u.binop.op1 = (Operand)malloc(sizeof(struct Operand_)); code1->code.u.binop.op1->kind = VARIABLE; strcpy(code1->code.u.binop.op1->u.ID, item->inter_name); code1->code.u.binop.op2 = (Operand)malloc(sizeof(struct Operand_)); code1->code.u.binop.op2->kind = CONSTANT; code1->code.u.binop.op2->u.value = index_size(item, 
Exp->children->value_str); place->kind = REFERENCE; strcpy(place->u.ID, t1->u.ID); return code1; //break; } case 1:{ Operand t1 = new_temp(); Operand t2 = new_temp(); struct InterCodes* code1 = (struct InterCodes*)malloc(sizeof(struct InterCodes)); code1->code.kind = ASSIGN_; code1->code.u.assign.left = t1; code1->code.u.assign.right = (Operand)malloc(sizeof(struct Operand_)); code1->code.u.assign.right->kind = ADDRESS; strcpy(code1->code.u.assign.right->u.ID, item->inter_name); struct InterCodes* code2 = (struct InterCodes*)malloc(sizeof(struct InterCodes)); code2->code.kind = ADD_; code2->code.u.binop.result = t2; code2->code.u.binop.op1 = t1; code2->code.u.binop.op2 = (Operand)malloc(sizeof(struct Operand_)); code2->code.u.binop.op2->kind = CONSTANT; code2->code.u.binop.op2->u.value = index_size(item, Exp->children->value_str); place->kind = REFERENCE; strcpy(place->u.ID, t2->u.ID); code1->next = code2; code2->prev = code1; return code1; } } } } if (strcmp(Exp->children->name, "Exp")){ if (strcmp(Exp->children->neighbours->name, "ASSIGNOP")){ if (strcmp(Exp->children->neighbours->neighbours->children->name, "ID")){ printf("Exp - Exp ASSIGNOP Exp\n"); if(place != NULL){ Operand t1 = new_temp(); FieldList item = (FieldList)malloc(sizeof(struct FieldList_)); lookupTable(Exp->children->neighbours->neighbours->children->value_str,&item,0); struct InterCodes* code1 = translate_Exp(Exp->children, t1); struct InterCodes* code2 = (struct InterCodes*)malloc(sizeof(struct InterCodes)); code2->code.kind = ASSIGN_; code2->code.u.assign.left = (Operand)malloc(sizeof(struct Operand_)); code2->code.u.assign.left->kind = VARIABLE; strcpy(code2->code.u.assign.left->u.ID, item->inter_name); code2->code.u.assign.right = t1; /*struct InterCodes* code3 = (struct InterCodes*)malloc(sizeof(struct InterCodes)); code3->code.kind = ASSIGN; code3->code.u.assign.left = (Operand)malloc(sizeof(struct Operand_)); code3->code.u.assign.left->kind = VARIABLE; 
strcpy(code3->code.u.assign.left->u.ID, place->u.ID); code3->code.u.assign.right = (Operand)malloc(sizeof(struct Operand_)); code3->code.u.assign.right->kind = VARIABLE; strcpy(code3->code.u.assign.right->u.ID, Exp->children->neighbours->neighbours->children->inter_name);*/ place->kind = VARIABLE; strcpy(place->u.ID, item->inter_name); struct InterCodes* p = code1; while(p->next != NULL) p = p->next; p->next = code2; code2->prev = p; code2->next = NULL; return code1; } else{ Operand t1 = new_temp(); FieldList item = (FieldList)malloc(sizeof(struct FieldList_)); lookupTable(Exp->children->neighbours->neighbours->children->value_str,&item,0); struct InterCodes* code1 = translate_Exp(Exp->children, t1); struct InterCodes* code2 = (struct InterCodes*)malloc(sizeof(struct InterCodes)); code2->code.kind = ASSIGN_; code2->code.u.assign.left = (Operand)malloc(sizeof(struct Operand_)); code2->code.u.assign.left->kind = VARIABLE; strcpy(code2->code.u.assign.left->u.ID, item->inter_name); code2->code.u.assign.right = t1; struct InterCodes* p = code1; while(p->next != NULL) p = p->next; p->next = code2; code2->prev = p; code2->next = NULL; return code1; } } } if(strcmp(Exp->children->neighbours->name, "PLUS")){ printf("Exp - Exp PLUS Exp\n"); if(place != NULL){ Operand t1 = new_temp(); Operand t2 = new_temp(); struct InterCodes* code1 = translate_Exp(Exp->children->neighbours->neighbours, t1); struct InterCodes* code2 = translate_Exp(Exp->children, t2); struct InterCodes* code3 = (struct InterCodes*)malloc(sizeof(struct InterCodes)); code3->code.kind = ADD_; code3->code.u.binop.result = (Operand)malloc(sizeof(struct Operand_)); code3->code.u.binop.result->kind = VARIABLE; strcpy(code3->code.u.binop.result->u.ID, place->u.ID); code3->code.u.binop.op1 = t1; code3->code.u.binop.op2 = t2; struct InterCodes* p = code1; while(p->next != NULL) p = p->next; p->next = code2; code2->prev = p; p = code2; while(p->next != NULL) p = p->next; p->next = code3; code3->prev = p; code3->next = 
NULL; return code1; } else{ Operand t1 = new_temp(); Operand t2 = new_temp(); struct InterCodes* code1 = translate_Exp(Exp->children->neighbours->neighbours, t1); struct InterCodes* code2 = translate_Exp(Exp->children, t2); struct InterCodes* p = code1; while(p->next != NULL) p = p->next; p->next = code2; code2->prev = p; code2->next = NULL; return code1; } } if(strcmp(Exp->children->neighbours->name, "MINUS")){ printf("Exp - Exp MINUS Exp\n"); if(place != NULL){ if(Exp->children->neighbours->neighbours != NULL){ Operand t1 = new_temp(); Operand t2 = new_temp(); struct InterCodes* code1 = translate_Exp(Exp->children->neighbours->neighbours, t1); struct InterCodes* code2 = translate_Exp(Exp->children, t2); struct InterCodes* code3 = (struct InterCodes*)malloc(sizeof(struct InterCodes)); code3->code.kind = SUB_; code3->code.u.binop.result = (Operand)malloc(sizeof(struct Operand_)); code3->code.u.binop.result->kind = VARIABLE; strcpy(code3->code.u.binop.result->u.ID, place->u.ID); code3->code.u.binop.op1 = t1; code3->code.u.binop.op2 = t2; struct InterCodes* p = code1; while(p->next != NULL) p = p->next; p->next = code2; code2->prev = p; p = code2; while(p->next != NULL) p = p->next; p->next = code3; code3->prev = p; code3->next = NULL; return code1; } else{ Operand t1 = new_temp(); struct InterCodes* code1 = translate_Exp(Exp->children, t1); struct InterCodes* code2 = (struct InterCodes*)malloc(sizeof(struct InterCodes)); code2->code.kind = SUB_; code2->code.u.binop.result = (Operand)malloc(sizeof(struct Operand_)); code2->code.u.binop.result->kind = VARIABLE; strcpy(code2->code.u.binop.result->u.ID, place->u.ID); code2->code.u.binop.op1 = (Operand)malloc(sizeof(struct Operand_)); code2->code.u.binop.op1->kind = CONSTANT; code2->code.u.binop.op1->u.value = 0; code2->code.u.binop.op2 = t1; struct InterCodes* p = code1; while(p->next != NULL) p = p->next; p->next = code2; code2->prev = p; code2->next = NULL; return code1; } } else{ 
if(Exp->children->neighbours->neighbours != NULL){ Operand t1 = new_temp(); Operand t2 = new_temp(); struct InterCodes* code1 = translate_Exp(Exp->children->neighbours->neighbours, t1); struct InterCodes* code2 = translate_Exp(Exp->children, t2); struct InterCodes* p = code1; while(p->next != NULL) p = p->next; p->next = code2; code2->prev = p; code2->next = NULL; return code1; } else{ Operand t1 = new_temp(); struct InterCodes* code1 = translate_Exp(Exp->children, t1); return code1; } } } if(strcmp(Exp->children->neighbours->name, "STAR")){ printf("Exp - Exp STAR Exp\n"); if(place != NULL){ Operand t1 = new_temp(); Operand t2 = new_temp(); struct InterCodes* code1 = translate_Exp(Exp->children->neighbours->neighbours, t1); struct InterCodes* code2 = translate_Exp(Exp->children, t2); struct InterCodes* code3 = (struct InterCodes*)malloc(sizeof(struct InterCodes)); code3->code.kind = MUL_; code3->code.u.binop.result = (Operand)malloc(sizeof(struct Operand_)); code3->code.u.binop.result->kind = VARIABLE; strcpy(code3->code.u.binop.result->u.ID, place->u.ID); code3->code.u.binop.op1 = t1; code3->code.u.binop.op2 = t2; struct InterCodes* p = code1; while(p->next != NULL) p = p->next; p->next = code2; code2->prev = p; p = code2; while(p->next != NULL) p = p->next; p->next = code3; code3->prev = p; code3->next = NULL; return code1; } else{ Operand t1 = new_temp(); Operand t2 = new_temp(); struct InterCodes* code1 = translate_Exp(Exp->children->neighbours->neighbours, t1); struct InterCodes* code2 = translate_Exp(Exp->children, t2); struct InterCodes* p = code1; while(p->next != NULL) p = p->next; p->next = code2; code2->prev = p; code2->next = NULL; return code1; } } if(strcmp(Exp->children->neighbours->name, "DIV")){ printf("Exp - Exp DIV Exp\n"); if(place != NULL){ Operand t1 = new_temp(); Operand t2 = new_temp(); struct InterCodes* code1 = translate_Exp(Exp->children->neighbours->neighbours, t1); struct InterCodes* code2 = translate_Exp(Exp->children, t2); struct 
InterCodes* code3 = (struct InterCodes*)malloc(sizeof(struct InterCodes)); code3->code.kind = DIV_; code3->code.u.binop.result = (Operand)malloc(sizeof(struct Operand_)); code3->code.u.binop.result->kind = VARIABLE; strcpy(code3->code.u.binop.result->u.ID, place->u.ID); code3->code.u.binop.op1 = t1; code3->code.u.binop.op2 = t2; struct InterCodes* p = code1; while(p->next != NULL) p = p->next; p->next = code2; code2->prev = p; p = code2; while(p->next != NULL) p = p->next; p->next = code3; code3->prev = p; code3->next = NULL; return code1; } else{ Operand t1 = new_temp(); Operand t2 = new_temp(); struct InterCodes* code1 = translate_Exp(Exp->children->neighbours->neighbours, t1); struct InterCodes* code2 = translate_Exp(Exp->children, t2); struct InterCodes* p = code1; while(p->next != NULL) p = p->next; p->next = code2; code2->prev = p; code2->next = NULL; return code1; } } if(strcmp(Exp->children->neighbours->name, "RELOP") || strcmp(Exp->children->neighbours->name, "NOT") || strcmp(Exp->children->neighbours->name, "AND") || strcmp(Exp->children->neighbours->name, "OR")){ printf("Exp - Exp Cond Exp\n"); if(place != NULL){ Operand label1 = new_label(); Operand label2 = new_label(); struct InterCodes* code1 = (struct InterCodes*)malloc(sizeof(struct InterCodes)); code1->code.kind = ASSIGN_; code1->code.u.assign.left = (Operand)malloc(sizeof(struct Operand_)); code1->code.u.assign.left->kind = VARIABLE; strcpy(code1->code.u.assign.left->u.ID, place->u.ID); code1->code.u.assign.right = (Operand)malloc(sizeof(struct Operand_)); code1->code.u.assign.right->kind = CONSTANT; code1->code.u.assign.right->u.value = 0; struct InterCodes* code2 = translate_Cond(Exp, label1, label2); struct InterCodes* code3 = (struct InterCodes*)malloc(sizeof(struct InterCodes)); code3->code.kind = LABEL_; code3->code.u.labelcode.label = label1; struct InterCodes* code4 = (struct InterCodes*)malloc(sizeof(struct InterCodes)); code4->code.kind = ASSIGN_; code4->code.u.assign.left = 
(Operand)malloc(sizeof(struct Operand_)); code4->code.u.assign.left->kind = VARIABLE; strcpy(code4->code.u.assign.left->u.ID, place->u.ID); code4->code.u.assign.right = (Operand)malloc(sizeof(struct Operand_)); code4->code.u.assign.right->kind = CONSTANT; code4->code.u.assign.right->u.value = 1; struct InterCodes* code5 = (struct InterCodes*)malloc(sizeof(struct InterCodes)); code5->code.kind = LABEL_; code5->code.u.labelcode.label = label2; code1->next = code2; code2->prev = code1; struct InterCodes* p = code2; while(p->next != NULL) p = p->next; p->next = code3; code3->prev = p; code3->next = code4; code4->prev = code3; code4->next = code5; code5->prev = code4; code5->next = NULL; return code1; } } } if (strcmp(Exp->children->neighbours->name, "LP")){ printf("Exp - ID LP RP\n"); if(strcmp(Exp->children->neighbours->neighbours->value_str, "read") == 0){ struct InterCodes* code1 = (struct InterCodes*)malloc(sizeof(struct InterCodes)); code1->code.kind = READ_; code1->code.u.read.rd = place; return code1; } else{ FieldList item = (FieldList)malloc(sizeof(struct FieldList_)); lookupTable(Exp->children->neighbours->neighbours->value_str,&item,1);//查函数 struct InterCodes* code1 = (struct InterCodes*)malloc(sizeof(struct InterCodes)); code1->code.kind = CALL_; code1->code.u.call.left = (Operand)malloc(sizeof(struct Operand_)); code1->code.u.call.left->kind = VARIABLE; strcpy(code1->code.u.assign.left->u.ID, place->u.ID); code1->code.u.call.function = (Operand)malloc(sizeof(struct Operand_)); code1->code.u.call.function->kind = VARIABLE; strcpy(code1->code.u.call.function->u.ID, item->name); return code1; } } if(strcmp(Exp->children->neighbours->name, "Args")){ printf("Exp - ID LP Args RP\n"); struct ArgList* arg_list = NULL; struct InterCodes* code1 = translate_Args(Exp->children->neighbours, arg_list); if(strcpy(Exp->children->neighbours->neighbours->neighbours->value_str, "write") == 0){ struct InterCodes* code2 = (struct InterCodes*)malloc(sizeof(struct InterCodes)); 
code2->code.kind = WRITE_; code2->code.u.write.wr = arg_list->op; struct InterCodes* p = code1; while(p->next != NULL) p = p->next; p->next = code2; code2->prev = p; code2->next = NULL; return code1; } else{ FieldList item = (FieldList)malloc(sizeof(struct FieldList_)); lookupTable(Exp->children->neighbours->neighbours->neighbours->value_str,&item,0); struct ArgList* q = arg_list; struct InterCodes* code2 = NULL; while(q != NULL){ struct InterCodes* code3 = (struct InterCodes*)malloc(sizeof(struct InterCodes)); code3->code.kind = ARG_; code3->code.u.arg.argument = q->op; if(code2 == NULL){ code2 = code3; } else{ struct InterCodes* p = code2; while(p->next != NULL) p = p->next; p->next = code3; code3->prev = p; } q = q->next; } struct InterCodes* code4 = (struct InterCodes*)malloc(sizeof(struct InterCodes)); code4->code.kind = CALL_; code4->code.u.call.left = (Operand)malloc(sizeof(struct Operand_)); code4->code.u.call.left->kind = VARIABLE; strcpy(code4->code.u.assign.left->u.ID, place->u.ID); code4->code.u.call.function = (Operand)malloc(sizeof(struct Operand_)); code4->code.u.call.function->kind = VARIABLE; strcpy(code4->code.u.call.function->u.ID, item->name); struct InterCodes* p = code1; while(p->next != NULL) p = p->next; p->next = code2; code2->prev = p; p = code2; while(p->next != NULL) p = p->next; p->next = code4; code4->prev = p; code4->next = NULL; return code1; } } if (strcmp(Exp->children->name, "RB") == 0){ printf("Exp - Exp LB Exp RB\n"); FieldList item = (FieldList)malloc(sizeof(struct FieldList_)); lookupTable(Exp->children->neighbours->neighbours->neighbours->children->value_str,&item,0); Operand t1 = new_temp(); Operand t2 = new_temp(); Operand t3 = new_temp(); struct InterCodes* code1 = translate_Exp(Exp->children->neighbours, t1); struct InterCodes* code2 = (struct InterCodes*)malloc(sizeof(struct InterCodes)); code2->code.kind = MUL_; code2->code.u.binop.result = t2; code2->code.u.binop.op1 = t1; code2->code.u.binop.op2 = 
(Operand)malloc(sizeof(struct Operand_)); code2->code.u.binop.result->kind = CONSTANT; code2->code.u.binop.result->u.value = 4; struct InterCodes* code3 = (struct InterCodes*)malloc(sizeof(struct InterCodes)); code3->code.kind = ADD_; code3->code.u.binop.result = t3; code3->code.u.binop.op1 = (Operand)malloc(sizeof(struct Operand_)); code3->code.u.binop.op1->kind = ADDRESS; strcpy(code3->code.u.binop.op1->u.ID, item->inter_name); code3->code.u.binop.op2 = t2; //place = t4; place->kind = REFERENCE; strcpy(place->u.ID, t3->u.ID); struct InterCodes* p = code1; while(p->next != NULL) p = p->next; p->next = code2; code2->prev = p; code2->next = code3; code3->prev = code2; code3->next = NULL; return code1; } }
/** *********************************************************************************** CONSTRUCTION OF THE INDEX ***********************************************************************************/ int build_WordIndex (char *inbasename, char *build_options, void **index){ twcsa *wcsa; wcsa = (twcsa *) malloc (sizeof (twcsa) * 1); *index = wcsa; wcsa->text = NULL; //char path2repaircompressor[1000]="./src/repair64bit/repairCompressor"; //wcsa->blockSize = DEFAULT_BLOCK_SIZE; //wcsa->q = DEFAULT_QGRAM_LEN; /** processing the parameters of the index:: blockSize, and q-gram-len (q) */ { char delimiters[] = " =;"; int j,num_parameters; char ** parameters; if (build_options != NULL) { parse_parameters_II(build_options,&num_parameters, ¶meters, delimiters); for (j=0; j<num_parameters;j++) { if ((strcmp(parameters[j], "blocksize") == 0 ) && (j < num_parameters-1) ) { //wcsa->blockSize = atoi(parameters[j+1]) * BLOCK_MULTIPLIER; j++; } else if ((strcmp(parameters[j], "qgram") == 0 ) && (j < num_parameters-1) ) { //wcsa->q =atoi(parameters[j+1]); j++; } else if ((strcmp(parameters[j], "path2repaircompressor") == 0 ) && (j < num_parameters-1) ) { //strcpy(path2repaircompressor,parameters[j+1]); j++; } } free_parameters_II(num_parameters, ¶meters); } //fprintf(stderr,"\n Parameters of II-blocks:: *basename = %s, blocksize = %d, q= %d",inbasename,wcsa->blockSize,wcsa->q); //fprintf(stderr,"\n \t path2repaircompressor= %s\n",path2repaircompressor); } /** 0 ** Inicializes the arrays used to detect if a char is valid or not. **/ StartValid(); /** 1 ** Loads the compressed text into memory. */ loadTextInMem(&(wcsa->text), &(wcsa->textSize),(char *)inbasename); wcsa->sourceTextSize = wcsa->textSize; fprintf(stderr,"... 
Loaded Source Sequence: %u bytes\n", wcsa->textSize); fflush(stderr); fflush(stderr); /** 2 ** loads the array of document boundaries */ uint ndocs; uint *docboundaries; loadDocBeginngins(&docboundaries, &ndocs,(char *)inbasename); wcsa->ndocs = ndocs; //just for statistics. /** 3 ** Parses the sequence and gathers the vocabulary of words (sorted alphanumerically) the frecuency of such words: obtains "words", "nwords", and "wordsZone" Sets also wcsa->freqs (freq of each word) Sets also wcsa->maxNumOccs (needed for malloc during extraction) */ fprintf(stderr,"... Entering CreateVocabularyOfWords \n"); fflush(stderr); CreateVocabularyOfWords(*index, docboundaries, ndocs); //shows the words parsed... { int i; fprintf(stderr,"\n\n Después de sorting ...."); fflush(stderr); unsigned char *str; uint len; // for (i = 0; i<100; i++) { for (i = 0; i<wcsa->nwords; i++) { if ((i<15) || (i>wcsa->nwords-5)) { getWord(wcsa,i,&str,&len); fprintf(stderr,"\n freq[%6d]=%6u ",i, wcsa->freqs[i]); fprintf(stderr,", words[%6d] = ",i); printWord(str,len); } } } fprintf(stderr,"\n %u words have been parsed", wcsa->nwords); /** 4 ** creates a temporal list of occurrences of each word (block-oriented). gives also the len of each list */ { //decompression of the source text and creation of occList[][] and lenList[] uint **occList; uint *lenList; uint *doc_offsets_sids; createListsOfOccurrences (*index, &occList, &lenList, &doc_offsets_sids, docboundaries, ndocs); wcsa->doc_offsets_sids=doc_offsets_sids; fprintf(stderr,"\n %u lists of occurrences were created.", wcsa->nwords);fflush(stderr); //fprintf(stderr,"\n The ranks of the document beginnings are:"); //{ // int i; // for (i=0;i<=ndocs;i++) fprintf(stderr,"[%u-th-> %u]",i,wcsa->doc_offsets_sids[i]); //} //Preparing a "list of occurrences" that will be later indexed through build_il() ** uint *source_il, sourcelen_il; //uint maxPost = ndocs; //!! 
uint maxPost = doc_offsets_sids[ndocs]; uint nwords = wcsa->nwords; prepareSourceFormatForIListBuilder(nwords,maxPost,lenList, occList, &source_il, &sourcelen_il); /* char fileuintpostings[256] = "postingsSequence.uint32"; output_posting_lists_concatenated_DEBUGGING_ONLY (nwords, maxPost, lenList, occList,fileuintpostings); */ FILE *ff = fopen ("postings.posts","w"); fwrite (source_il, sizeof(uint), sourcelen_il, ff); fclose(ff); #ifdef WRITE_POSTING_LIST FILE *ff = fopen ("postings.posts","w"); fwrite (source_il, sizeof(uint), sourcelen_il, ff); fclose(ff); #endif fprintf(stderr,"\n there are %lu uints in all the lists of occurrences: size [uint32] = %lu bytes.\n ", (ulong)sourcelen_il - nwords -2, (ulong) sizeof(uint)*(sourcelen_il - nwords -2)); fprintf(stderr,"\n maxPostValue = %u",maxPost); /*{ char fileposts[256]; sprintf(fileposts,"%s.%s.%u","POSTING_LISTS","posts", getpid()); FILE *ff = fopen(fileposts,"w"); fwrite(source_il, sizeof(uint), sourcelen_il,ff); fclose(ff); } for (int x=0;x<10;x++) printf("\n%u --> %u",x,source_il[x]); exit(0); */ fprintf(stderr,"\n the lists of occurrences were formatted for build_il.");fflush(stderr); //compressing the lists of occurrences and setting wcsa->ils int error = build_il(source_il, sourcelen_il, build_options, &(wcsa->ils)); //source_il is freed inside!. 
IFERRORIL(error); { //frees memory for the posting lists uint i; for (i=0;i<wcsa->nwords;i++) free(occList[i]); free(occList); free(lenList); } /** 5 ** compressed representation of the source text */ { //unsigned long *docbegsUL; unsigned int i; //docbegsUL = (ulong *) malloc (sizeof(ulong) * (ndocs+1)); //for (i=0;i<=ndocs;i++) docbegsUL[i] = docboundaries[i]; //build_representation (wcsa->text, wcsa->textSize, docbegsUL, ndocs, build_options, &wcsa->ct); fprintf(stderr,"\nNow compressing the text: %lu bytes", (ulong)wcsa->textSize);fflush(stderr); build_representation (wcsa->text, wcsa->textSize, docboundaries, ndocs, build_options, &wcsa->ct); unsigned char *document; uint doclen; extract_doc_representation (wcsa->ct, 0, &document, &doclen); fprintf(stderr,"\n =================== DOC 0 ======================"); fprintf(stderr,"\n%s",document); fprintf(stderr,"\n =================== ***** ======================\n"); free(document); //free(docbegsUL); } } #ifndef FREQ_VECTOR_AVAILABLE //<----- not needed in advance, only during construction free(wcsa->freqs); #endif free(docboundaries); ulong sizeI; index_size(*index, &sizeI); fflush(stderr); fflush(stdout); fprintf(stderr,"\n The index has already been built: %lu bytes!!\n", sizeI); fflush(stderr); fflush(stdout); return 0; }