/** Loads index from one or more file(s) named filename, possibly adding the proper extensions. */ int load_index(char *filename, void **index){ twcsa *wcsa; wcsa = (twcsa *) malloc (sizeof (twcsa) * 1); void *Index = (void *) wcsa; int error; wcsa->text = NULL; // Inicializes the arrays used to detect if a char is valid or not. StartValid(); /** 1 ** loads the vocabulary of words. zonewords, words vector, nwords */ loadVocabulary (Index, filename); { uint totaltmp=0; //words totaltmp += ((((wcsa->nwords+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint)); //the pointers totaltmp += wcsa->wordsData.wordsZoneMem.size * sizeof(byte); //the characters of the words. fprintf(stderr,"\n\t*Loaded Vocabulary of text: %u words, %d bytes\n", wcsa->nwords, totaltmp); } /** 2 ** Loads some configuration constants: sourceTextSize, maxNumOccs */ loadIndexConstants(Index, filename); fprintf(stderr,"\t*Loaded configuration constants: %lu bytes\n", (ulong) (2 * sizeof(uint ) + sizeof(ulong)) ); #ifdef FREQ_VECTOR_AVAILABLE /** 3 ** Loading freq vector */ {uint size; //the size of the vocabulary in #ofwords =>> already init in "loadvocabulary" loadFreqVector(&(wcsa->freqs), &size, (char *)filename); fprintf(stderr,"\t*Loaded freq vector: %d bytes\n", wcsa->nwords * sizeof(uint) ); } #endif /** 4 ** Loading Compressed Structure of posting lists (il) */ error = load_il((char*) filename,&(wcsa->ils)); IFERRORIL(error); uint sizeil; error = size_il(wcsa->ils,&sizeil); IFERRORIL(error); fprintf(stderr,"\n \t*loaded compressed inverted lists structure: %d bytes\n", sizeil); /** 5 ** Loading the Representation of the source text */ load_representation( &wcsa->ct,filename); { uint size; size_representation(wcsa->ct, &size); fprintf(stderr,"\n\t*Loaded (compressed) representation of the source Text: %u bytes\n", size); } (*index) = Index; return 0; }
/**
 * Creates a Writer bound to the file `fileName`.
 *
 * The writer keeps its own heap copy of the file name, creates and opens the
 * underlying buffered writer on it, and initializes the character-validity
 * tables through StartValid().
 *
 * @param fileName  NUL-terminated name of the output file.
 * @return the new Writer, or NULL if memory for the writer or for its copy of
 *         the file name cannot be allocated.
 */
Writer createWriter (char *fileName){
	Writer w = (Writer) malloc(sizeof(struct Wrt));
	if (w == NULL) {
		return NULL;
	}

	/* Private copy of the name, terminator included. memcpy is used because
	   the destination is a byte buffer rather than a char buffer. */
	size_t nameBytes = strlen(fileName) + 1;
	w->_fileName = (byte *) malloc(nameBytes);
	if (w->_fileName == NULL) {
		free(w);
		return NULL;
	}
	memcpy(w->_fileName, fileName, nameBytes);

	w->bw = bufCreateWriter(w->_fileName);
	openBW(w->bw);

	w->_previousAlfanumerical = 0;

	/* Stores exactly one blank and no terminator, which is what the former
	   strncpy(..., " ", 1) did. */
	((char *) w->_SPACE)[0] = ' ';

	StartValid();
	return w;
}
/** *********************************************************************************** CONSTRUCTION OF THE INDEX, from a given text file "inbasename". ***********************************************************************************/ int build_WordIndex_from_postings (char *inbasename, char *build_options, void **index){ twcsa *wcsa; wcsa = (twcsa *) malloc (sizeof (twcsa) * 1); *index = wcsa; void *Index = *index; wcsa->text = NULL; double t0, t1; t0 = getSYSTimeBF(); /** processing the parameters of the index:: blockSize, and q-gram-len (q) */ { char delimiters[] = " =;"; int j,num_parameters; char ** parameters; if (build_options != NULL) { parse_parameters_II(build_options,&num_parameters, ¶meters, delimiters); for (j=0; j<num_parameters;j++) { if ((strcmp(parameters[j], "blocksize") == 0 ) && (j < num_parameters-1) ) { //wcsa->blockSize = atoi(parameters[j+1]) * BLOCK_MULTIPLIER; j++; } else if ((strcmp(parameters[j], "qgram") == 0 ) && (j < num_parameters-1) ) { //wcsa->q =atoi(parameters[j+1]); j++; } else if ((strcmp(parameters[j], "path2repaircompressor") == 0 ) && (j < num_parameters-1) ) { //strcpy(path2repaircompressor,parameters[j+1]); j++; } } free_parameters_II(num_parameters, ¶meters); } //fprintf(stderr,"\n Parameters of II-blocks:: *basename = %s, blocksize = %d, q= %d",inbasename,wcsa->blockSize,wcsa->q); //fprintf(stderr,"\n \t path2repaircompressor= %s\n",path2repaircompressor); } wcsa->freqs=NULL; /** 0 ** Inicializes the arrays used to detect if a char is valid or not. **/ StartValid(); /** 1 ** loads the vocabulary of words. zonewords, words vector, nwords */ t1 = getSYSTimeBF(); loadVocabulary (Index, inbasename); { uint totaltmp=0; //words totaltmp += ((((wcsa->nwords+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint)); //the pointers totaltmp += wcsa->wordsData.wordsZoneMem.size * sizeof(byte); //the characters of the words. 
fprintf(stderr,"\n\t*Loaded Vocabulary: %u words, %d bytes", wcsa->nwords, totaltmp); } fprintf(stderr,"\n\t... Done: %2.2f seconds (sys+usr t)\n", getSYSTimeBF() -t1); /** 2 ** Loads some configuration constants: sourceTextSize, maxNumOccs */ loadIndexConstants(Index, inbasename); fprintf(stderr,"\n\t*Loaded configuration constants: %lu bytes\n", (ulong) (2 * sizeof(uint ) + sizeof(ulong)) ); /* //shows the words parsed... { int i; fprintf(stderr,"\n\n Despues de sorting ...."); fflush(stderr); unsigned char *str; uint len; // for (i = 0; i<100; i++) { for (i = 0; ((uint)i)<wcsa->nwords; i++) { if ((i<10) || (((uint)i) >wcsa->nwords-5)) { getWord(wcsa,i,&str,&len); fprintf(stderr,"\n freq[%6d]=%6u ",i, wcsa->freqs[i]); fprintf(stderr,", words[%6d] = ",i); printWord(str,len); } } } t1 = getSYSTimeBF(); fprintf(stderr,"\n %u words have been loaded", wcsa->nwords); fprintf(stderr,"\n... Done: %2.2f seconds (sys+usr time)\n", getSYSTimeBF() -t1); */ #ifdef FREQ_VECTOR_AVAILABLE /** 3 ** Loading freq vector */ {uint size; //the size of the vocabulary in #ofwords =>> already init in "loadvocabulary" loadFreqVector(&(wcsa->freqs), &size, (char *)inbasename); fprintf(stderr,"\t*Loaded freq vector: %d bytes\n", wcsa->nwords * sizeof(uint) ); } #endif /** 5 ** Loading the Representation of the source text */ load_representation( &wcsa->ct,inbasename); { uint size; size_representation(wcsa->ct, &size); fprintf(stderr,"\n\t*Loaded (compressed) representation of the source Text: %u bytes\n", size); } /** 4 ** Loading the uncompressed posting lists previously created by the indexer. */ //Preparing a "list of occurrences" that will be later indexed through build_il() ** uint *source_il, sourcelen_il; uint maxPost ; //just to check it loads OK ;) ulong source_il_ulong; t1 = getSYSTimeBF(); fprintf(stderr,"\n... 
Loading the posting lists from disk \n"); fflush(stderr); load_posting_lists_from_file(&maxPost, &source_il_ulong, &source_il, inbasename); /** FOR CIKM ILISTS_DO NOT STILL SUPPORT an ULONG HERE **/ sourcelen_il = (uint)source_il_ulong; fprintf(stderr,"\n there are %lu uints in all the lists of occurrences: size [uint32] = %lu bytes.\n ", (ulong)sourcelen_il - wcsa->nwords -2, (ulong) sizeof(uint)*(sourcelen_il - wcsa->nwords -2)); fprintf(stderr,"\n MAXPOST loaded = %u, source_il_len = %u \n\n",maxPost,sourcelen_il); fprintf(stderr,"\n NLISTS loaded = %u, MAXPOSTS_sET \n\n",source_il[0],source_il[1]); t1 = getSYSTimeBF(); fprintf(stderr,"\n**... entering BUILD INVERTED LIST REPRESENTATION************!! \n\n"); fflush(stderr); /* { char fileposts[2048]; sprintf(fileposts,"%s.%s.%u","postinglistsXX","posts", getpid()); FILE *ff = fopen(fileposts,"w"); fwrite(source_il, sizeof(uint), sourcelen_il,ff); fclose(ff); } */ //compressing the lists of occurrences and setting wcsa->ils int error = build_il(source_il, sourcelen_il, build_options, &(wcsa->ils)); //source_il is freed inside!. IFERRORIL(error); fprintf(stderr,"\n... Done: %2.2f seconds (sys+usr time)\n", getSYSTimeBF() -t1); #ifndef FREQ_VECTOR_AVAILABLE //<----- not needed in advance, only during construction free(wcsa->freqs); #endif ulong sizeI; index_size(*index, &sizeI); fflush(stderr); fflush(stdout); fprintf(stderr,"\n ---------------------------------------------"); fprintf(stderr,"\n The index has already been built: %lu bytes!!\n", sizeI); fprintf(stderr,"\n... Done: OVERALL TIME = %2.2f seconds (sys+usr time)", getSYSTimeBF() -t0); fprintf(stderr,"\n ---------------------------------------------\n\n\n"); fflush(stderr); fflush(stdout); return 0; }
/** *********************************************************************************** CONSTRUCTION OF THE INDEX, from a given text file "inbasename". ***********************************************************************************/ int build_WordIndex (char *inbasename, char *build_options, void **index){ twcsa *wcsa; wcsa = (twcsa *) malloc (sizeof (twcsa) * 1); *index = wcsa; wcsa->text = NULL; double t0, t1; t0 = getSYSTimeBF(); //char path2repaircompressor[1000]="./src/repair64bit/repairCompressor"; //wcsa->blockSize = DEFAULT_BLOCK_SIZE; //wcsa->q = DEFAULT_QGRAM_LEN; /** processing the parameters of the index:: blockSize, and q-gram-len (q) */ { char delimiters[] = " =;"; int j,num_parameters; char ** parameters; if (build_options != NULL) { parse_parameters_II(build_options,&num_parameters, ¶meters, delimiters); for (j=0; j<num_parameters;j++) { if ((strcmp(parameters[j], "blocksize") == 0 ) && (j < num_parameters-1) ) { //wcsa->blockSize = atoi(parameters[j+1]) * BLOCK_MULTIPLIER; j++; } else if ((strcmp(parameters[j], "qgram") == 0 ) && (j < num_parameters-1) ) { //wcsa->q =atoi(parameters[j+1]); j++; } else if ((strcmp(parameters[j], "path2repaircompressor") == 0 ) && (j < num_parameters-1) ) { //strcpy(path2repaircompressor,parameters[j+1]); j++; } } free_parameters_II(num_parameters, ¶meters); } //fprintf(stderr,"\n Parameters of II-blocks:: *basename = %s, blocksize = %d, q= %d",inbasename,wcsa->blockSize,wcsa->q); //fprintf(stderr,"\n \t path2repaircompressor= %s\n",path2repaircompressor); } /** 0 ** Inicializes the arrays used to detect if a char is valid or not. **/ StartValid(); /** 1 ** Loads the compressed text into memory. */ t1 = getSYSTimeBF(); fprintf(stderr,"\n... Entering LoadTextInmem:%s\n",inbasename); loadTextInMem(&(wcsa->text), &(wcsa->sourceTextSize),(char *)inbasename); fprintf(stderr,"... Loaded Source Sequence: %lu bytes\n", wcsa->sourceTextSize); fprintf(stderr,"... 
Done: %2.2f seconds (sys+usr time)\n\n", getSYSTimeBF() -t1); fflush(stderr); /** 2 ** loads the array of document boundaries */ uint ndocs; ulong *docboundaries; loadDocBeginngins(&docboundaries, &ndocs,(char *)inbasename); wcsa->ndocs = ndocs; //just for statistics. /** 3 ** Parses the sequence and gathers the vocabulary of words (sorted alphanumerically) the frecuency of such words: obtains "words", "nwords", and "wordsZone" Sets also wcsa->freqs (freq of each word) Sets also wcsa->maxNumOccs (needed for malloc during extraction) */ fprintf(stderr,"\n... Entering CreateVocabularyOfWords (1st pass) \n"); fflush(stderr); CreateVocabularyOfWords(*index, docboundaries, ndocs); //shows the words parsed... { int i; fprintf(stderr,"\n\n Despues de sorting ...."); fflush(stderr); unsigned char *str; uint len; // for (i = 0; i<100; i++) { for (i = 0; ((uint)i)<wcsa->nwords; i++) { if ((i<10) || (((uint)i) >wcsa->nwords-5)) { getWord(wcsa,i,&str,&len); fprintf(stderr,"\n freq[%6d]=%6u ",i, wcsa->freqs[i]); fprintf(stderr,", words[%6d] = ",i); printWord(str,len); } } } t1 = getSYSTimeBF(); fprintf(stderr,"\n %u words have been parsed", wcsa->nwords); fprintf(stderr,"\n... Done: %2.2f seconds (sys+usr time)\n", getSYSTimeBF() -t1); /** 4 ** creates a temporal list of occurrences of each word (block-oriented). gives also the len of each list */ { //decompression of the source text and creation of occList[][] and lenList[] uint **occList; uint *lenList; t1 = getSYSTimeBF(); fprintf(stderr,"\n... Entering createListsOfOccurrences (2nd pass) \n"); fflush(stderr); createListsOfOccurrences (*index, &occList, &lenList, docboundaries, ndocs); fprintf(stderr,"\n %u lists of occurrences were created.", wcsa->nwords);fflush(stderr); fprintf(stderr,"\n... 
Done: %2.2f seconds (sys+usr time)\n", getSYSTimeBF() -t1); #ifdef CIKM2011_HURRY free(wcsa->text); wcsa->text = NULL; #endif //Preparing a "list of occurrences" that will be later indexed through build_il() ** uint *source_il, sourcelen_il; uint maxPost = ndocs; uint nwords = wcsa->nwords; ulong source_il_ulong; t1 = getSYSTimeBF(); fprintf(stderr,"\n... Entering prepareSourceFormatForIListBuilder \n"); fflush(stderr); prepareSourceFormatForIListBuilder(nwords,maxPost,lenList, occList, &source_il, &source_il_ulong); /** FOR CIKM ILISTS_DO NOT STILL SUPPORT an ULONG HERE **/ sourcelen_il = (uint)source_il_ulong; fprintf(stderr,"\n there are %lu uints in all the lists of occurrences: size [uint32] = %lu bytes.\n ", (ulong)sourcelen_il - nwords -2, (ulong) sizeof(uint)*(sourcelen_il - nwords -2)); /* char fileuintpostings[256] = "postingsSequence.uint32"; output_posting_lists_concatenated_DEBUGGING_ONLY (nwords, maxPost, lenList, occList,fileuintpostings); */ { char fileposts[2048]; sprintf(fileposts,"%s.%s.%u","postinglists","posts", getpid()); FILE *ff = fopen(fileposts,"w"); fwrite(source_il, sizeof(uint), sourcelen_il,ff); fclose(ff); } /* FILE *ff = fopen ("2gbnopositional.posts.uint32","w"); fwrite (source_il, sizeof(uint), sourcelen_il, ff); fclose(ff); */ fprintf(stderr,"\n the lists of occurrences were formatted for build_il.");fflush(stderr); fprintf(stderr,"\n...Done: %2.2f seconds (sys+usr time)\n", getSYSTimeBF() -t1); t1 = getSYSTimeBF(); fprintf(stderr,"\n**... entering BUILD INVERTED LIST REPRESENTATION!! \n"); fflush(stderr); //compressing the lists of occurrences and setting wcsa->ils int error = build_il(source_il, sourcelen_il, build_options, &(wcsa->ils)); //source_il is freed inside!. IFERRORIL(error); fprintf(stderr,"\n... 
Done: %2.2f seconds (sys+usr time)\n", getSYSTimeBF() -t1); { //frees memory for the posting lists uint i; for (i=0;i<wcsa->nwords;i++) free(occList[i]); free(occList); free(lenList); } /** 5 ** compressed representation of the source text */ { #ifdef CIKM2011_HURRY uchar text_null[109] = "NULL-TEXT"; uint text_len_null=1; uint docbounds_null[2]= {0,10}; uint ndoc_null = 1; build_representation (text_null, text_len_null, docbounds_null, ndoc_null, build_options, &wcsa->ct); #endif #ifndef CIKM2011_HURRY build_representation (wcsa->text, wcsa->sourceTextSize, docboundaries, ndocs, build_options, &wcsa->ct); #endif unsigned char *document; uint doclen; extract_doc_representation (wcsa->ct, 0, &document, &doclen); fprintf(stderr,"\n =================== DOC 0 ======================"); fprintf(stderr,"\n%s",document); fprintf(stderr,"\n =================== ***** ======================\n"); free(document); //free(docbegsUL); } } #ifndef FREQ_VECTOR_AVAILABLE //<----- not needed in advance, only during construction free(wcsa->freqs); #endif free(docboundaries); ulong sizeI; index_size(*index, &sizeI); fflush(stderr); fflush(stdout); fprintf(stderr,"\n The index has already been built: %lu bytes!!\n", sizeI); fprintf(stderr,"\n... Done: OVERALL TIME = %2.2f seconds (sys+usr time)\n\n\n", getSYSTimeBF() -t0); fflush(stderr); fflush(stdout); return 0; }
/** *********************************************************************************** CONSTRUCTION OF THE INDEX ***********************************************************************************/ int build_WordIndex (char *inbasename, char *build_options, void **index){ twcsa *wcsa; wcsa = (twcsa *) malloc (sizeof (twcsa) * 1); *index = wcsa; wcsa->text = NULL; //char path2repaircompressor[1000]="./src/repair64bit/repairCompressor"; //wcsa->blockSize = DEFAULT_BLOCK_SIZE; //wcsa->q = DEFAULT_QGRAM_LEN; /** processing the parameters of the index:: blockSize, and q-gram-len (q) */ { char delimiters[] = " =;"; int j,num_parameters; char ** parameters; if (build_options != NULL) { parse_parameters_II(build_options,&num_parameters, ¶meters, delimiters); for (j=0; j<num_parameters;j++) { if ((strcmp(parameters[j], "blocksize") == 0 ) && (j < num_parameters-1) ) { //wcsa->blockSize = atoi(parameters[j+1]) * BLOCK_MULTIPLIER; j++; } else if ((strcmp(parameters[j], "qgram") == 0 ) && (j < num_parameters-1) ) { //wcsa->q =atoi(parameters[j+1]); j++; } else if ((strcmp(parameters[j], "path2repaircompressor") == 0 ) && (j < num_parameters-1) ) { //strcpy(path2repaircompressor,parameters[j+1]); j++; } } free_parameters_II(num_parameters, ¶meters); } //fprintf(stderr,"\n Parameters of II-blocks:: *basename = %s, blocksize = %d, q= %d",inbasename,wcsa->blockSize,wcsa->q); //fprintf(stderr,"\n \t path2repaircompressor= %s\n",path2repaircompressor); } /** 0 ** Inicializes the arrays used to detect if a char is valid or not. **/ StartValid(); /** 1 ** Loads the compressed text into memory. */ loadTextInMem(&(wcsa->text), &(wcsa->textSize),(char *)inbasename); wcsa->sourceTextSize = wcsa->textSize; fprintf(stderr,"... 
Loaded Source Sequence: %u bytes\n", wcsa->textSize); fflush(stderr); fflush(stderr); /** 2 ** loads the array of document boundaries */ uint ndocs; uint *docboundaries; loadDocBeginngins(&docboundaries, &ndocs,(char *)inbasename); wcsa->ndocs = ndocs; //just for statistics. /** 3 ** Parses the sequence and gathers the vocabulary of words (sorted alphanumerically) the frecuency of such words: obtains "words", "nwords", and "wordsZone" Sets also wcsa->freqs (freq of each word) Sets also wcsa->maxNumOccs (needed for malloc during extraction) */ fprintf(stderr,"... Entering CreateVocabularyOfWords \n"); fflush(stderr); CreateVocabularyOfWords(*index, docboundaries, ndocs); //shows the words parsed... { int i; fprintf(stderr,"\n\n Después de sorting ...."); fflush(stderr); unsigned char *str; uint len; // for (i = 0; i<100; i++) { for (i = 0; i<wcsa->nwords; i++) { if ((i<15) || (i>wcsa->nwords-5)) { getWord(wcsa,i,&str,&len); fprintf(stderr,"\n freq[%6d]=%6u ",i, wcsa->freqs[i]); fprintf(stderr,", words[%6d] = ",i); printWord(str,len); } } } fprintf(stderr,"\n %u words have been parsed", wcsa->nwords); /** 4 ** creates a temporal list of occurrences of each word (block-oriented). gives also the len of each list */ { //decompression of the source text and creation of occList[][] and lenList[] uint **occList; uint *lenList; uint *doc_offsets_sids; createListsOfOccurrences (*index, &occList, &lenList, &doc_offsets_sids, docboundaries, ndocs); wcsa->doc_offsets_sids=doc_offsets_sids; fprintf(stderr,"\n %u lists of occurrences were created.", wcsa->nwords);fflush(stderr); //fprintf(stderr,"\n The ranks of the document beginnings are:"); //{ // int i; // for (i=0;i<=ndocs;i++) fprintf(stderr,"[%u-th-> %u]",i,wcsa->doc_offsets_sids[i]); //} //Preparing a "list of occurrences" that will be later indexed through build_il() ** uint *source_il, sourcelen_il; //uint maxPost = ndocs; //!! 
uint maxPost = doc_offsets_sids[ndocs]; uint nwords = wcsa->nwords; prepareSourceFormatForIListBuilder(nwords,maxPost,lenList, occList, &source_il, &sourcelen_il); /* char fileuintpostings[256] = "postingsSequence.uint32"; output_posting_lists_concatenated_DEBUGGING_ONLY (nwords, maxPost, lenList, occList,fileuintpostings); */ FILE *ff = fopen ("postings.posts","w"); fwrite (source_il, sizeof(uint), sourcelen_il, ff); fclose(ff); #ifdef WRITE_POSTING_LIST FILE *ff = fopen ("postings.posts","w"); fwrite (source_il, sizeof(uint), sourcelen_il, ff); fclose(ff); #endif fprintf(stderr,"\n there are %lu uints in all the lists of occurrences: size [uint32] = %lu bytes.\n ", (ulong)sourcelen_il - nwords -2, (ulong) sizeof(uint)*(sourcelen_il - nwords -2)); fprintf(stderr,"\n maxPostValue = %u",maxPost); /*{ char fileposts[256]; sprintf(fileposts,"%s.%s.%u","POSTING_LISTS","posts", getpid()); FILE *ff = fopen(fileposts,"w"); fwrite(source_il, sizeof(uint), sourcelen_il,ff); fclose(ff); } for (int x=0;x<10;x++) printf("\n%u --> %u",x,source_il[x]); exit(0); */ fprintf(stderr,"\n the lists of occurrences were formatted for build_il.");fflush(stderr); //compressing the lists of occurrences and setting wcsa->ils int error = build_il(source_il, sourcelen_il, build_options, &(wcsa->ils)); //source_il is freed inside!. 
IFERRORIL(error); { //frees memory for the posting lists uint i; for (i=0;i<wcsa->nwords;i++) free(occList[i]); free(occList); free(lenList); } /** 5 ** compressed representation of the source text */ { //unsigned long *docbegsUL; unsigned int i; //docbegsUL = (ulong *) malloc (sizeof(ulong) * (ndocs+1)); //for (i=0;i<=ndocs;i++) docbegsUL[i] = docboundaries[i]; //build_representation (wcsa->text, wcsa->textSize, docbegsUL, ndocs, build_options, &wcsa->ct); fprintf(stderr,"\nNow compressing the text: %lu bytes", (ulong)wcsa->textSize);fflush(stderr); build_representation (wcsa->text, wcsa->textSize, docboundaries, ndocs, build_options, &wcsa->ct); unsigned char *document; uint doclen; extract_doc_representation (wcsa->ct, 0, &document, &doclen); fprintf(stderr,"\n =================== DOC 0 ======================"); fprintf(stderr,"\n%s",document); fprintf(stderr,"\n =================== ***** ======================\n"); free(document); //free(docbegsUL); } } #ifndef FREQ_VECTOR_AVAILABLE //<----- not needed in advance, only during construction free(wcsa->freqs); #endif free(docboundaries); ulong sizeI; index_size(*index, &sizeI); fflush(stderr); fflush(stdout); fprintf(stderr,"\n The index has already been built: %lu bytes!!\n", sizeI); fflush(stderr); fflush(stdout); return 0; }