Esempio n. 1
0
lzma_index_stream_size(const lzma_index *i)
{
	// A Stream consists of Stream Header, Blocks, Index and Stream
	// Footer. The same constant is used for the Header and the Footer,
	// so it is simply counted twice.
	return i->total_size
			+ index_size(i->count, i->index_list_size)
			+ 2 * LZMA_STREAM_HEADER_SIZE;
}
Esempio n. 2
0
/*
 * Rewrites the index stored at `path` from the in-memory index `idx`,
 * applying the attribute values of `entry` for the record `id`.
 *
 * Steps, as visible in this function:
 *   1. scan_attrs() returns a per-attribute flag array and may grow `size`.
 *   2. The size is bumped to the next 1024 boundary (a full extra 1024 is
 *      added when it is already aligned) and the file is mapped.
 *   3. Every existing term is re-emitted with write_term(); a term whose
 *      value matches an attribute of `entry` is written with append=true
 *      and that attribute is flagged SKIP. An unmatched term whose only
 *      id is `id` is dropped and the term count decremented.
 *   4. Attributes still flagged WRITE are emitted as new terms.
 *   5. The count is stored in network byte order, the KDF parameters and
 *      index key are written, both boxes are encrypted, and the mapping
 *      is handed to mmsync().
 *
 * Returns false if scan_attrs() or mmfile() fails, otherwise whatever
 * mmsync() returns.
 */
bool update_index(char *path, idx *idx, uint8_t *kek, kdfp *kdfp, uint8_t *id, entry *entry) {
    size_t size    = index_size(idx);
    uint8_t *flags = scan_attrs(entry, &size);

    // Fail before touching the file: nothing has been mapped yet, so
    // there is nothing to release.
    if (!flags) return false;

    size += 1024 - (size % 1024);

    void *addr = mmfile(path, &size);
    if (!addr) {
        free(flags);
        return false;
    }

    // Only derive pointers into the mapping once it is known to be valid.
    box     *kbox = BOX_PTR(addr, KDFP_LEN);
    box     *data = BOX_PTR(kbox, BOX_LEN(KEY_LEN));
    uint8_t *key  = BOX_DATA(kbox);

    // The data box starts with a 32-bit term count followed by the terms.
    uint32_t *count = (uint32_t *) BOX_DATA(data);
    uint8_t *cursor = BOX_DATA(data) + sizeof(uint32_t);

    *count = idx->count;
    for (uint32_t i = 0; i < idx->count; i++) {
        term *cur   = &idx->terms[i];
        bool append = false;

        // Look for a not-yet-consumed attribute with the same value.
        for (uint32_t j = 0; j < entry->count; j++) {
            string *val = &entry->attrs[j].val;
            if (flags[j] == SKIP) continue;
            if (val->len == cur->len && !memcmp(val->str, cur->str, val->len)) {
                flags[j] = SKIP;
                append   = true;
                break;
            }
        }

        // The term referenced only this id and no longer matches: drop it.
        if (!append && cur->count == 1 && !memcmp(cur->ids, id, ID_LEN)) {
            (*count)--;
            continue;
        }

        cursor = write_term(cursor, cur, id, append);
    }

    // Attributes that matched no existing term become new terms.
    for (uint32_t i = 0; i < entry->count; i++) {
        string *val = &entry->attrs[i].val;
        if (flags[i] == WRITE) {
            term fresh = { .len = val->len, .str = val->str };
            cursor = write_term(cursor, &fresh, id, true);
            (*count)++;
        }
    }

    *count = htonl(*count);
    free(flags);

    write_kdfp(addr, kdfp);
    memcpy(key, idx->key, KEY_LEN);
    encrypt_box(key, data, INDEX_LEN(size));
    encrypt_box(kek, kbox, KEY_LEN);

    return mmsync(path, addr, size);
}
Esempio n. 3
0
/*
 * Command-line index builder.
 *
 *   argv[1]      input file
 *   argv[2]      output file for the built index
 *   argv[3..]    optional build parameters; they are joined with single
 *                spaces into one string and handed to build_index()
 *
 * Reads the input, builds and saves the index, then reports the building
 * time on stderr and the size/compression figures on stdout.
 */
int main(int argc, char *argv[]) {

	char *infile, *outfile;
	uchar *text;
	char *params = NULL;
	ulong text_len;
	void *index;
	int error, i;
	double start, end;

	if (argc < 3) print_usage(argv[0]);
	if (argc > 3) {
		/* Every option needs its characters plus one extra byte: a
		 * separating space, or the terminating NUL for the last one. */
		size_t total = (size_t) (argc - 3);
		size_t used = 0;

		for (i = 3; i < argc; i++)
			total += strlen(argv[i]);

		params = (char *) malloc(total);
		if (params == NULL) {
			fprintf(stderr, "Out of memory while collecting build parameters\n");
			exit(1);
		}

		for (i = 3; i < argc; i++) {
			size_t len = strlen(argv[i]);
			memcpy(params + used, argv[i], len);
			params[used + len] = ' ';
			used += len + 1;
		}
		/* Turn the trailing separator into the string terminator. */
		params[used - 1] = '\0';
	}

	infile = argv[1];
	outfile = argv[2];

	start = getTime();
	error = read_file(infile, &text, &text_len);
	IFERROR(error);

	error = build_index(text, text_len, params, &index);
	IFERROR(error);

	error = save_index(index, outfile);
	IFERROR(error);
	end = getTime();

	fprintf(stderr, "Building time: %.3f secs\n", end-start );

	ulong index_len;
	index_size(index, &index_len);
	fprintf(stdout,"Input: %lu bytes --> Output %lu bytes.\n", text_len, index_len);
	fprintf(stdout,"Overall compression --> %.2f%% (%.2f bits per char).\n\n",
     			(100.0*index_len)/text_len, (index_len*8.0)/text_len);

	error = free_index(index);
	IFERROR(error);

	/* The index is gone, so nothing can still refer to the option string. */
	free(params);

	exit(0);
}
Esempio n. 4
0
lzma_index_file_size(const lzma_index *i)
{
	// old.streams_size already covers the Stream Header, Index and
	// Stream Footer of every Stream except the last one when several
	// Streams have been concatenated. Only the last Stream's Index is
	// therefore sized here, using the Record count and Index list size
	// that were added on top of the old values.
	return i->old.streams_size
			+ i->total_size
			+ index_size(i->count - i->old.count,
				i->index_list_size - i->old.index_list_size)
			+ 2 * LZMA_STREAM_HEADER_SIZE;
}
Esempio n. 5
0
/** Returns detailed info about the data structures of the index.
 *
 *  Collects the sizes of the inverted lists, the vocabulary, the compressed
 *  text representation and the whole index, formats them into msg (which
 *  the caller must have made large enough; its capacity is not known here)
 *  and also echoes the report to stderr. Always returns 0; failures of
 *  size_il() are handled by IFERRORIL.
 **/
int index_info(void *index, char msg[]) {
	twcsa *wcsa=(twcsa *)index;
	char msgt[1000];
	int error;
	uint sizeil; 
	
	error = size_il(wcsa->ils, &sizeil);
	IFERRORIL(error);

	/* Sanity check on the value size_il() produced (not on the address of
	 * the size_il function). The uint is reinterpreted as a signed int so
	 * that a wrapped-around ("negative") size is also caught where long is
	 * wider than int. */
	long sizeL = (long) (int) sizeil;
	if (sizeL <0){
		printf("\n\n [WARNING!! size is being reported badly (%ld bytes!) due to IL_postings_list_technique\n",sizeL);
	}
	
	ulong indexs;
	index_size(index,&indexs);
	
	ulong Text_length;
	get_length(index, &Text_length);	


	uint totalVoc=0;  //Vocabulary of words.
	totalVoc += ((((wcsa->nwords+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint));  //the pointers
	totalVoc += wcsa->wordsData.wordsZoneMem.size * sizeof(byte); //the characters of the words.
		
	uint sizerepres; //Compressed representation of the source text
	size_representation(wcsa->ct, &sizerepres);
		
	/* Each line is formatted into the bounded scratch buffer first and then
	 * appended to the caller's buffer. */
	sprintf(msg,"\n\tSpace statistics for II-words-DocOriented\n");
	snprintf (msgt,sizeof msgt,"\t tindex structure = %lu bytes.\n",(ulong)sizeof(twcsa) );	strcat(msg,msgt);
	snprintf (msgt,sizeof msgt,"\t number of diff words= %u\n",wcsa->nwords );	strcat(msg,msgt);	
	snprintf (msgt,sizeof msgt,"\t number of docs (ndocs)= %u\n",wcsa->ndocs );	strcat(msg,msgt);	
	
	
	snprintf (msgt,sizeof msgt,"\t vocabulary of words = %u bytes.\n", totalVoc );	strcat(msg,msgt);
	snprintf (msgt,sizeof msgt,"\t Inverted list-structure = %u bytes.\n",sizeil );	strcat(msg,msgt);
	snprintf (msgt,sizeof msgt,"\t Text (compressed representation) = %u bytes.\n", sizerepres);	strcat(msg,msgt);
	snprintf (msgt,sizeof msgt,"\t Whole index = %lu bytes ", indexs);	strcat(msg,msgt);
	snprintf (msgt,sizeof msgt," ** Ratio = %2.3f %% **\n ",  100.0*indexs / Text_length);	strcat(msg,msgt);
	
	fprintf(stderr,"\n\tSpace statistics for II-words-DocOriented\n %s",msg);
	
	return 0;
}
Esempio n. 6
0
/** returns detailled info of the data structures of the index **/
int index_info(void *index, char msg[]) {
	twcsa *wcsa=(twcsa *)index;
	char msgt[1000];
	uint sizeil; 
	int error = size_il(wcsa->ils, &sizeil);
	IFERRORIL(error);
	
	ulong indexs;
	index_size(index,&indexs);
	
	ulong Text_length;
	get_length(index, &Text_length);	


	uint totalVoc=0;  //Vocabulary of words.
	totalVoc += ((((wcsa->nwords+1)* (wcsa->wordsData.elemSize))+WI32-1) /WI32) * (sizeof(uint));  //the pointers
	totalVoc += wcsa->wordsData.wordsZoneMem.size * sizeof(byte); //the characters of the words.
	
	uint size_doc_offsets_sids = sizeof (uint) * (wcsa->ndocs+1);
		
	uint sizerepres; //Compressed representation of the source text
	size_representation(wcsa->ct, &sizerepres);
		
	sprintf(msg,"\n\tSpace statistics for II-words-DocOriented\n");
	sprintf (msgt,"\t tindex structure = %lu bytes.\n",(ulong)sizeof(twcsa) );	strcat(msg,msgt);
    sprintf (msgt,"\t number of diff words= %u\n",wcsa->nwords );	strcat(msg,msgt);	
    sprintf (msgt,"\t number of docs (ndocs)= %u\n",wcsa->ndocs );	strcat(msg,msgt);	
	
	
	sprintf (msgt,"\t vocabulary of words = %u bytes.\n", totalVoc );	strcat(msg,msgt);
	sprintf (msgt,"\t Mapping Docs-2-wordposition = %u bytes.\n", size_doc_offsets_sids);	strcat(msg,msgt);
	sprintf (msgt,"\t Inverted list-structure = %u bytes.\n",sizeil );	strcat(msg,msgt);
	sprintf (msgt,"\t Text (compressed representation) = %u bytes.\n", sizerepres);	strcat(msg,msgt);
	sprintf (msgt,"\t Whole index = %lu bytes ", indexs);	strcat(msg,msgt);
	sprintf (msgt," ** Ratio = %2.3f %% **\n ",  100.0*indexs / Text_length);	strcat(msg,msgt);
	
	fprintf(stderr,"\n\tSpace statistics for II-words-DocOriented\n %s",msg);
	
	return 0;
}
Esempio n. 7
0
static lzma_vli
index_file_size(lzma_vli compressed_base, lzma_vli unpadded_sum,
		lzma_vli record_count, lzma_vli index_list_size,
		lzma_vli stream_padding)
{
	// Everything except the Index field: earlier Streams and Stream
	// Paddings, Stream Header, Blocks, Stream Footer and Stream Padding.
	//
	// When called from lzma_index_append(), unpadded_sum may be large
	// enough to push the total past LZMA_VLI_MAX, hence the range
	// checks after each step.
	lzma_vli total = compressed_base + 2 * LZMA_STREAM_HEADER_SIZE
			+ stream_padding + vli_ceil4(unpadded_sum);

	if (total <= LZMA_VLI_MAX) {
		// Now add the Index field and check the limit once more.
		total += index_size(record_count, index_list_size);
		if (total <= LZMA_VLI_MAX)
			return total;
	}

	return LZMA_VLI_UNKNOWN;
}
Esempio n. 8
0
/* Prints how large the encoded sequence files of index `indexname` are,
   as a percentage of the summed size of the original input files. */
static void show_encoded_statistics(GtStrArray *infiles, const char *indexname)
{
  /* the index files that together make up the encoded sequence */
  const char *const suffixes[] = {
    GT_ALPHABETFILESUFFIX,
    GT_ENCSEQFILESUFFIX,
    GT_SSPTABFILESUFFIX,
    GT_DESTABFILESUFFIX,
    GT_SDSTABFILESUFFIX,
    GT_OISTABFILESUFFIX
  };
  off_t total_input = 0, total_encoded = 0;
  int file_idx = 0;
  size_t s;

  gt_assert(infiles);

  while (file_idx < gt_str_array_size(infiles)) {
    const char *path = gt_str_array_get(infiles, file_idx);
    total_input += gt_file_size(path);
    file_idx++;
  }

  for (s = 0; s < sizeof suffixes / sizeof suffixes[0]; s++)
    total_encoded += index_size(indexname, suffixes[s]);

  printf("encoded sequence file(s) are %.1f%% of original file size\n",
         ((double) total_encoded / total_input) * 100.0);
}
Esempio n. 9
0
lzma_index_size(const lzma_index *i)
{
	// Thin wrapper: forwards the Record count and the Index list size
	// stored in *i to index_size() and returns its result unchanged.
	return index_size(i->count, i->index_list_size);
}
Esempio n. 10
0
int main(int argc, char *argv[]) {

	char *infile, *outfile;
    uchar *text;
	ulong text_len;
	void *index;
	int error;

    if (argc < 3) print_usage(argv[0]);

	infile = argv[1];  // input file 
	outfile = argv[2]; // output file

	error = read_file(infile, &text, &text_len);
	IFERROR(error);

	/* Possible options:
	   "-a x": indicates the behaviour of FM-index with the pointer 'text'.
			   x == 0 FM-index uses 'text' directly to build the suffix array.
					  This means that you are responsable to allocate 'length+overshoot' 
					  bytes for the text instead of 'length' bytes. You must include
					  ds_ssort.h. See function read_file();
			   x == 1 FM-index frees the allocated memory for the 'text'. overshoot 
	                  and ds_ssort.h are not necessary.
			   x == 2 FM-index makes its internal copy of 'text'. After the call, 
	                  'text' is available. overshoot and ds_ssort.h are not 
		              necessary.
	    -B Bsize: where Bsize is the size in Kbytes of level 1 buckets.
		-b bsize: where bsize is the size in bytes of level 2 buckets. 
				  bsize must divide Bsize*1024;
	    -f frequency: where frequency is a number from 0 to 1 that indicates the 
					  frequency of the marked characters.
	
	   default "-b 512 -B 16 -f 0.02 -a 1"
	   
	   Example of some call to build_index():
	   - build_index(text, text_len, NULL, &index);
		 uses the default parameters.
	   - build_index(text, text_len, "-a 1 -f 0.1", &index);
	   	 tries to mark 10% of the positions instead of 2% but I cannot reuse 'text' 
         after this call. 	 
	*/
	fprintf(stdout, "Building\n");	
	error = build_index(text, text_len, "-a 1", &index);
	IFERROR(error);

	ulong index_len;
	index_size(index, &index_len);
	fprintf(stdout,"Input: %lu bytes --> Output %lu bytes.\n", text_len, index_len);
	fprintf(stdout,"Overall compression --> %.2f%% (%.2f bits per char).\n\n",
     			(100.0*index_len)/text_len, (index_len*8.0)/text_len);

	uchar *snippet;
	ulong i, readen, from = 11, to = 100, numocc, *occ;
	error = extract(index, from, to, &snippet, &readen);
	IFERROR(error);
	fprintf(stdout, "try extract\n\n");
	for(i=0;i<readen;i++)
		printf("%c", snippet[i]);
	printf("\n");
	
	uchar *pattern = snippet;
	fprintf(stdout, "try count\n\n");
	error =	count (index, pattern, 5, &numocc);
	printf("pattern: ");
	fwrite(pattern, sizeof(uchar), 5, stdout);
	printf(" # occs %lu\n\n",numocc);
	
	fprintf(stdout, "try locate\n\n");
	error =	locate (index, pattern, 5, &occ, &numocc);
	IFERROR (error);
	
	for(i=0;i<numocc;i++)
		printf("pos %lu\n", occ[i]);
	printf("\n");
	
	
	free(snippet);
	if(numocc) free(occ);
	error = save_index(index, outfile);
	IFERROR(error);

	error = free_index(index);
	IFERROR(error);


	exit(0);
}
Esempio n. 11
0
	/** Returns the size of the wrapped index as reported by index_size(). */
	size_t TextIndexCSA::getSize() const{
		// Start from 0 so that an index_size() call which does not store
		// a result cannot make this method return an indeterminate value.
		ulong size = 0;
		index_size(csa, &size);
		return (size_t)size;
	}
Esempio n. 12
0
/** ***********************************************************************************
	 CONSTRUCTION OF THE INDEX from data previously stored under "inbasename":
	 the vocabulary, the configuration constants, the compressed text
	 representation and the uncompressed posting lists are loaded, and the
	 posting lists are then compressed with build_il().

	 build_options is parsed for the keys "blocksize", "qgram" and
	 "path2repaircompressor", whose values are currently skipped; the string
	 is also forwarded to build_il().
	 On return *index points to the newly allocated twcsa. Returns 0;
	 build_il() failures are handled by IFERRORIL.
    ***********************************************************************************/
int build_WordIndex_from_postings (char *inbasename, char *build_options, void **index){
	twcsa *wcsa;
	wcsa = (twcsa *) malloc (sizeof (twcsa) * 1);
	*index = wcsa;
	void *Index = *index;
	wcsa->text = NULL;
	double t0, t1;
	t0 = getSYSTimeBF();

	/** processing the parameters of the index:: blockSize, and q-gram-len (q) */
	{
		char delimiters[] = " =;";
		int j,num_parameters;
		char ** parameters;
		
		if (build_options != NULL) {
			parse_parameters_II(build_options,&num_parameters, &parameters, delimiters);
			for (j=0; j<num_parameters;j++) {

			  /* Each recognised key consumes the following token as its value. */
			  if ((strcmp(parameters[j], "blocksize") == 0 ) && (j < num_parameters-1) ) {
				//wcsa->blockSize = atoi(parameters[j+1]) * BLOCK_MULTIPLIER;	    
				j++;
			  } 
			  else if ((strcmp(parameters[j], "qgram") == 0 ) && (j < num_parameters-1) ) {
				//wcsa->q =atoi(parameters[j+1]);	    
				j++;
			  }
			  else if ((strcmp(parameters[j], "path2repaircompressor") == 0 ) && (j < num_parameters-1) ) {
				//strcpy(path2repaircompressor,parameters[j+1]);	    
				j++;
			  }
			  
			}
			free_parameters_II(num_parameters, &parameters);
		}
		//fprintf(stderr,"\n Parameters of II-blocks:: *basename = %s, blocksize = %d, q= %d",inbasename,wcsa->blockSize,wcsa->q);
		//fprintf(stderr,"\n \t path2repaircompressor= %s\n",path2repaircompressor);
	}

	wcsa->freqs=NULL;


	/** 0 ** Initializes the arrays used to detect if a char is valid or not. **/
	StartValid();
	
		
	/** 1 ** loads the vocabulary of words. zonewords, words vector, nwords */
	t1 = getSYSTimeBF();
	
	loadVocabulary (Index, inbasename);
		{	
		uint totaltmp=0;  //words
		totaltmp += ((((wcsa->nwords+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint));  //the pointers
		totaltmp += wcsa->wordsData.wordsZoneMem.size * sizeof(byte); //the characters of the words.	
		/* totaltmp is unsigned, so it is printed with %u */
		fprintf(stderr,"\n\t*Loaded Vocabulary: %u words, %u bytes", wcsa->nwords, totaltmp);
		}
		fprintf(stderr,"\n\t... Done: %2.2f seconds (sys+usr t)\n", getSYSTimeBF() -t1);		

	/** 2 ** Loads some configuration constants: sourceTextSize, maxNumOccs */
	loadIndexConstants(Index, inbasename);
	fprintf(stderr,"\n\t*Loaded  configuration constants: %lu bytes\n", (ulong) (2 * sizeof(uint ) + sizeof(ulong)) );		
/*
	//shows the words parsed...
	{
		int i;
		fprintf(stderr,"\n\n Despues de sorting ....");	fflush(stderr);
		unsigned char *str;
		uint len;
//		for (i = 0; i<100; i++) {
		for (i = 0; ((uint)i)<wcsa->nwords; i++) {
			if ((i<10) || (((uint)i) >wcsa->nwords-5)) {
				getWord(wcsa,i,&str,&len);				
				fprintf(stderr,"\n freq[%6d]=%6u ",i,  wcsa->freqs[i]);
				fprintf(stderr,", words[%6d] = ",i);
				printWord(str,len);
			}
		}		
	}

	t1 = getSYSTimeBF();
	fprintf(stderr,"\n %u words have been loaded", wcsa->nwords);
	fprintf(stderr,"\n... Done: %2.2f seconds (sys+usr time)\n", getSYSTimeBF() -t1); 					
*/	

	#ifdef FREQ_VECTOR_AVAILABLE
	/** 3 ** Loading freq vector */
	{uint size; //the size of the vocabulary in #ofwords =>> already init in "loadvocabulary"
	loadFreqVector(&(wcsa->freqs), &size, (char *)inbasename);	
	/* the product is a size_t: cast it and print it with %lu */
	fprintf(stderr,"\t*Loaded freq vector: %lu bytes\n", (ulong) (wcsa->nwords * sizeof(uint)) );
	}		
	#endif	
	
	/** 5 ** Loading the Representation of the source text */
	load_representation( &wcsa->ct,inbasename); 
	{
		uint size;
		size_representation(wcsa->ct, &size);	
		fprintf(stderr,"\n\t*Loaded (compressed) representation of the source Text: %u bytes\n", size);
	}
		
	/** 4 ** Loading the uncompressed posting lists previously created by the indexer. */
	
		//Preparing a "list of occurrences" that will be later indexed through build_il() **
		uint *source_il, sourcelen_il;
		uint maxPost ; //just to check it loads OK ;)
		ulong source_il_ulong;

		t1 = getSYSTimeBF();
		fprintf(stderr,"\n... Loading the posting lists from disk \n"); fflush(stderr);	
		
		load_posting_lists_from_file(&maxPost, &source_il_ulong, &source_il, inbasename);

		/** FOR CIKM ILISTS_DO NOT STILL SUPPORT an ULONG HERE **/
		sourcelen_il = (uint)source_il_ulong;
		
		fprintf(stderr,"\n there are %lu uints in all the lists of occurrences: size [uint32] = %lu bytes.\n ", 
		               (ulong)sourcelen_il - wcsa->nwords -2, (ulong) sizeof(uint)*(sourcelen_il - wcsa->nwords -2));
		fprintf(stderr,"\n MAXPOST loaded = %u, source_il_len = %u \n\n",maxPost,sourcelen_il);
		/* both header values are passed, so both get a conversion */
		fprintf(stderr,"\n NLISTS loaded = %u, MAXPOSTS_sET = %u \n\n",source_il[0],source_il[1]);

		
		
		t1 = getSYSTimeBF();
		fprintf(stderr,"\n**... entering BUILD INVERTED LIST REPRESENTATION************!! \n\n"); fflush(stderr);	
/*
		{ char fileposts[2048];
			sprintf(fileposts,"%s.%s.%u","postinglistsXX","posts", getpid());
			FILE *ff = fopen(fileposts,"w");
			fwrite(source_il, sizeof(uint), sourcelen_il,ff);
			fclose(ff);
			
		}
*/
		//compressing the lists of occurrences and setting wcsa->ils
		int error = build_il(source_il, sourcelen_il, build_options, &(wcsa->ils));  //source_il is freed inside!.
		IFERRORIL(error);							

		fprintf(stderr,"\n... Done: %2.2f seconds (sys+usr time)\n", getSYSTimeBF() -t1);
		
	
	#ifndef FREQ_VECTOR_AVAILABLE   //<----- not needed in advance, only during construction
		free(wcsa->freqs);
	#endif

		
	ulong sizeI;
	index_size(*index, &sizeI);
	fflush(stderr); fflush(stdout);
	fprintf(stderr,"\n ---------------------------------------------");
	fprintf(stderr,"\n The index has already been built: %lu bytes!!\n", sizeI);
	fprintf(stderr,"\n... Done: OVERALL TIME = %2.2f seconds (sys+usr time)", getSYSTimeBF() -t0);
	fprintf(stderr,"\n ---------------------------------------------\n\n\n");
	fflush(stderr);
	fflush(stdout);
	return 0;
}
Esempio n. 13
0
/** ***********************************************************************************
	 CONSTRUCTION OF THE INDEX, from a given text file "inbasename".

	 Loads the source text and the document boundaries, gathers the
	 vocabulary (1st pass), creates the per-word lists of occurrences
	 (2nd pass), dumps them to a file named "postinglists.posts.<pid>",
	 compresses them with build_il() and finally builds the compressed
	 representation of the text.

	 build_options is parsed for the keys "blocksize", "qgram" and
	 "path2repaircompressor", whose values are currently skipped; the string
	 is also forwarded to build_il() and build_representation().
	 On return *index points to the newly allocated twcsa. Returns 0;
	 build_il() failures are handled by IFERRORIL.
    ***********************************************************************************/
int build_WordIndex (char *inbasename, char *build_options, void **index){
	twcsa *wcsa;
	wcsa = (twcsa *) malloc (sizeof (twcsa) * 1);
	*index = wcsa;
	wcsa->text = NULL;
	double t0, t1;
	t0 = getSYSTimeBF();

	//char path2repaircompressor[1000]="./src/repair64bit/repairCompressor";
	//wcsa->blockSize = DEFAULT_BLOCK_SIZE;
	//wcsa->q = DEFAULT_QGRAM_LEN;

	/** processing the parameters of the index:: blockSize, and q-gram-len (q) */
	{
		char delimiters[] = " =;";
		int j,num_parameters;
		char ** parameters;
		
		if (build_options != NULL) {
			parse_parameters_II(build_options,&num_parameters, &parameters, delimiters);
			for (j=0; j<num_parameters;j++) {

			  /* Each recognised key consumes the following token as its value. */
			  if ((strcmp(parameters[j], "blocksize") == 0 ) && (j < num_parameters-1) ) {
				//wcsa->blockSize = atoi(parameters[j+1]) * BLOCK_MULTIPLIER;	    
				j++;
			  } 
			  else if ((strcmp(parameters[j], "qgram") == 0 ) && (j < num_parameters-1) ) {
				//wcsa->q =atoi(parameters[j+1]);	    
				j++;
			  }
			  else if ((strcmp(parameters[j], "path2repaircompressor") == 0 ) && (j < num_parameters-1) ) {
				//strcpy(path2repaircompressor,parameters[j+1]);	    
				j++;
			  }
			  
			}
			free_parameters_II(num_parameters, &parameters);
		}
		//fprintf(stderr,"\n Parameters of II-blocks:: *basename = %s, blocksize = %d, q= %d",inbasename,wcsa->blockSize,wcsa->q);
		//fprintf(stderr,"\n \t path2repaircompressor= %s\n",path2repaircompressor);
	}

	/** 0 ** Initializes the arrays used to detect if a char is valid or not. **/
	StartValid();
	
	
	/** 1 ** Loads the compressed text into memory. */
	t1 = getSYSTimeBF();
	fprintf(stderr,"\n... Entering LoadTextInmem:%s\n",inbasename);
	loadTextInMem(&(wcsa->text), &(wcsa->sourceTextSize),(char *)inbasename);	
		
	fprintf(stderr,"... Loaded Source Sequence: %lu bytes\n", wcsa->sourceTextSize); 
	fprintf(stderr,"... Done: %2.2f seconds (sys+usr time)\n\n", getSYSTimeBF() -t1); 
	fflush(stderr);

	/** 2 ** loads the array of document boundaries                           */
	uint ndocs;
	ulong *docboundaries;
	loadDocBeginngins(&docboundaries, &ndocs,(char *)inbasename);	
	wcsa->ndocs = ndocs; //just for statistics.

	
	/** 3 ** Parses the sequence and gathers the vocabulary of words (sorted alphanumerically) 
		the frequency of such words: obtains "words", "nwords", and "wordsZone" 
		Sets also wcsa->freqs (freq of each word)
		Sets also wcsa->maxNumOccs (needed for malloc during extraction) */	
	
	/* Start the timer before the pass so that the "Done" line printed
	   below reports the time the parsing actually took. */
	t1 = getSYSTimeBF();
	fprintf(stderr,"\n... Entering CreateVocabularyOfWords (1st pass) \n"); fflush(stderr);
	CreateVocabularyOfWords(*index, docboundaries, ndocs);


	//shows the words parsed (the first 10 and the last few)...
	{
		int i;
		fprintf(stderr,"\n\n Despues de sorting ....");	fflush(stderr);
		unsigned char *str;
		uint len;
//		for (i = 0; i<100; i++) {
		for (i = 0; ((uint)i)<wcsa->nwords; i++) {
			if ((i<10) || (((uint)i) >wcsa->nwords-5)) {
				getWord(wcsa,i,&str,&len);				
				fprintf(stderr,"\n freq[%6d]=%6u ",i,  wcsa->freqs[i]);
				fprintf(stderr,", words[%6d] = ",i);
				printWord(str,len);
			}
		}		
	}

	fprintf(stderr,"\n %u words have been parsed", wcsa->nwords);
	fprintf(stderr,"\n... Done: %2.2f seconds (sys+usr time)\n", getSYSTimeBF() -t1); 					
	
	/** 4 ** creates a temporal list of occurrences of each word (block-oriented).
					gives also the len of each list */
	{	
		//decompression of the source text and creation of occList[][] and lenList[]
		uint **occList; uint *lenList;
		
		t1 = getSYSTimeBF();
		fprintf(stderr,"\n... Entering createListsOfOccurrences (2nd pass) \n"); fflush(stderr);	
		
		createListsOfOccurrences (*index, &occList, &lenList, docboundaries, ndocs);
		
		fprintf(stderr,"\n %u lists of occurrences were created.", wcsa->nwords);fflush(stderr);
		fprintf(stderr,"\n... Done: %2.2f seconds (sys+usr time)\n", getSYSTimeBF() -t1);





#ifdef CIKM2011_HURRY
	free(wcsa->text);
	wcsa->text = NULL;
#endif
		//Preparing a "list of occurrences" that will be later indexed through build_il() **
		uint *source_il, sourcelen_il;
		uint maxPost = ndocs;
		uint nwords = wcsa->nwords;
		ulong source_il_ulong;

		t1 = getSYSTimeBF();
		fprintf(stderr,"\n... Entering prepareSourceFormatForIListBuilder \n"); fflush(stderr);	
		
		prepareSourceFormatForIListBuilder(nwords,maxPost,lenList, occList, &source_il, &source_il_ulong);

		/** FOR CIKM ILISTS_DO NOT STILL SUPPORT an ULONG HERE **/
		sourcelen_il = (uint)source_il_ulong;
		
		fprintf(stderr,"\n there are %lu uints in all the lists of occurrences: size [uint32] = %lu bytes.\n ", 
		               (ulong)sourcelen_il - nwords -2, (ulong) sizeof(uint)*(sourcelen_il - nwords -2));

		/*
	char fileuintpostings[256] = "postingsSequence.uint32";
	output_posting_lists_concatenated_DEBUGGING_ONLY (nwords, maxPost, lenList, occList,fileuintpostings);		
	*/
		
		
		/* Dump the formatted posting lists to a per-process file. A failure
		   here is only reported: the dump is a by-product and the index
		   can still be built without it. */
		{ char fileposts[2048];
			snprintf(fileposts,sizeof fileposts,"%s.%s.%u","postinglists","posts", (uint) getpid());
			FILE *ff = fopen(fileposts,"w");
			if (ff == NULL) {
				fprintf(stderr,"\n Warning: could not open %s for writing; posting lists were not dumped\n", fileposts);
			}
			else {
				if (fwrite(source_il, sizeof(uint), sourcelen_il,ff) != sourcelen_il)
					fprintf(stderr,"\n Warning: short write while dumping posting lists to %s\n", fileposts);
				fclose(ff);
			}
			
		}		
/*		
	FILE *ff = fopen ("2gbnopositional.posts.uint32","w");
	fwrite (source_il, sizeof(uint), sourcelen_il, ff);
	fclose(ff);
*/	
	
		fprintf(stderr,"\n the lists of occurrences were formatted for build_il.");fflush(stderr);
		fprintf(stderr,"\n...Done: %2.2f seconds (sys+usr time)\n", getSYSTimeBF() -t1);

		
		t1 = getSYSTimeBF();
		fprintf(stderr,"\n**... entering BUILD INVERTED LIST REPRESENTATION!! \n"); fflush(stderr);	

		//compressing the lists of occurrences and setting wcsa->ils
		int error = build_il(source_il, sourcelen_il, build_options, &(wcsa->ils));  //source_il is freed inside!.
		IFERRORIL(error);							

		fprintf(stderr,"\n... Done: %2.2f seconds (sys+usr time)\n", getSYSTimeBF() -t1);
		
		{
			//frees memory for the posting lists
			uint i;
			for (i=0;i<wcsa->nwords;i++) free(occList[i]);
			free(occList);
			free(lenList);
		}
	
		/** 5 ** compressed representation of the source text */	
		{

#ifdef CIKM2011_HURRY
		uchar text_null[109] = "NULL-TEXT"; uint text_len_null=1;
		uint docbounds_null[2]= {0,10};
		uint ndoc_null = 1;
			build_representation (text_null, text_len_null, docbounds_null, ndoc_null, build_options, &wcsa->ct);
#endif
#ifndef CIKM2011_HURRY
			build_representation (wcsa->text, wcsa->sourceTextSize, docboundaries, ndocs, build_options, &wcsa->ct);
#endif

		
			/* Print document 0 back as a visual check of the representation. */
			unsigned char *document;
			uint doclen;
			extract_doc_representation (wcsa->ct, 0, &document, &doclen);
			fprintf(stderr,"\n =================== DOC 0 ======================");
			fprintf(stderr,"\n%s",document);
			fprintf(stderr,"\n =================== ***** ======================\n");
			free(document);
			//free(docbegsUL);
		

		}

	}
	
	#ifndef FREQ_VECTOR_AVAILABLE   //<----- not needed in advance, only during construction
		free(wcsa->freqs);
	#endif

	free(docboundaries);	
		
	ulong sizeI;
	index_size(*index, &sizeI);
	fflush(stderr); fflush(stdout);
	fprintf(stderr,"\n The index has already been built: %lu bytes!!\n", sizeI);
	fprintf(stderr,"\n... Done: OVERALL TIME = %2.2f seconds (sys+usr time)\n\n\n", getSYSTimeBF() -t0);
	fflush(stderr);
	fflush(stdout);
	return 0;
}
Esempio n. 14
0
// Appends one Record, described by unpadded_size and uncompressed_size,
// to the rightmost group of the rightmost Stream of the Index.
//
// Returns:
//   LZMA_PROG_ERROR  if i is NULL or a size argument is out of range
//   LZMA_DATA_ERROR  if the resulting file size or Index field size would
//                    exceed the limits checked below
//   LZMA_MEM_ERROR   if a new Record group could not be allocated
//   LZMA_OK          on success
//
// NOTE(review): i->streams.rightmost is dereferenced without a NULL check,
// so the Index is assumed to always contain at least one Stream - confirm.
lzma_index_append(lzma_index *i, const lzma_allocator *allocator,
		lzma_vli unpadded_size, lzma_vli uncompressed_size)
{
	// Validate.
	if (i == NULL || unpadded_size < UNPADDED_SIZE_MIN
			|| unpadded_size > UNPADDED_SIZE_MAX
			|| uncompressed_size > LZMA_VLI_MAX)
		return LZMA_PROG_ERROR;

	// Last Stream and its last group of Records; g is NULL when the
	// Stream has no groups yet.
	index_stream *s = (index_stream *)(i->streams.rightmost);
	index_group *g = (index_group *)(s->groups.rightmost);

	// Running sums up to (and including) the last existing Record,
	// or zero if there is none. The compressed sum is rounded up with
	// vli_ceil4().
	const lzma_vli compressed_base = g == NULL ? 0
			: vli_ceil4(g->records[g->last].unpadded_sum);
	const lzma_vli uncompressed_base = g == NULL ? 0
			: g->records[g->last].uncompressed_sum;
	// Number of bytes the two new fields add to the Index list.
	const uint32_t index_list_size_add = lzma_vli_size(unpadded_size)
			+ lzma_vli_size(uncompressed_size);

	// Check that the file size will stay within limits.
	if (index_file_size(s->node.compressed_base,
			compressed_base + unpadded_size, s->record_count + 1,
			s->index_list_size + index_list_size_add,
			s->stream_padding) == LZMA_VLI_UNKNOWN)
		return LZMA_DATA_ERROR;

	// The size of the Index field must not exceed the maximum value
	// that can be stored in the Backward Size field.
	if (index_size(i->record_count + 1,
			i->index_list_size + index_list_size_add)
			> LZMA_BACKWARD_SIZE_MAX)
		return LZMA_DATA_ERROR;

	if (g != NULL && g->last + 1 < g->allocated) {
		// There is space in the last group at least for one Record.
		++g->last;
	} else {
		// We need to allocate a new group.
		g = lzma_alloc(sizeof(index_group)
				+ i->prealloc * sizeof(index_record),
				allocator);
		if (g == NULL)
			return LZMA_MEM_ERROR;

		g->last = 0;
		g->allocated = i->prealloc;

		// Reset prealloc so that if the application happens to
		// add new Records, the allocation size will be sane.
		i->prealloc = INDEX_GROUP_SIZE;

		// Set the start offsets of this group.
		g->node.uncompressed_base = uncompressed_base;
		g->node.compressed_base = compressed_base;
		g->number_base = s->record_count + 1;

		// Add the new group to the Stream.
		index_tree_append(&s->groups, &g->node);
	}

	// Add the new Record to the group.
	g->records[g->last].uncompressed_sum
			= uncompressed_base + uncompressed_size;
	g->records[g->last].unpadded_sum
			= compressed_base + unpadded_size;

	// Update the totals.
	// Per-Stream counters first, then the counters of the whole Index.
	++s->record_count;
	s->index_list_size += index_list_size_add;

	i->total_size += vli_ceil4(unpadded_size);
	i->uncompressed_size += uncompressed_size;
	++i->record_count;
	i->index_list_size += index_list_size_add;

	return LZMA_OK;
}
Esempio n. 15
0
//符号表是全局变量
struct InterCodes* translate_Exp(struct TreeNode* Exp, Operand place){
	if (strcmp(Exp->children->name, "INT")){
		printf("Exp - INT\n");
		if(place != NULL){
			FieldList item = (FieldList)malloc(sizeof(struct FieldList_));
			lookupTable(Exp->children->value_str,&item,0);

			place->kind = CONSTANT;
			place->u.value = Exp->children->value_int;

			struct InterCodes* temp = (struct InterCodes*)malloc(sizeof(struct InterCodes));
			temp->code.kind = NONE_;
			return temp;
		}
		else{
			struct InterCodes* temp = (struct InterCodes*)malloc(sizeof(struct InterCodes));
			temp->code.kind = NONE_;
			return temp;
		}
	}
	if (strcmp(Exp->children->name, "ID")){
		if(Exp->children->neighbours == NULL){
			printf("Exp - ID\n");
			if(place != NULL){
				FieldList item = (FieldList)malloc(sizeof(struct FieldList_));
				lookupTable(Exp->children->value_str,&item,0);

				place->kind = VARIABLE;
				strcpy(place->u.ID, item->inter_name);

				struct InterCodes* temp = (struct InterCodes*)malloc(sizeof(struct InterCodes));
				temp->code.kind = NONE_;
				return temp;
			}
			else{
				struct InterCodes* temp = (struct InterCodes*)malloc(sizeof(struct InterCodes));
				temp->code.kind = NONE_;
				return temp;
			}
		}
		if(strcmp(Exp->children->neighbours->name, "DOT")){
			printf("Exp - Exp DOT ID\n");
			//Operand t1 = new_temp();
			//struct InterCodes* code1 = translate_Exp(Exp->children->neighbours->neighbours, t1);

			FieldList item = (FieldList)malloc(sizeof(struct FieldList_));
			int i = lookupTable(Exp->children->neighbours->neighbours->value_str,&item,0);
			switch(i){
				case 0:{
					Operand t1 = new_temp();
					struct InterCodes* code1 = (struct InterCodes*)malloc(sizeof(struct InterCodes));
					code1->code.kind = ADD_;
					code1->code.u.binop.result = t1;
					code1->code.u.binop.op1 = (Operand)malloc(sizeof(struct Operand_));
					code1->code.u.binop.op1->kind = VARIABLE;
					strcpy(code1->code.u.binop.op1->u.ID, item->inter_name);
					code1->code.u.binop.op2 = (Operand)malloc(sizeof(struct Operand_));
					code1->code.u.binop.op2->kind = CONSTANT;
					code1->code.u.binop.op2->u.value = index_size(item, Exp->children->value_str);

					place->kind = REFERENCE;
					strcpy(place->u.ID, t1->u.ID);

					return code1;
					//break;
				}
				case 1:{
					Operand t1 = new_temp();
					Operand t2 = new_temp();

					struct InterCodes* code1 = (struct InterCodes*)malloc(sizeof(struct InterCodes));
					code1->code.kind = ASSIGN_;
					code1->code.u.assign.left = t1;
					code1->code.u.assign.right = (Operand)malloc(sizeof(struct Operand_));
					code1->code.u.assign.right->kind = ADDRESS;
					strcpy(code1->code.u.assign.right->u.ID, item->inter_name);

					struct InterCodes* code2 = (struct InterCodes*)malloc(sizeof(struct InterCodes));
					code2->code.kind = ADD_;
					code2->code.u.binop.result = t2;
					code2->code.u.binop.op1 = t1;
					code2->code.u.binop.op2 = (Operand)malloc(sizeof(struct Operand_));
					code2->code.u.binop.op2->kind = CONSTANT;
					code2->code.u.binop.op2->u.value = index_size(item, Exp->children->value_str);

					place->kind = REFERENCE;
					strcpy(place->u.ID, t2->u.ID);

					code1->next = code2;
					code2->prev = code1;

					return code1;
				}
			}
		}
	}
	if (strcmp(Exp->children->name, "Exp")){
		if (strcmp(Exp->children->neighbours->name, "ASSIGNOP")){
			if (strcmp(Exp->children->neighbours->neighbours->children->name, "ID")){
				printf("Exp - Exp ASSIGNOP Exp\n");
				if(place != NULL){
					Operand t1 = new_temp();
					FieldList item = (FieldList)malloc(sizeof(struct FieldList_));
					lookupTable(Exp->children->neighbours->neighbours->children->value_str,&item,0);

					struct InterCodes* code1 = translate_Exp(Exp->children, t1);

					struct InterCodes* code2 = (struct InterCodes*)malloc(sizeof(struct InterCodes));
					code2->code.kind = ASSIGN_;
					code2->code.u.assign.left = (Operand)malloc(sizeof(struct Operand_));
					code2->code.u.assign.left->kind = VARIABLE;
					strcpy(code2->code.u.assign.left->u.ID, item->inter_name);
					code2->code.u.assign.right = t1;

					/*struct InterCodes* code3 = (struct InterCodes*)malloc(sizeof(struct InterCodes));
					code3->code.kind = ASSIGN;
					code3->code.u.assign.left = (Operand)malloc(sizeof(struct Operand_));
					code3->code.u.assign.left->kind = VARIABLE;
					strcpy(code3->code.u.assign.left->u.ID, place->u.ID);
					code3->code.u.assign.right = (Operand)malloc(sizeof(struct Operand_));
					code3->code.u.assign.right->kind = VARIABLE;
					strcpy(code3->code.u.assign.right->u.ID, Exp->children->neighbours->neighbours->children->inter_name);*/

					place->kind = VARIABLE;
					strcpy(place->u.ID, item->inter_name);

					struct InterCodes* p = code1;
					while(p->next != NULL)
						p = p->next;
					p->next = code2;
					code2->prev = p;
					code2->next = NULL;

					return code1;
				}
				else{
					Operand t1 = new_temp();
					FieldList item = (FieldList)malloc(sizeof(struct FieldList_));
					lookupTable(Exp->children->neighbours->neighbours->children->value_str,&item,0);
					
					struct InterCodes* code1 = translate_Exp(Exp->children, t1);

					struct InterCodes* code2 = (struct InterCodes*)malloc(sizeof(struct InterCodes));
					code2->code.kind = ASSIGN_;
					code2->code.u.assign.left = (Operand)malloc(sizeof(struct Operand_));
					code2->code.u.assign.left->kind = VARIABLE;
					strcpy(code2->code.u.assign.left->u.ID, item->inter_name);
					code2->code.u.assign.right = t1;

					struct InterCodes* p = code1;
					while(p->next != NULL)
						p = p->next;
					p->next = code2;
					code2->prev = p;
					code2->next = NULL;

					return code1;
				}
			}
		}
		if(strcmp(Exp->children->neighbours->name, "PLUS")){
			printf("Exp - Exp PLUS Exp\n");
			if(place != NULL){
				Operand t1 = new_temp();
				Operand t2 = new_temp();
				struct InterCodes* code1 = translate_Exp(Exp->children->neighbours->neighbours, t1);
				struct InterCodes* code2 = translate_Exp(Exp->children, t2);		
				struct InterCodes* code3 = (struct InterCodes*)malloc(sizeof(struct InterCodes));
				code3->code.kind = ADD_;
				code3->code.u.binop.result = (Operand)malloc(sizeof(struct Operand_));
				code3->code.u.binop.result->kind = VARIABLE;
				strcpy(code3->code.u.binop.result->u.ID, place->u.ID);
				code3->code.u.binop.op1 = t1;
				code3->code.u.binop.op2 = t2;

				struct InterCodes* p = code1;
				while(p->next != NULL)
					p = p->next;
				p->next = code2;

				code2->prev = p;
				p = code2;
				while(p->next != NULL)
					p = p->next;
				p->next = code3;
				code3->prev = p;

				code3->next = NULL;

				return code1;
			}
			else{
				Operand t1 = new_temp();
				Operand t2 = new_temp();
				struct InterCodes* code1 = translate_Exp(Exp->children->neighbours->neighbours, t1);
				struct InterCodes* code2 = translate_Exp(Exp->children, t2);

				struct InterCodes* p = code1;
				while(p->next != NULL)
					p = p->next;
				p->next = code2;
				code2->prev = p;

				code2->next = NULL;

				return code1;
			}
		}
		if(strcmp(Exp->children->neighbours->name, "MINUS")){
			printf("Exp - Exp MINUS Exp\n");
			if(place != NULL){
				if(Exp->children->neighbours->neighbours != NULL){
					Operand t1 = new_temp();
					Operand t2 = new_temp();
					struct InterCodes* code1 = translate_Exp(Exp->children->neighbours->neighbours, t1);
					struct InterCodes* code2 = translate_Exp(Exp->children, t2);
					struct InterCodes* code3 = (struct InterCodes*)malloc(sizeof(struct InterCodes));
					code3->code.kind = SUB_;
					code3->code.u.binop.result = (Operand)malloc(sizeof(struct Operand_));
					code3->code.u.binop.result->kind = VARIABLE;
					strcpy(code3->code.u.binop.result->u.ID, place->u.ID);
					code3->code.u.binop.op1 = t1;
					code3->code.u.binop.op2 = t2;

					struct InterCodes* p = code1;
					while(p->next != NULL)
						p = p->next;
					p->next = code2;

					code2->prev = p;
					p = code2;
					while(p->next != NULL)
						p = p->next;
					p->next = code3;
					code3->prev = p;

					code3->next = NULL;

					return code1;
				}
				else{
					Operand t1 = new_temp();
					struct InterCodes* code1 = translate_Exp(Exp->children, t1);
					struct InterCodes* code2 = (struct InterCodes*)malloc(sizeof(struct InterCodes));
					code2->code.kind = SUB_;
					code2->code.u.binop.result = (Operand)malloc(sizeof(struct Operand_));
					code2->code.u.binop.result->kind = VARIABLE;
					strcpy(code2->code.u.binop.result->u.ID, place->u.ID);
					code2->code.u.binop.op1 = (Operand)malloc(sizeof(struct Operand_));
					code2->code.u.binop.op1->kind = CONSTANT;
					code2->code.u.binop.op1->u.value = 0;
					code2->code.u.binop.op2 = t1;

					struct InterCodes* p = code1;
					while(p->next != NULL)
						p = p->next;
					p->next = code2;
					code2->prev = p;

					code2->next = NULL;

					return code1;
				}
			}
			else{
				if(Exp->children->neighbours->neighbours != NULL){
					Operand t1 = new_temp();
					Operand t2 = new_temp();
					struct InterCodes* code1 = translate_Exp(Exp->children->neighbours->neighbours, t1);
					struct InterCodes* code2 = translate_Exp(Exp->children, t2);

					struct InterCodes* p = code1;
					while(p->next != NULL)
						p = p->next;
					p->next = code2;
					code2->prev = p;

					code2->next = NULL;

					return code1;
				}
				else{
					Operand t1 = new_temp();
					struct InterCodes* code1 = translate_Exp(Exp->children, t1);

					return code1;
				}
			}
		}
		if(strcmp(Exp->children->neighbours->name, "STAR")){
			printf("Exp - Exp STAR Exp\n");
			if(place != NULL){
				Operand t1 = new_temp();
				Operand t2 = new_temp();
				struct InterCodes* code1 = translate_Exp(Exp->children->neighbours->neighbours, t1);
				struct InterCodes* code2 = translate_Exp(Exp->children, t2);		
				struct InterCodes* code3 = (struct InterCodes*)malloc(sizeof(struct InterCodes));
				code3->code.kind = MUL_;
				code3->code.u.binop.result = (Operand)malloc(sizeof(struct Operand_));
				code3->code.u.binop.result->kind = VARIABLE;
				strcpy(code3->code.u.binop.result->u.ID, place->u.ID);
				code3->code.u.binop.op1 = t1;
				code3->code.u.binop.op2 = t2;

				struct InterCodes* p = code1;
				while(p->next != NULL)
					p = p->next;
				p->next = code2;

				code2->prev = p;
				p = code2;
				while(p->next != NULL)
					p = p->next;
				p->next = code3;
				code3->prev = p;

				code3->next = NULL;

				return code1;
			}
			else{
				Operand t1 = new_temp();
				Operand t2 = new_temp();
				struct InterCodes* code1 = translate_Exp(Exp->children->neighbours->neighbours, t1);
				struct InterCodes* code2 = translate_Exp(Exp->children, t2);

				struct InterCodes* p = code1;
				while(p->next != NULL)
					p = p->next;
				p->next = code2;
				code2->prev = p;

				code2->next = NULL;

				return code1;
			}
		}
		if(strcmp(Exp->children->neighbours->name, "DIV")){
			printf("Exp - Exp DIV Exp\n");
			if(place != NULL){
				Operand t1 = new_temp();
				Operand t2 = new_temp();
				struct InterCodes* code1 = translate_Exp(Exp->children->neighbours->neighbours, t1);
				struct InterCodes* code2 = translate_Exp(Exp->children, t2);
				struct InterCodes* code3 = (struct InterCodes*)malloc(sizeof(struct InterCodes));
				code3->code.kind = DIV_;
				code3->code.u.binop.result = (Operand)malloc(sizeof(struct Operand_));
				code3->code.u.binop.result->kind = VARIABLE;
				strcpy(code3->code.u.binop.result->u.ID, place->u.ID);
				code3->code.u.binop.op1 = t1;
				code3->code.u.binop.op2 = t2;

				struct InterCodes* p = code1;
				while(p->next != NULL)
					p = p->next;
				p->next = code2;

				code2->prev = p;
				p = code2;
				while(p->next != NULL)
					p = p->next;
				p->next = code3;
				code3->prev = p;

				code3->next = NULL;

				return code1;
			}
			else{
				Operand t1 = new_temp();
				Operand t2 = new_temp();
				struct InterCodes* code1 = translate_Exp(Exp->children->neighbours->neighbours, t1);
				struct InterCodes* code2 = translate_Exp(Exp->children, t2);

				struct InterCodes* p = code1;
				while(p->next != NULL)
					p = p->next;
				p->next = code2;
				code2->prev = p;

				code2->next = NULL;

				return code1;
			}
		}
		if(strcmp(Exp->children->neighbours->name, "RELOP") || strcmp(Exp->children->neighbours->name, "NOT") || strcmp(Exp->children->neighbours->name, "AND") || strcmp(Exp->children->neighbours->name, "OR")){
			printf("Exp - Exp Cond Exp\n");
			if(place != NULL){
				Operand label1 = new_label();
				Operand label2 = new_label();
				struct InterCodes* code1 = (struct InterCodes*)malloc(sizeof(struct InterCodes));
				code1->code.kind = ASSIGN_;
				code1->code.u.assign.left = (Operand)malloc(sizeof(struct Operand_));
				code1->code.u.assign.left->kind = VARIABLE;
				strcpy(code1->code.u.assign.left->u.ID, place->u.ID);
				code1->code.u.assign.right = (Operand)malloc(sizeof(struct Operand_));
				code1->code.u.assign.right->kind = CONSTANT;
				code1->code.u.assign.right->u.value = 0;
				struct InterCodes* code2 = translate_Cond(Exp, label1, label2);
				struct InterCodes* code3 = (struct InterCodes*)malloc(sizeof(struct InterCodes));
				code3->code.kind = LABEL_;
				code3->code.u.labelcode.label = label1;
				struct InterCodes* code4 = (struct InterCodes*)malloc(sizeof(struct InterCodes));
				code4->code.kind = ASSIGN_;
				code4->code.u.assign.left = (Operand)malloc(sizeof(struct Operand_));
				code4->code.u.assign.left->kind = VARIABLE;
				strcpy(code4->code.u.assign.left->u.ID, place->u.ID);
				code4->code.u.assign.right = (Operand)malloc(sizeof(struct Operand_));
				code4->code.u.assign.right->kind = CONSTANT;
				code4->code.u.assign.right->u.value = 1;
				struct InterCodes* code5 = (struct InterCodes*)malloc(sizeof(struct InterCodes));
				code5->code.kind = LABEL_;
				code5->code.u.labelcode.label = label2;

				code1->next = code2;
				code2->prev = code1;
				struct InterCodes* p = code2;
				while(p->next != NULL)
					p = p->next;
				p->next = code3;
				code3->prev = p;

				code3->next = code4;
				code4->prev = code3;

				code4->next = code5;
				code5->prev = code4;

				code5->next = NULL;

				return code1;
			}
		}
	}
	if (strcmp(Exp->children->neighbours->name, "LP")){
		printf("Exp - ID LP RP\n");
		if(strcmp(Exp->children->neighbours->neighbours->value_str, "read") == 0){
			struct InterCodes* code1 = (struct InterCodes*)malloc(sizeof(struct InterCodes));
			code1->code.kind = READ_;
			code1->code.u.read.rd = place;
			return code1;
		}
		else{
			FieldList item = (FieldList)malloc(sizeof(struct FieldList_));
			lookupTable(Exp->children->neighbours->neighbours->value_str,&item,1);//查函数
			struct InterCodes* code1 = (struct InterCodes*)malloc(sizeof(struct InterCodes));
			code1->code.kind = CALL_;
			code1->code.u.call.left = (Operand)malloc(sizeof(struct Operand_));
			code1->code.u.call.left->kind = VARIABLE;
			strcpy(code1->code.u.assign.left->u.ID, place->u.ID);
			code1->code.u.call.function = (Operand)malloc(sizeof(struct Operand_));
			code1->code.u.call.function->kind = VARIABLE;
			strcpy(code1->code.u.call.function->u.ID, item->name);
			return code1;
		}
	}
	if(strcmp(Exp->children->neighbours->name, "Args")){
		printf("Exp - ID LP Args RP\n");
		struct ArgList* arg_list = NULL;
		struct InterCodes* code1 = translate_Args(Exp->children->neighbours, arg_list);
		if(strcpy(Exp->children->neighbours->neighbours->neighbours->value_str, "write") == 0){
			struct InterCodes* code2 = (struct InterCodes*)malloc(sizeof(struct InterCodes));
			code2->code.kind = WRITE_;
			code2->code.u.write.wr = arg_list->op;

			struct InterCodes* p = code1;
			while(p->next != NULL)
				p = p->next;
			p->next = code2;
			code2->prev = p;

			code2->next = NULL;

			return code1;
		}
		else{
			FieldList item = (FieldList)malloc(sizeof(struct FieldList_));
			lookupTable(Exp->children->neighbours->neighbours->neighbours->value_str,&item,0);
			struct ArgList* q = arg_list;
			struct InterCodes* code2 = NULL;
			while(q != NULL){
				struct InterCodes* code3 = (struct InterCodes*)malloc(sizeof(struct InterCodes));
				code3->code.kind = ARG_;
				code3->code.u.arg.argument = q->op;
				if(code2 == NULL){
					code2 = code3;
				}
				else{
					struct InterCodes* p = code2;
					while(p->next != NULL)
						p = p->next;
					p->next = code3;
					code3->prev = p;
				}
				q = q->next;
			}
			struct InterCodes* code4 = (struct InterCodes*)malloc(sizeof(struct InterCodes));
			code4->code.kind = CALL_;
			code4->code.u.call.left = (Operand)malloc(sizeof(struct Operand_));
			code4->code.u.call.left->kind = VARIABLE;
			strcpy(code4->code.u.assign.left->u.ID, place->u.ID);
			code4->code.u.call.function = (Operand)malloc(sizeof(struct Operand_));
			code4->code.u.call.function->kind = VARIABLE;
			strcpy(code4->code.u.call.function->u.ID, item->name);

			struct InterCodes* p = code1;
			while(p->next != NULL)
				p = p->next;
			p->next = code2;
			code2->prev = p;

			p = code2;
			while(p->next != NULL)
				p = p->next;
			p->next = code4;
			code4->prev = p;

			code4->next = NULL;

			return code1;
		}
	}
	if (strcmp(Exp->children->name, "RB") == 0){
		printf("Exp - Exp LB Exp RB\n");
		FieldList item = (FieldList)malloc(sizeof(struct FieldList_));
		lookupTable(Exp->children->neighbours->neighbours->neighbours->children->value_str,&item,0);

		Operand t1 = new_temp();
		Operand t2 = new_temp();
		Operand t3 = new_temp();

		struct InterCodes* code1 = translate_Exp(Exp->children->neighbours, t1);
		struct InterCodes* code2 = (struct InterCodes*)malloc(sizeof(struct InterCodes));
		code2->code.kind = MUL_;
		code2->code.u.binop.result = t2;
		code2->code.u.binop.op1 = t1;
		code2->code.u.binop.op2 = (Operand)malloc(sizeof(struct Operand_));
		code2->code.u.binop.result->kind = CONSTANT;
		code2->code.u.binop.result->u.value = 4;
		struct InterCodes* code3 = (struct InterCodes*)malloc(sizeof(struct InterCodes));
		code3->code.kind = ADD_;	
		code3->code.u.binop.result = t3;
		code3->code.u.binop.op1 = (Operand)malloc(sizeof(struct Operand_));
		code3->code.u.binop.op1->kind = ADDRESS;
		strcpy(code3->code.u.binop.op1->u.ID, item->inter_name);
		code3->code.u.binop.op2 = t2;
		//place = t4;
		place->kind = REFERENCE;
		strcpy(place->u.ID, t3->u.ID);

		struct InterCodes* p = code1;
		while(p->next != NULL)
			p = p->next;
		p->next = code2;
		code2->prev = p;

		code2->next = code3;
		code3->prev = code2;

		code3->next = NULL;

		return code1;
	}
}
Esempio n. 16
0
/** ***********************************************************************************
	 CONSTRUCTION OF THE INDEX 
    ***********************************************************************************/
int build_WordIndex (char *inbasename, char *build_options, void **index){
	twcsa *wcsa;
	wcsa = (twcsa *) malloc (sizeof (twcsa) * 1);
	*index = wcsa;
	wcsa->text = NULL;

	//char path2repaircompressor[1000]="./src/repair64bit/repairCompressor";
	//wcsa->blockSize = DEFAULT_BLOCK_SIZE;
	//wcsa->q = DEFAULT_QGRAM_LEN;

	/** processing the parameters of the index:: blockSize, and q-gram-len (q) */
	{
		char delimiters[] = " =;";
		int j,num_parameters;
		char ** parameters;
		
		if (build_options != NULL) {
			parse_parameters_II(build_options,&num_parameters, &parameters, delimiters);
			for (j=0; j<num_parameters;j++) {

			  if ((strcmp(parameters[j], "blocksize") == 0 ) && (j < num_parameters-1) ) {
				//wcsa->blockSize = atoi(parameters[j+1]) * BLOCK_MULTIPLIER;	    
				j++;
			  } 
			  else if ((strcmp(parameters[j], "qgram") == 0 ) && (j < num_parameters-1) ) {
				//wcsa->q =atoi(parameters[j+1]);	    
				j++;
			  }
			  else if ((strcmp(parameters[j], "path2repaircompressor") == 0 ) && (j < num_parameters-1) ) {
				//strcpy(path2repaircompressor,parameters[j+1]);	    
				j++;
			  }
			  
			}
			free_parameters_II(num_parameters, &parameters);
		}
		//fprintf(stderr,"\n Parameters of II-blocks:: *basename = %s, blocksize = %d, q= %d",inbasename,wcsa->blockSize,wcsa->q);
		//fprintf(stderr,"\n \t path2repaircompressor= %s\n",path2repaircompressor);
	}

	/** 0 ** Inicializes the arrays used to detect if a char is valid or not. **/
	StartValid();
	
	
	/** 1 ** Loads the compressed text into memory. */
	loadTextInMem(&(wcsa->text), &(wcsa->textSize),(char *)inbasename);	
	wcsa->sourceTextSize = wcsa->textSize;
	fprintf(stderr,"... Loaded Source Sequence: %u bytes\n", wcsa->textSize); fflush(stderr);
fflush(stderr);
	/** 2 ** loads the array of document boundaries                           */
	uint ndocs;
	uint *docboundaries;
	loadDocBeginngins(&docboundaries, &ndocs,(char *)inbasename);	
	wcsa->ndocs = ndocs; //just for statistics.

	
	/** 3 ** Parses the sequence and gathers the vocabulary of words (sorted alphanumerically) 
		the frecuency of such words: obtains "words", "nwords", and "wordsZone" 
		Sets also wcsa->freqs (freq of each word)
		Sets also wcsa->maxNumOccs (needed for malloc during extraction) */	
	
	fprintf(stderr,"... Entering CreateVocabularyOfWords \n"); fflush(stderr);
	CreateVocabularyOfWords(*index, docboundaries, ndocs);


	//shows the words parsed...
	{
		int i;
		fprintf(stderr,"\n\n Después de sorting ....");	fflush(stderr);
		unsigned char *str;
		uint len;
//		for (i = 0; i<100; i++) {
		for (i = 0; i<wcsa->nwords; i++) {
			if ((i<15) || (i>wcsa->nwords-5)) {
				getWord(wcsa,i,&str,&len);				
				fprintf(stderr,"\n freq[%6d]=%6u ",i,  wcsa->freqs[i]);
				fprintf(stderr,", words[%6d] = ",i);
				printWord(str,len);
			}
		}		
	}

	fprintf(stderr,"\n %u words have been parsed", wcsa->nwords);
					
	
	/** 4 ** creates a temporal list of occurrences of each word (block-oriented).
					gives also the len of each list */
	{	
		//decompression of the source text and creation of occList[][] and lenList[]
		uint **occList; uint *lenList;
		uint *doc_offsets_sids;
		
		createListsOfOccurrences (*index, &occList, &lenList, &doc_offsets_sids, docboundaries, ndocs);
		wcsa->doc_offsets_sids=doc_offsets_sids;
		

		fprintf(stderr,"\n %u lists of occurrences were created.", wcsa->nwords);fflush(stderr);
		//fprintf(stderr,"\n The ranks of the document beginnings are:");
		//{
		//  int i;
		//  for (i=0;i<=ndocs;i++) 		fprintf(stderr,"[%u-th-> %u]",i,wcsa->doc_offsets_sids[i]);
		//}

		//Preparing a "list of occurrences" that will be later indexed through build_il() **
		uint *source_il, sourcelen_il;
		//uint maxPost = ndocs; //!!
		uint maxPost = doc_offsets_sids[ndocs];
		uint nwords = wcsa->nwords;
		prepareSourceFormatForIListBuilder(nwords,maxPost,lenList, occList, &source_il, &sourcelen_il);

/*	
	char fileuintpostings[256] = "postingsSequence.uint32";
	output_posting_lists_concatenated_DEBUGGING_ONLY (nwords, maxPost, lenList, occList,fileuintpostings);	
*/
	
	
	FILE *ff = fopen ("postings.posts","w");
	fwrite (source_il, sizeof(uint), sourcelen_il, ff);
	fclose(ff);
	
#ifdef WRITE_POSTING_LIST		
		FILE *ff = fopen ("postings.posts","w");
		fwrite (source_il, sizeof(uint), sourcelen_il, ff);
		fclose(ff);
#endif		
		
		fprintf(stderr,"\n there are %lu uints in all the lists of occurrences: size [uint32] = %lu bytes.\n ", (ulong)sourcelen_il - nwords -2, (ulong) sizeof(uint)*(sourcelen_il - nwords -2));
		fprintf(stderr,"\n maxPostValue = %u",maxPost);
		  
		/*{ char fileposts[256];
			sprintf(fileposts,"%s.%s.%u","POSTING_LISTS","posts", getpid());
			FILE *ff = fopen(fileposts,"w");
			fwrite(source_il, sizeof(uint), sourcelen_il,ff);
			fclose(ff);
			
		}		
		for (int x=0;x<10;x++) printf("\n%u --> %u",x,source_il[x]);
		exit(0);
		
		*/
		
		fprintf(stderr,"\n the lists of occurrences were formatted for build_il.");fflush(stderr);


		//compressing the lists of occurrences and setting wcsa->ils
		int error = build_il(source_il, sourcelen_il, build_options, &(wcsa->ils));  //source_il is freed inside!.
		IFERRORIL(error);							


		{
			//frees memory for the posting lists
			uint i;
			for (i=0;i<wcsa->nwords;i++) free(occList[i]);
			free(occList);
			free(lenList);
		}
	
		/** 5 ** compressed representation of the source text */	
		{
			//unsigned long *docbegsUL; 
			unsigned int i;
			//docbegsUL = (ulong *) malloc (sizeof(ulong) * (ndocs+1));
			//for (i=0;i<=ndocs;i++) docbegsUL[i] = docboundaries[i];			
			//build_representation (wcsa->text, wcsa->textSize, docbegsUL, ndocs, build_options, &wcsa->ct);

			fprintf(stderr,"\nNow compressing the text: %lu bytes", (ulong)wcsa->textSize);fflush(stderr);
			
			build_representation (wcsa->text, wcsa->textSize, docboundaries, ndocs, build_options, &wcsa->ct);
			unsigned char *document;
			uint doclen;
			extract_doc_representation (wcsa->ct, 0, &document, &doclen);
			fprintf(stderr,"\n =================== DOC 0 ======================");
			fprintf(stderr,"\n%s",document);
			fprintf(stderr,"\n =================== ***** ======================\n");
			free(document);
			//free(docbegsUL);
		}

	}
	
	#ifndef FREQ_VECTOR_AVAILABLE   //<----- not needed in advance, only during construction
		free(wcsa->freqs);
	#endif

	free(docboundaries);	
		
	ulong sizeI;
	index_size(*index, &sizeI);
	fflush(stderr); fflush(stdout);
	fprintf(stderr,"\n The index has already been built: %lu bytes!!\n", sizeI);
	fflush(stderr);
	fflush(stdout);
	return 0;
}