// Returns guest instr count (in old replay counting mode) static RR_prog_point copy_entry(void) { // Code copied from rr_log.c. // Copy entry. RR_log_entry item; rr_fread(&item.header.prog_point, sizeof(item.header.prog_point), 1, oldlog); if (item.header.prog_point.guest_instr_count > end_count) { // We don't want to copy this one. return item.header.prog_point; } //ph Fix up instruction count RR_prog_point original_prog_point = item.header.prog_point; item.header.prog_point.guest_instr_count -= actual_start_count; rr_fwrite(&item.header.prog_point, sizeof(item.header.prog_point), 1, newlog); #define RR_COPY_ITEM(field) rr_fcopy(&(field), sizeof(field), 1, oldlog, newlog) RR_COPY_ITEM(item.header.kind); RR_COPY_ITEM(item.header.callsite_loc); //mz read the rest of the item switch (item.header.kind) { case RR_INPUT_1: RR_COPY_ITEM(item.variant.input_1); break; case RR_INPUT_2: RR_COPY_ITEM(item.variant.input_2); break; case RR_INPUT_4: RR_COPY_ITEM(item.variant.input_4); break; case RR_INPUT_8: RR_COPY_ITEM(item.variant.input_8); break; case RR_INTERRUPT_REQUEST: RR_COPY_ITEM(item.variant.interrupt_request); break; case RR_EXIT_REQUEST: RR_COPY_ITEM(item.variant.exit_request); break; case RR_SKIPPED_CALL: { RR_skipped_call_args *args = &item.variant.call_args; //mz read kind first! 
RR_COPY_ITEM(args->kind); switch(args->kind) { case RR_CALL_CPU_MEM_RW: RR_COPY_ITEM(args->variant.cpu_mem_rw_args); args->variant.cpu_mem_rw_args.buf = g_malloc(args->variant.cpu_mem_rw_args.len); rr_fcopy(args->variant.cpu_mem_rw_args.buf, 1, args->variant.cpu_mem_rw_args.len, oldlog, newlog); break; case RR_CALL_CPU_MEM_UNMAP: RR_COPY_ITEM(args->variant.cpu_mem_unmap); args->variant.cpu_mem_unmap.buf = g_malloc(args->variant.cpu_mem_unmap.len); rr_fcopy(args->variant.cpu_mem_unmap.buf, 1, args->variant.cpu_mem_unmap.len, oldlog, newlog); break; case RR_CALL_MEM_REGION_CHANGE: RR_COPY_ITEM(args->variant.mem_region_change_args); args->variant.mem_region_change_args.name = g_malloc0(args->variant.mem_region_change_args.len + 1); rr_fcopy(args->variant.mem_region_change_args.name, 1, args->variant.mem_region_change_args.len, oldlog, newlog); break; case RR_CALL_HD_TRANSFER: RR_COPY_ITEM(args->variant.hd_transfer_args); break; case RR_CALL_NET_TRANSFER: RR_COPY_ITEM(args->variant.net_transfer_args); break; case RR_CALL_HANDLE_PACKET: RR_COPY_ITEM(args->variant.handle_packet_args); args->variant.handle_packet_args.buf = g_malloc(args->variant.handle_packet_args.size); rr_fcopy(args->variant.handle_packet_args.buf, args->variant.handle_packet_args.size, 1, oldlog, newlog); break; default: //mz unimplemented sassert(0, 3); } } break; case RR_LAST: //mz nothing to read //ph We don't copy RR_LAST here; write out afterwards. break; default: //mz unimplemented sassert(0, 4); } return original_prog_point; }
void main(int argc, char *argv[]) { int verbosity; int n; int m; int i; int input_type; int storage_type; unsigned short *current_ngram_int; unsigned short *previous_ngram_int; char **current_ngram_text; char **previous_ngram_text; int current_count; int running_total; flag same; flag first_one; flag got_to_eof; running_total = 0; report_version(&argc,argv); if (pc_flagarg( &argc, argv,"-help") || argc==1) { fprintf(stderr,"ngram2mgram - Convert an n-gram file to an m-gram file, where m<n\n"); fprintf(stderr,"Usage : ngram2mgram -n N -m M\n"); fprintf(stderr," [ -binary | -ascii | -words ]\n"); fprintf(stderr," < .ngram > .mgram\n"); exit(1); } n = pc_intarg( &argc, argv,"-n",0); m = pc_intarg( &argc, argv,"-m",0); verbosity = pc_intarg(&argc,argv,"-verbosity",DEFAULT_VERBOSITY); input_type = 0; if (pc_flagarg( &argc, argv,"-binary")) { input_type = BINARY; } if (pc_flagarg( &argc, argv,"-ascii")) { if (input_type != 0) { quit(-1,"Error : more than one file format specified.\n"); } input_type = ASCII; } if (pc_flagarg( &argc, argv,"-words")) { if (input_type != 0) { quit(-1,"Error : more than one file format specified.\n"); } input_type = WORDS; } if (input_type == 0) { pc_message(verbosity,2,"Warning : no input type specified. Defaulting to binary.\n"); input_type = BINARY; } if (n == 0) { quit(-1,"Must specify a value for n. Use the -n switch.\n"); } if (m == 0) { quit(-1,"Must specify a value for m. Use the -m switch.\n"); } if (n<=m) { quit(-1,"n must be greater than m.\n"); } pc_report_unk_args(&argc,argv,verbosity); if (input_type == BINARY || input_type == ASCII) { storage_type = NUMERIC; } else { storage_type = ALPHA; } if (storage_type == NUMERIC) { current_ngram_int = (unsigned short *) rr_malloc(n*sizeof(unsigned short)); previous_ngram_int = (unsigned short *) rr_malloc(n*sizeof(unsigned short)); /* And to prevent compiler warnings ... 
*/ current_ngram_text = NULL; previous_ngram_text = NULL; } else { current_ngram_text = (char **) rr_malloc(n*sizeof(char *)); previous_ngram_text = (char **) rr_malloc(n*sizeof(char *)); for (i=0;i<=n-1;i++) { current_ngram_text[i] = (char *) rr_malloc(MAX_WORD_LENGTH*sizeof(char)); previous_ngram_text[i] = (char *) rr_malloc(MAX_WORD_LENGTH*sizeof(char)); } /* And to prevent compiler warnings ... */ current_ngram_int = NULL; previous_ngram_int = NULL; } got_to_eof = 0; first_one = 1; while (!rr_feof(stdin)) { /* Store previous n-gram */ if (!first_one) { if (storage_type == NUMERIC) { for (i=0;i<=n-1;i++) { previous_ngram_int[i] = current_ngram_int[i]; } } else { for (i=0;i<=n-1;i++) { strcpy(previous_ngram_text[i],current_ngram_text[i]); } } } /* Read new n-gram */ switch(input_type) { case BINARY: for (i=0;i<=n-1;i++) { rr_fread(¤t_ngram_int[i],sizeof(id__t),1,stdin, "from id_ngrams at stdin",0); } rr_fread(¤t_count,sizeof(count_t),1,stdin, "from id_ngrams file at stdin",0); break; case ASCII: for (i=0;i<=n-1;i++) { if (fscanf(stdin,"%hu",¤t_ngram_int[i]) != 1) { if (!rr_feof(stdin)) { quit(-1,"Error reading id_ngram.\n"); } else { got_to_eof = 1; } } } if (fscanf(stdin,"%d",¤t_count) != 1) { if (!rr_feof(stdin)) { quit(-1,"Error reading id_ngram.\n"); } else { got_to_eof = 1; } } break; case WORDS: for (i=0;i<=n-1;i++) { if (fscanf(stdin,"%s",current_ngram_text[i]) != 1) { if (!rr_feof(stdin)) { quit(-1,"Error reading id_ngram.\n"); } else { got_to_eof = 1; } } } if (fscanf(stdin,"%d",¤t_count) != 1) { if (!rr_feof(stdin)) { quit(-1,"Error reading id_ngram.\n"); } else { got_to_eof = 1; } } break; } if (!got_to_eof) { /* Check for correct sorting */ if (!first_one) { switch(storage_type) { case NUMERIC: for (i=0;i<=n-1;i++) { if (current_ngram_int[i]<previous_ngram_int[i]) { quit(-1,"Error : ngrams not correctly sorted.\n"); } else { if (current_ngram_int[i]>previous_ngram_int[i]) { i=n; } } } break; case ALPHA: for (i=0;i<=n-1;i++) { if 
(strcmp(current_ngram_text[i],previous_ngram_text[i])<0) { quit(-1,"Error : ngrams not correctly sorted.\n"); } else { if (strcmp(current_ngram_text[i],previous_ngram_text[i])>0) { i=n; } } } break; } } /* Compare this m-gram with previous m-gram */ if (!first_one) { switch(storage_type) { case NUMERIC: same = 1; for (i=0;i<=m-1;i++) { if (current_ngram_int[i] != previous_ngram_int[i]) { same = 0; } } if (same) { running_total += current_count; } else { if (input_type == ASCII) { for (i=0;i<=m-1;i++) { printf("%d ",previous_ngram_int[i]); } printf("%d\n",running_total); } else { for (i=0;i<=m-1;i++) { rr_fwrite(&previous_ngram_int[i],sizeof(id__t),1,stdout, "to id_ngrams at stdout"); } rr_fwrite(&running_total,sizeof(count_t),1,stdout, "to id n-grams at stdout"); } running_total = current_count; } break; case ALPHA: same = 1; for (i=0;i<=m-1;i++) { if (strcmp(current_ngram_text[i],previous_ngram_text[i])) { same = 0; } } if (same) { running_total += current_count; } else { for (i=0;i<=m-1;i++) { printf("%s ",previous_ngram_text[i]); } printf("%d\n",running_total); running_total = current_count; } break; } } else { running_total = current_count; } first_one = 0; } } /* Write out final m-gram */ switch(input_type) { case BINARY: break; case ASCII: for (i=0;i<=m-1;i++) { printf("%d ",previous_ngram_int[i]); } printf("%d\n",running_total); break; case WORDS: for (i=0;i<=m-1;i++) { printf("%s ",previous_ngram_text[i]); } printf("%d\n",running_total); break; } pc_message(verbosity,0,"ngram2mgram : Done.\n"); exit(0); }
/*
 * Pass nmemb elements of `size` bytes each from oldlog through to newlog.
 *
 * The data is staged in the caller-supplied buffer `ptr`, so on return the
 * caller can also inspect what was copied (e.g. a length or kind field that
 * determines what to copy next).
 */
static INLINEIT void rr_fcopy(void *ptr, size_t size, size_t nmemb,
                              FILE *oldlog, FILE *newlog)
{
    /* Pull the bytes out of the source log into the staging buffer ... */
    rr_fread(ptr, size, nmemb, oldlog);

    /* ... and push the very same bytes to the destination log. */
    rr_fwrite(ptr, size, nmemb, newlog);
}
/* Merge the sorted temporary id n-gram files numbered start_file..end_file
   (named "<temp_file_root>/<number><temp_file_ext>") into outfile.

   Each temp file is read as records of n word ids (wordid_t) followed by an
   int count. Records with identical ids across files are combined by summing
   their counts. Output is written as text if write_ascii is set, otherwise
   as raw ids and count. Temp files are closed and removed once merged.

   If fof_size > 0 and n > 1, frequency-of-frequency statistics are collected
   while merging and displayed on stderr at the end.

   n_order is stored into the file-scope variable n, which this function
   (and compare_ngrams3, presumably) uses as the n-gram order.

   If more than max_files files are given, the work is handed to
   merge_tempfiles in two steps instead (see the note in that branch). */
void merge_idngramfiles (int start_file,
                         int end_file,
                         char *temp_file_root,
                         char *temp_file_ext,
                         int max_files,
                         FILE *outfile,
                         flag write_ascii,
                         int fof_size,
                         int n_order)
{
  FILE *new_temp_file;
  char temp_string[1000];
  char *new_temp_filename;

  FILE **temp_file;
  char **temp_filename;
  wordid_t **current_ngram;
  wordid_t *smallest_ngram;
  wordid_t *previous_ngram;

  int *current_ngram_count;
  flag *finished;
  flag all_finished;
  int temp_count;
  int i,j;

  flag first_ngram;
  fof_t **fof_array;
  ngram_sz_t *num_kgrams;
  int *ng_count;
  int pos_of_novelty;

  n = n_order;

  pos_of_novelty = n; /* Simply for warning-free compilation */

  /* The trailing "process last ngram" step reads temp_count even when the
     merge loop below never ran, so it must start from a defined value. */
  temp_count = 0;

  num_kgrams = (ngram_sz_t *) rr_calloc(n-1,sizeof(ngram_sz_t));
  ng_count = (int *) rr_calloc(n-1,sizeof(int));
  first_ngram = 1;

  previous_ngram = (wordid_t *) rr_calloc(n,sizeof(wordid_t));
  temp_file = (FILE **) rr_malloc(sizeof(FILE *) * (end_file-start_file+1));
  temp_filename = (char **) rr_malloc(sizeof(char *) *
                                      (end_file-start_file+1));

  /* should change to 2d array*/
  current_ngram = (wordid_t **) rr_malloc(sizeof(wordid_t *) *
                                          (end_file-start_file+1));
  for (i=0;i<=end_file-start_file;i++)
    current_ngram[i] = (wordid_t *) rr_malloc(sizeof(wordid_t)*n);

  current_ngram_count = (int *) rr_malloc(sizeof(int)*(end_file-start_file+1));

  finished = (flag *) rr_malloc(sizeof(flag)*(end_file-start_file+1));
  smallest_ngram = (wordid_t *) rr_malloc(sizeof(wordid_t)*n);

  /* should change to 2d array*/
  fof_array = (fof_t **) rr_malloc(sizeof(fof_t *)*(n-1));
  for (i=0;i<=n-2;i++)
    fof_array[i] = (fof_t *) rr_calloc(fof_size+1,sizeof(fof_t));

  if (end_file-start_file+1 > max_files) {
    /* Too many files to merge at once: merge the first max_files into a
       fresh temp file numbered end_file+1, then merge the rest (including
       that new file) into outfile.

       NOTE(review): these calls go to merge_tempfiles with an 8-argument
       signature (no n_order) rather than recursing into this function, pass
       fof_size 0, pass write_ascii for the intermediate file, and
       new_temp_file is not closed between the two steps. merge_tempfiles is
       not visible here - confirm this is the intended callee and that the
       intermediate file is flushed before it is read back. */
    sprintf(temp_string,"%s/%hu%s",temp_file_root,
            end_file+1,temp_file_ext);
    new_temp_filename = salloc(temp_string);
    new_temp_file = rr_oopen(new_temp_filename);
    merge_tempfiles(start_file,start_file+max_files-1,
                    temp_file_root,temp_file_ext,max_files,
                    new_temp_file,write_ascii,0);
    merge_tempfiles(start_file+max_files,end_file+1,
                    temp_file_root,temp_file_ext,max_files,
                    outfile,write_ascii,0);
  }else {

    /* Open all the temp files for reading */
    for (i=0;i<=end_file-start_file;i++) {
      sprintf(temp_string,"%s/%hu%s",temp_file_root,
              i+start_file,temp_file_ext);
      temp_filename[i] = salloc(temp_string);
      temp_file[i] = rr_iopen(temp_filename[i]);
    }

    /* Now go through the files simultaneously, and write out the appropriate
       ngram counts to the output file. */

    /* Prime each file with its first record. A file that is already at end
       of file has no record to contribute, so it is marked finished right
       away instead of leaving its (uninitialized) buffer in play. */
    for (i=end_file-start_file;i>=0;i--) {
      finished[i] = 0;
      if (!rr_feof(temp_file[i])) {
        for (j=0;j<=n-1;j++) {
          rr_fread((char*) &current_ngram[i][j], sizeof(wordid_t),1,
                   temp_file[i],"temporary n-gram ids",0);
        }
        rr_fread((char*) &current_ngram_count[i], sizeof(int),1,
                 temp_file[i],"temporary n-gram counts",0);
      }else {
        finished[i] = 1;
      }
    }

    /* With no files, or only empty ones, there is nothing to merge; entering
       the loop would emit a bogus record and never terminate. */
    all_finished = 1;
    for (i=0;i<=end_file-start_file;i++) {
      if (!finished[i])
        all_finished = 0;
    }

    while (!all_finished) {

      /* Find the smallest current ngram */
      for (i=0;i<=n-1;i++)
        smallest_ngram[i] = MAX_WORDID;

      for (i=0;i<=end_file-start_file;i++) {
        if (!finished[i]) {
          if (compare_ngrams3(smallest_ngram,current_ngram[i]) < 0) {
            for (j=0;j<n;j++)
              smallest_ngram[j] = current_ngram[i][j];
          }
        }
      }

#if MAX_VOCAB_SIZE < 65535
      /* This check is well-meaning but completely useless since
         smallest_ngram[i] by definition cannot contain any value greater
         than MAX_VOCAB_SIZE (dhuggins@cs, 2006-03) */
      for (i=0;i<=n-1;i++) {
        if (smallest_ngram[i] > MAX_VOCAB_SIZE) {
          quit(-1,"Error : Temporary files corrupted, invalid n-gram found.\n");
        }
      }
#endif

      /* For each of the files that are currently holding this ngram,
         add its count to the temporary count, and read in a new ngram
         from the files. */

      temp_count = 0;

      for (i=0;i<=end_file-start_file;i++) {
        if (!finished[i]) {
          if (compare_ngrams3(smallest_ngram,current_ngram[i]) == 0) {
            temp_count = temp_count + current_ngram_count[i];
            if (!rr_feof(temp_file[i])) {
              for (j=0;j<=n-1;j++) {
                rr_fread((char*) &current_ngram[i][j],sizeof(wordid_t),1,
                         temp_file[i],"temporary n-gram ids",0);
              }
              rr_fread((char*)&current_ngram_count[i],sizeof(int),1,
                       temp_file[i],"temporary n-gram count",0);
            }else {
              /* This file is exhausted; stop once every file is. */
              finished[i] = 1;
              all_finished = 1;
              for (j=0;j<=end_file-start_file;j++) {
                if (!finished[j])
                  all_finished = 0;
              }
            }
          }
        }
      }

      if (write_ascii) {
        for (i=0;i<=n-1;i++) {
          if (fprintf(outfile,"%d ",smallest_ngram[i]) < 0) {
            quit(-1,"Write error encountered while attempting to merge temporary files.\nAborting, but keeping temporary files.\n");
          }
        }
        if (fprintf(outfile,"%d\n",temp_count) < 0)
          quit(-1,"Write error encountered while attempting to merge temporary files.\nAborting, but keeping temporary files.\n");
      }else {
        for (i=0;i<=n-1;i++) {
          rr_fwrite((char*)&smallest_ngram[i],sizeof(wordid_t),1,
                    outfile,"n-gram ids");
        }
        rr_fwrite((char*)&temp_count,sizeof(count_t),1,outfile,"n-gram counts");
      }

      if (fof_size > 0 && n>1) { /* Add stuff to fof arrays */

        /* Code from idngram2stats */
        /* First position at which this n-gram exceeds the previous one. */
        pos_of_novelty = n;
        for (i=0;i<=n-1;i++) {
          if (smallest_ngram[i] > previous_ngram[i]) {
            pos_of_novelty = i;
            break;
          }
        }

        /* Add new N-gram */

        num_kgrams[n-2]++;
        if (temp_count <= fof_size)
          fof_array[n-2][temp_count]++;

        if (!first_ngram) {
          for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
            num_kgrams[i-1]++;
            if (ng_count[i-1] <= fof_size) {
              fof_array[i-1][ng_count[i-1]]++;
            }
            ng_count[i-1] = temp_count;
          }
        }else {
          for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
            ng_count[i-1] = temp_count;
          }
          first_ngram = 0;
        }

        for (i=0;i<=pos_of_novelty-2;i++)
          ng_count[i] += temp_count;

        for (i=0;i<=n-1;i++)
          previous_ngram[i]=smallest_ngram[i];
      }
    }

    for (i=0;i<=end_file-start_file;i++) {
      fclose(temp_file[i]);
      remove(temp_filename[i]);
    }
  }

  if (fof_size > 0 && n>1) { /* Display fof arrays */

    /* Process last ngram */
    for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
      num_kgrams[i-1]++;
      if (ng_count[i-1] <= fof_size)
        fof_array[i-1][ng_count[i-1]]++;

      ng_count[i-1] = temp_count;
    }

    for (i=0;i<=pos_of_novelty-2;i++)
      ng_count[i] += temp_count;

    display_fof_array(num_kgrams,fof_array,fof_size,stderr, n);
  }
}