int oe_03_main (int argc, char **argv) { flag first_ngram; int n; fof_sz_t fof_size; flag is_ascii; int verbosity; fof_t **fof_array; ngram_sz_t *num_kgrams; ngram current_ngram; ngram previous_ngram; count_t *ng_count; int pos_of_novelty; int nlines; int i; report_version(&argc,argv); if (argc == 1 || pc_flagarg(&argc, argv,"-help")) { oe_04_help_message(); exit(1); } is_ascii = pc_flagarg(&argc, argv,"-ascii_input"); n = pc_intarg(&argc, argv,"-n",3); fof_size = pc_intarg(&argc, argv,"-fof_size",50); verbosity = pc_intarg(&argc, argv,"-verbosity",DEFAULT_VERBOSITY); pc_report_unk_args(&argc,argv,verbosity); pc_message(verbosity,2,"n = %d\n",n); pc_message(verbosity,2,"fof_size = %d\n",fof_size); current_ngram.n = n; previous_ngram.n = n; pos_of_novelty = n; fof_array = (fof_t **) rr_malloc(sizeof(fof_t *) * (n-1)); for (i=0;i<=n-2;i++) fof_array[i] = (fof_t *) rr_calloc(fof_size+1,sizeof(fof_t)); num_kgrams = (ngram_sz_t *) rr_calloc(n-1,sizeof(ngram_sz_t)); ng_count = (count_t *) rr_calloc(n-1,sizeof(count_t)); current_ngram.id_array = (id__t *) rr_calloc(n,sizeof(id__t)); previous_ngram.id_array = (id__t *) rr_calloc(n,sizeof(id__t)); pc_message(verbosity,2,"Processing id n-gram file.\n"); pc_message(verbosity,2,"20,000 n-grams processed for each \".\", 1,000,000 for each line.\n"); nlines = 0; first_ngram = 1; while (!rr_feof(stdin)) { if (!first_ngram) ngram_copy(&previous_ngram,¤t_ngram,n); if (get_ngram(stdin,¤t_ngram,is_ascii)) { nlines++; show_idngram_nlines(nlines, verbosity); /* Test for where this ngram differs from last - do we have an out-of-order ngram? 
*/ if (!first_ngram) pos_of_novelty = ngram_find_pos_of_novelty(¤t_ngram,&previous_ngram,n,nlines); else pos_of_novelty = 0; /* Add new N-gram */ num_kgrams[n-2]++; if (current_ngram.count <= fof_size) fof_array[n-2][current_ngram.count]++; if (!first_ngram) { for (i=n-2;i>=MAX(1,pos_of_novelty);i--) { num_kgrams[i-1]++; if (ng_count[i-1] <= fof_size) fof_array[i-1][ng_count[i-1]]++; ng_count[i-1] = current_ngram.count; } } else { for (i=n-2;i>=MAX(1,pos_of_novelty);i--) ng_count[i-1] = current_ngram.count; } for (i=0;i<=pos_of_novelty-2;i++) ng_count[i] += current_ngram.count; if (first_ngram) first_ngram = 0; } } /* Process last ngram */ for (i=n-2;i>=MAX(1,pos_of_novelty);i--) { num_kgrams[i-1]++; if (ng_count[i-1] <= fof_size) { fof_array[i-1][ng_count[i-1]]++; } ng_count[i-1] = current_ngram.count; } #import "OpenEarsStaticAnalysisToggle.h" #ifdef STATICANALYZEDEPENDENCIES #define __clang_analyzer__ 1 #endif #if !defined(__clang_analyzer__) || defined(STATICANALYZEDEPENDENCIES) #undef __clang_analyzer__ for (i=0;i<=pos_of_novelty-2;i++) ng_count[i] += current_ngram.count; display_fof_array(num_kgrams,fof_array,fof_size,stderr, n); #endif pc_message(verbosity,0,"idngram2stats : Done.\n"); exit(0); }
/*
 * Merge the temporary id n-gram files numbered start_file..end_file
 * (inclusive) into outfile, summing the counts of identical n-grams.
 *
 * Each temporary file is named "<temp_file_root>/<number><temp_file_ext>"
 * and is read as records of n word ids followed by one int count.  The
 * files are closed and removed once they have been merged.
 *
 *   max_files    largest number of files merged in one go.  A larger range
 *                is handled in two passes: the first max_files files are
 *                merged into an extra temporary file numbered end_file+1,
 *                which is then merged with the remaining files.
 *   outfile      destination stream.
 *   write_ascii  non-zero: write ids and count as text ("%d " / "%d\n");
 *                zero: write them with rr_fwrite.
 *   fof_size     when > 0 (and the order is > 1) frequency-of-frequency
 *                statistics are accumulated and passed to
 *                display_fof_array() together with stderr.
 *   n_order      n-gram order.
 *
 * n is not declared in this function; presumably it is a file-scope
 * variable that compare_ngrams3(), which takes no length argument, reads as
 * well - verify against the rest of the file.
 *
 * NOTE(review): the two-pass path hands fof_size 0 to both nested merges,
 * so the tables displayed by the outer call stay empty in that case.
 * NOTE(review): none of the buffers allocated here are released before
 * returning.
 */
void merge_idngramfiles (int start_file,
                         int end_file,
                         char *temp_file_root,
                         char *temp_file_ext,
                         int max_files,
                         FILE *outfile,
                         flag write_ascii,
                         int fof_size,
                         int n_order) {

  FILE *new_temp_file;
  char temp_string[1000];
  char *new_temp_filename;
  FILE **temp_file;
  char **temp_filename;
  wordid_t **current_ngram;
  wordid_t *smallest_ngram;
  wordid_t *previous_ngram;
  int *current_ngram_count;
  flag *finished;
  flag all_finished;
  int temp_count;
  int num_files;
  int name_len;
  int i,j;
  flag first_ngram;
  fof_t **fof_array;
  ngram_sz_t *num_kgrams;
  int *ng_count;
  int pos_of_novelty;

  n = n_order;
  num_files = end_file-start_file+1;

  pos_of_novelty = n; /* Simply for warning-free compilation */
  temp_count = 0;     /* read after the merge even on the two-pass path */

  num_kgrams = (ngram_sz_t *) rr_calloc(n-1,sizeof(ngram_sz_t));
  ng_count = (int *) rr_calloc(n-1,sizeof(int));
  first_ngram = 1;

  previous_ngram = (wordid_t *) rr_calloc(n,sizeof(wordid_t));
  temp_file = (FILE **) rr_malloc(sizeof(FILE *) * num_files);
  temp_filename = (char **) rr_malloc(sizeof(char *) * num_files);

  /* should change to 2d array*/
  current_ngram = (wordid_t **) rr_malloc(sizeof(wordid_t *) * num_files);
  for (i=0;i<num_files;i++) {
    current_ngram[i] = (wordid_t *) rr_malloc(sizeof(wordid_t)*n);
  }

  current_ngram_count = (int *) rr_malloc(sizeof(int)*num_files);
  finished = (flag *) rr_malloc(sizeof(flag)*num_files);
  smallest_ngram = (wordid_t *) rr_malloc(sizeof(wordid_t)*n);

  /* should change to 2d array*/
  fof_array = (fof_t **) rr_malloc(sizeof(fof_t *)*(n-1));
  for (i=0;i<=n-2;i++) {
    fof_array[i] = (fof_t *) rr_calloc(fof_size+1,sizeof(fof_t));
  }

  if (num_files > max_files) {

    /* Too many files for one pass: merge the first max_files of them into
       a new temporary file, then merge that file with the rest. */
    name_len = snprintf(temp_string,sizeof(temp_string),"%s/%hu%s",
                        temp_file_root,end_file+1,temp_file_ext);
    if (name_len < 0 || name_len >= (int) sizeof(temp_string)) {
      quit(-1,"Error : Temporary file name is too long.\n");
    }

    new_temp_filename = salloc(temp_string);
    new_temp_file = rr_oopen(new_temp_filename);

    /* The intermediate file is read back with rr_fread below, so it has to
       be written in binary whatever format the final output uses. */
    merge_idngramfiles(start_file,start_file+max_files-1,
                       temp_file_root,temp_file_ext,max_files,
                       new_temp_file,0,0,n_order);

    /* Close it so that everything is on disk before it is reopened for
       reading by the second pass. */
    fclose(new_temp_file);

    merge_idngramfiles(start_file+max_files,end_file+1,
                       temp_file_root,temp_file_ext,max_files,
                       outfile,write_ascii,0,n_order);

  } else {

    /* Open all the temp files for reading */
    for (i=0;i<num_files;i++) {
      name_len = snprintf(temp_string,sizeof(temp_string),"%s/%hu%s",
                          temp_file_root,i+start_file,temp_file_ext);
      if (name_len < 0 || name_len >= (int) sizeof(temp_string)) {
        quit(-1,"Error : Temporary file name is too long.\n");
      }
      temp_filename[i] = salloc(temp_string);
      temp_file[i] = rr_iopen(temp_filename[i]);
    }

    /* Now go through the files simultaneously, and write out the
       appropriate ngram counts to the output file. */

    /* Prime every file with its first record.  A file that is already at
       end of file has nothing to contribute and is marked finished so its
       uninitialised buffer never takes part in a comparison. */
    all_finished = 1;
    for (i=num_files-1;i>=0;i--) {
      if (rr_feof(temp_file[i])) {
        finished[i] = 1;
      } else {
        finished[i] = 0;
        all_finished = 0;
        for (j=0;j<=n-1;j++) {
          rr_fread((char*) &current_ngram[i][j], sizeof(wordid_t),1,
                   temp_file[i],"temporary n-gram ids",0);
        }
        rr_fread((char*) &current_ngram_count[i], sizeof(int),1,
                 temp_file[i],"temporary n-gram counts",0);
      }
    }

    while (!all_finished) {

      /* Find the smallest current ngram */
      for (i=0;i<=n-1;i++) {
        smallest_ngram[i] = MAX_WORDID;
      }

      for (i=0;i<num_files;i++) {
        if (!finished[i]) {
          if (compare_ngrams3(smallest_ngram,current_ngram[i]) < 0) {
            for (j=0;j<n;j++) {
              smallest_ngram[j] = current_ngram[i][j];
            }
          }
        }
      }

#if MAX_VOCAB_SIZE < 65535
      /* This check is well-meaning but completely useless since
         smallest_ngram[i] by definition cannot contain any value greater
         than MAX_VOCAB_SIZE (dhuggins@cs, 2006-03) */
      for (i=0;i<=n-1;i++) {
        if (smallest_ngram[i] > MAX_VOCAB_SIZE) {
          quit(-1,"Error : Temporary files corrupted, invalid n-gram found.\n");
        }
      }
#endif

      /* For each of the files that are currently holding this ngram,
         add its count to the temporary count, and read in a new ngram
         from the files. */
      temp_count = 0;

      for (i=0;i<num_files;i++) {
        if (!finished[i]) {
          if (compare_ngrams3(smallest_ngram,current_ngram[i]) == 0) {
            temp_count = temp_count + current_ngram_count[i];
            if (!rr_feof(temp_file[i])) {
              for (j=0;j<=n-1;j++) {
                rr_fread((char*) &current_ngram[i][j],sizeof(wordid_t),1,
                         temp_file[i],"temporary n-gram ids",0);
              }
              rr_fread((char*)&current_ngram_count[i],sizeof(int),1,
                       temp_file[i],"temporary n-gram count",0);
            } else {
              /* This file is used up; stop once every file is. */
              finished[i] = 1;
              all_finished = 1;
              for (j=0;j<num_files;j++) {
                if (!finished[j]) {
                  all_finished = 0;
                }
              }
            }
          }
        }
      }

      if (write_ascii) {
        for (i=0;i<=n-1;i++) {
          if (fprintf(outfile,"%d ",smallest_ngram[i]) < 0) {
            quit(-1,"Write error encountered while attempting to merge temporary files.\nAborting, but keeping temporary files.\n");
          }
        }
        if (fprintf(outfile,"%d\n",temp_count) < 0) {
          quit(-1,"Write error encountered while attempting to merge temporary files.\nAborting, but keeping temporary files.\n");
        }
      } else {
        for (i=0;i<=n-1;i++) {
          rr_fwrite((char*)&smallest_ngram[i],sizeof(wordid_t),1,
                    outfile,"n-gram ids");
        }
        rr_fwrite((char*)&temp_count,sizeof(count_t),1,outfile,"n-gram counts");
      }

      if (fof_size > 0 && n>1) { /* Add stuff to fof arrays */

        /* Code from idngram2stats */
        /* First position at which this n-gram exceeds the previous one. */
        pos_of_novelty = n;
        for (i=0;i<=n-1;i++) {
          if (smallest_ngram[i] > previous_ngram[i]) {
            pos_of_novelty = i;
            break;
          }
        }

        /* Add new N-gram */
        num_kgrams[n-2]++;
        if (temp_count <= fof_size) {
          fof_array[n-2][temp_count]++;
        }

        if (!first_ngram) {
          for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
            num_kgrams[i-1]++;
            if (ng_count[i-1] <= fof_size) {
              fof_array[i-1][ng_count[i-1]]++;
            }
            ng_count[i-1] = temp_count;
          }
        } else {
          for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
            ng_count[i-1] = temp_count;
          }
          first_ngram = 0;
        }

        for (i=0;i<=pos_of_novelty-2;i++) {
          ng_count[i] += temp_count;
        }
        for (i=0;i<=n-1;i++) {
          previous_ngram[i]=smallest_ngram[i];
        }
      }
    }

    for (i=0;i<num_files;i++) {
      fclose(temp_file[i]);
      remove(temp_filename[i]);
    }
  }

  if (fof_size > 0 && n>1) { /* Display fof arrays */

    /* Process last ngram */
    /* NOTE(review): as in idngram2stats this flush reuses pos_of_novelty of
       the last n-gram merged, so accumulators of shorter unchanged prefixes
       are not recorded here - confirm that this is intended. */
    for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
      num_kgrams[i-1]++;
      if (ng_count[i-1] <= fof_size) {
        fof_array[i-1][ng_count[i-1]]++;
      }
      ng_count[i-1] = temp_count;
    }

    for (i=0;i<=pos_of_novelty-2;i++) {
      ng_count[i] += temp_count;
    }

    display_fof_array(num_kgrams,fof_array,fof_size,stderr, n);
  }
}