int file_prepare(void) { unsigned int i; char file_name[512]; for (i=0; i < num_files; i++) { snprintf(file_name, sizeof(file_name), "test_file.%d",i); /* remove test files for creation test if they exist */ if (test_mode == MODE_WRITE) unlink(file_name); log_text(LOG_DEBUG, "Opening file: %s",file_name); #ifndef _WIN32 files[i] = open(file_name, O_CREAT | O_RDWR | file_extra_flags, S_IRUSR | S_IWUSR); #else files[i] = CreateFile(file_name, GENERIC_READ|GENERIC_WRITE, 0, NULL, OPEN_ALWAYS, file_extra_flags? file_extra_flags : FILE_ATTRIBUTE_NORMAL, NULL); #endif if (!VALID_FILE(files[i])) { log_errno(LOG_FATAL, "Cannot open file"); return 1; } } #ifdef HAVE_MMAP if (file_mmap_prepare()) return 1; #endif pthread_mutex_init(&fsync_mutex, NULL); return 0; }
/*
 * Initialize the sentsim_lmtrlm procedure.
 *
 * Steps, in order:
 *   1. look up this module's parameters in the spec;
 *   2. open the document vector file (doc_fd);
 *   3. initialize the LM conversion module and the vec_vec procedure;
 *   4. open the collection statistics file and read two entries from it:
 *      COLLSTAT_NUMDOC (copied into collstats_numdocs) and COLLSTAT_TOTWT
 *      (collstats_freq is pointed at the entry's list, no copy is made);
 *   5. allocate p_w_Q with one float per collection frequency;
 *   6. cache the total document frequency and initialize the LDA estimator.
 *
 * The prefix argument is not used.
 *
 * Returns 0 on success, UNDEF on any failure (including an invalid
 * collstat_file name).
 */
int init_sentsim_lmtrlm (SPEC* spec, char* prefix)
{
    DIR_ARRAY stat_entry;

    PRINT_TRACE (2, print_string, "Trace: entering init_sentsim_lmtrlm");

    /* Resolve the values of the relevant parameters */
    if (lookup_spec (spec, &spec_args[0], num_spec_args) == UNDEF)
        return (UNDEF);

    doc_fd = open_vector (dvec_file, dvec_file_mode);
    if (doc_fd == UNDEF)
        return (UNDEF);

    /* Bring up the LM conversion module */
    if (init_lang_model_wt_lm (spec, NULL) == UNDEF)
        return UNDEF;

    vec_vec_inst = vec_vec_ptab->init_proc (spec, NULL);
    if (vec_vec_inst == UNDEF)
        return (UNDEF);

    /* Collection statistics are mandatory for this procedure */
    if (! VALID_FILE (collstat_file))
        return UNDEF;

    collstats_fd = open_dir_array (collstat_file, collstat_mode);
    if (collstats_fd == UNDEF)
        return (UNDEF);

    /* Entry 1: total number of documents in the collection */
    stat_entry.id_num = COLLSTAT_NUMDOC;
    if (1 != seek_dir_array (collstats_fd, &stat_entry))
        return UNDEF;
    if (1 != read_dir_array (collstats_fd, &stat_entry))
        return UNDEF;
    memcpy (&collstats_numdocs, stat_entry.list, sizeof(long));

    /* Entry 2: collection frequencies (COLLSTAT_TOTWT) */
    stat_entry.id_num = COLLSTAT_TOTWT;
    if (1 != seek_dir_array (collstats_fd, &stat_entry) ||
        1 != read_dir_array (collstats_fd, &stat_entry)) {
        /* Leave the frequency globals in a well-defined empty state */
        collstats_freq = NULL;
        collstats_num_freq = 0;
        return UNDEF;
    }

    /* The frequency table refers directly to the list that was just read */
    collstats_freq = (float*) stat_entry.list;
    collstats_num_freq = stat_entry.num_list / sizeof (float);

    p_w_Q = (float*) malloc (collstats_num_freq * sizeof(float));
    if (NULL == p_w_Q)
        return UNDEF;

    totalDocFreq = getTotalDocumentFreq();

    if (init_lda_est (&ldamodel, spec) == UNDEF)
        return UNDEF;

    PRINT_TRACE (2, print_string, "Trace: leaving init_sentsim_lmtrlm");
    return 0;
}
// Read from collstat file the collection frequency in the global // variable collstats_info. int init_lang_model_wt_cf_lm_nsim(SPEC* spec, char* unused) { DIR_ARRAY dir_array; long ctype ; char conceptName[256]; long i; PRINT_TRACE (2, print_string, "Trace: entering init_lang_model_wt_cf_lm_nsim"); // Intialize buffer to copy invec's term weights into outvec's ones num_conwt_buf = 8096; if ( NULL == (conwt_buf = (CON_WT *) malloc (num_conwt_buf * sizeof (CON_WT))) ) return (UNDEF); // Get the number of concepts if (UNDEF == lookup_spec_docdesc (spec, &doc_desc)) return (UNDEF); // Initialize the array of fds for opening the collection files collstats_fd = (int*) malloc (sizeof(int) * doc_desc.num_ctypes) ; if ( collstats_fd == NULL ) return UNDEF ; collstats_info = (struct coll_info*) malloc (sizeof(struct coll_info) * doc_desc.num_ctypes) ; if ( collstats_info == NULL ) return UNDEF ; if (UNDEF == lookup_spec (spec, &spec_args[0], num_spec_args)) { return (UNDEF); } // For each concept, collect the collection frequency for (ctype = 0; ctype < doc_desc.num_ctypes; ctype++) { snprintf(conceptName, sizeof(conceptName), "ctype.%ld.", ctype) ; prefix = conceptName ; if (UNDEF == lookup_spec_prefix (spec, spec_prefix_args, num_spec_prefix_args)) return (UNDEF); if (! VALID_FILE (collstat_file)) { collstats_info[ctype].freq = NULL ; collstats_info[ctype].num_freq = 0; } else { if (UNDEF == (collstats_fd[ctype] = open_dir_array (collstat_file, collstat_mode))) return (UNDEF); // Get the frequency mode to use dir_array.id_num = COLLSTAT_TOTWT ; // Get the collection frequency list from the file if (1 != seek_dir_array (collstats_fd[ctype], &dir_array) || 1 != read_dir_array (collstats_fd[ctype], &dir_array)) { collstats_info[ctype].freq = NULL; collstats_info[ctype].num_freq = 0; } else { // Read from file successful. 
Allocate 'freq' array and dump the // contents of the file in this list if (NULL == (collstats_info[ctype].freq = (float *) malloc ((unsigned) dir_array.num_list))) return (UNDEF); (void) bcopy (dir_array.list, (char *) collstats_info[ctype].freq, dir_array.num_list); collstats_info[ctype].num_freq = dir_array.num_list / sizeof (float); } if (UNDEF == close_dir_array (collstats_fd[ctype])) return (UNDEF); } } totalDocFreq = (float*) malloc (sizeof(float) * doc_desc.num_ctypes); // Compute the total Document Frequency only once since it is a constant. for (i = 0; i < doc_desc.num_ctypes; i++) { totalDocFreq[i] = getTotalDocFreq(i); } PRINT_TRACE (2, print_string, "Trace: leaving init_lang_model_wt_cf_lm_nsim"); return (1); }