/************************************************************************ Function: main Description: Main entry point Inputs: int argc, char *argv[] - main input params Output: int - always 0. Comments: none. ***********************************************************************/ int main(int argc, char **argv) { // Parse command line unsigned int frame_rate; double min_phoneme_length, max_phoneme_length; string silence_symbol; bool remove_silence; double C = 0.0; double beta1, beta2, beta3; string scores_filelist; string dists_filelist; string phonemes_filelist; string start_times_filelist; string phonemes_filename; string phoneme_stats_filename; string classifier_filename; string output_align; string output_confidence; learning::cmd_line cmdline; cmdline.info("Forced Alignment based on Passive-Aggressive"); cmdline.add("-frame_rate", "frame rate (shift) in msec [10]",&frame_rate,10); cmdline.add("-min_phoneme_length", "min. phoneme duration in msec [20]", &min_phoneme_length, 20); cmdline.add("-max_phoneme_length", "max. phoneme duration in msec [330]", &max_phoneme_length, 330); cmdline.add("-silence_symbol", "silence symbol [sil]", &silence_symbol, "sil"); cmdline.add("-remove_silence", "remove pre/post silence from data", &remove_silence, false); cmdline.add("-beta1", "weight of the distance feature", &beta1, 1.0); cmdline.add("-beta2", "weight of the duration feature", &beta2, 1.0); cmdline.add("-beta3", "weight of the speaking rate feature", &beta3, 1.0); cmdline.add("-output_align", "file list where the forced alignemnt is written", &output_align, ""); cmdline.add("-output_confidence", "single file where the forced alignemnt confidence is written", &output_confidence, ""); cmdline.add_master_option("scores_filelist", &scores_filelist); cmdline.add_master_option("dists_filelist", &dists_filelist); cmdline.add_master_option("phonemes_filelist", &phonemes_filelist); cmdline.add_master_option("start_times_filelist [null]", &start_times_filelist); cmdline.add_master_option("phonemes", &phonemes_filename); cmdline.add_master_option("phoneme-stats", &phoneme_stats_filename); cmdline.add_master_option("classifier", &classifier_filename); int rc = cmdline.parse(argc, argv); if (rc < 7) { cmdline.print_help(); return EXIT_FAILURE; } // phoneme symbol to number mapping (Lee and Hon, 89) PhonemeSequence::load_phoneme_map(phonemes_filename, silence_symbol); // Initiate classifier Classifier classifier(frame_rate, min_phoneme_length, max_phoneme_length, C, beta1, beta2, beta3, 0.0); classifier.load(classifier_filename); classifier.load_phoneme_stats(phoneme_stats_filename); // begining of the training set Dataset test_dataset(scores_filelist, dists_filelist, phonemes_filelist, start_times_filelist); int num_boundaries = 0; int cummulative_loss = 0; int cum_loss_less_than[NUM_CUM_LOSS_RESOLUTIONS+1]; for (uint t=1; t <= NUM_CUM_LOSS_RESOLUTIONS; t++) cum_loss_less_than[t] = 0; StringVector output_align_files; if (output_align != "") output_align_files.read(output_align); ofstream output_confidence_ofs; if (output_confidence != "") { output_confidence_ofs.open(output_confidence.c_str()); if (!output_confidence_ofs.good()) { cerr << "Error: unable to open " << output_confidence << "for writing." << endl; } } // Run over all dataset for (uint i=0; i < test_dataset.size(); i++) { SpeechUtterance x; StartTimeSequence y; StartTimeSequence y_hat; cout << "==================================================================================" << endl; // read next example for dataset test_dataset.read(x, y, remove_silence); y_hat.resize(x.phonemes.size()); // predict label double confidence = classifier.predict(x, y_hat); cout << "phonemes=" << x.phonemes << endl; if (test_dataset.labels_given()) cout << "alignment= " << y << endl; cout << "predicted= " << y_hat << endl; cout << "confidence= " << confidence << endl; if (output_align != "") { ofstream output_align_ofs(output_align_files[i].c_str()); if (output_align_ofs.good()) { for (uint j=0; j < y_hat.size(); j++) { output_align_ofs << y_hat[j] << endl; } output_align_ofs.close(); } } if (output_confidence != "" && output_confidence_ofs.good()) output_confidence_ofs << confidence << endl; // calculate the error if (test_dataset.labels_given()) { int file_loss = 0; int cur_loss; for (unsigned int j=0; j < y.size(); ++j) { if (y[j] > y_hat[j]) { cur_loss = y[j] - y_hat[j]; } else { cur_loss = y_hat[j] - y[j]; } file_loss += cur_loss; cummulative_loss += cur_loss; for (int t=1; t <= NUM_CUM_LOSS_RESOLUTIONS; t++) if ( cur_loss <= t ) cum_loss_less_than[t]++; } num_boundaries += y.size(); cout << "File loss = " << file_loss/double(y.size()) << endl; cout << "Cum loss = " << cummulative_loss/double(num_boundaries) << endl; for (uint t = NUM_CUM_LOSS_RESOLUTIONS; t >= 1; t--) { cout << "% Boundaries (t <= " << t*frame_rate << "ms) = " << 100.0*cum_loss_less_than[t]/double(num_boundaries) << "\n"; } cout << endl; } } if (output_confidence != "" && output_confidence_ofs.good()) output_confidence_ofs.close(); cout << "Done." << endl; return EXIT_SUCCESS; }
/************************************************************************ Function: Dataset::read Description: Read next instance and label Inputs: SpeechUtterance& StartTimeSequence& Output: void. Comments: none. ***********************************************************************/ uint Dataset::read(SpeechUtterance &x, StartTimeSequence &y, bool remove_silence, bool enable_printouts) { uint sil_offset = 0; if (single_file_mode) { std::cout << "current file=" << current_file << std::endl; std::cout << scores_file_list[0] << std::endl; std::cout << dists_file_list[0] << std::endl; std::cout << phonemes_file_list[0] << std::endl; x.read(scores_file_list[0], dists_file_list[0], phonemes_file_list[current_file],single_file_mode); } else { if (enable_printouts) { std::cout << "current file=" << current_file << std::endl; std::cout << scores_file_list[current_file] << std::endl; std::cout << dists_file_list[current_file] << std::endl; std::cout << phonemes_file_list[current_file] << std::endl; } x.read(scores_file_list[current_file], dists_file_list[current_file], phonemes_file_list[current_file],single_file_mode); } if (read_labels) { std::cout << start_times_file_list[current_file] << std::endl; y.read(start_times_file_list[current_file]); } else { y.resize(x.phonemes.size()); } ++current_file; if (remove_silence) { /////////////////////////// // the code below is used to remove the leading silence at the // begining of each utterance and the following silnce at the end x.silence_offset = y[1]; x.last_silence = y[y.size()-1]; infra::matrix tmp1(x.scores); x.scores.resize(y[y.size()-1]-y[1],tmp1.width()); x.scores = tmp1.submatrix(y[1],0,x.scores.height(),x.scores.width()); // remove silences for (uint i=0;i<x.phonemes.size()-2;++i) x.phonemes[i] = x.phonemes[i+1]; x.phonemes.resize(x.phonemes.size()-2); infra::matrix tmp2(x.distances); x.distances.resize(y[y.size()-1]-y[1],tmp2.width()); x.distances = tmp2.submatrix(y[1],0,x.distances.height(),x.distances.width()); uint tmp3 = y[1]; for (uint i=0;i<y.size()-2;++i) y[i] = y[i+1]-tmp3; y.resize(y.size()-2); /////////////////////////// } return sil_offset; }