/************************************************************************
 Function:     main
 
 Description:  Main entry point
 Inputs:       int argc, char *argv[] - main input params
 Output:       int - always 0.
 Comments:     none.
 ***********************************************************************/
int main(int argc, char **argv) 
{
  // Parse command line
  unsigned int frame_rate;
  double min_phoneme_length, max_phoneme_length;
  string silence_symbol;
  bool remove_silence;
  double C = 0.0;
  double beta1, beta2, beta3;
  string scores_filelist;
  string dists_filelist;
  string phonemes_filelist;
  string start_times_filelist;
  string phonemes_filename;
  string phoneme_stats_filename;
  string classifier_filename;
  string output_align;
  string output_confidence;
  
  learning::cmd_line cmdline;
  cmdline.info("Forced Alignment based on Passive-Aggressive");
  cmdline.add("-frame_rate", "frame rate (shift) in msec [10]",&frame_rate,10);
  cmdline.add("-min_phoneme_length", "min. phoneme duration in msec [20]", &min_phoneme_length, 20);
  cmdline.add("-max_phoneme_length", "max. phoneme duration in msec [330]", &max_phoneme_length, 330);
  cmdline.add("-silence_symbol", "silence symbol [sil]", &silence_symbol, "sil");
  cmdline.add("-remove_silence", "remove pre/post silence from data", &remove_silence, false);  
  cmdline.add("-beta1", "weight of the distance feature", &beta1, 1.0);
  cmdline.add("-beta2", "weight of the duration feature", &beta2, 1.0);
  cmdline.add("-beta3", "weight of the speaking rate feature", &beta3, 1.0);
  cmdline.add("-output_align", "file list where the forced alignemnt is written", &output_align, "");
  cmdline.add("-output_confidence", "single file where the forced alignemnt confidence is written", &output_confidence, "");
  cmdline.add_master_option("scores_filelist", &scores_filelist);
  cmdline.add_master_option("dists_filelist", &dists_filelist);
  cmdline.add_master_option("phonemes_filelist", &phonemes_filelist);  
  cmdline.add_master_option("start_times_filelist [null]", &start_times_filelist);
  cmdline.add_master_option("phonemes", &phonemes_filename);	
  cmdline.add_master_option("phoneme-stats", &phoneme_stats_filename);
  cmdline.add_master_option("classifier", &classifier_filename);
  int rc = cmdline.parse(argc, argv);
  if (rc < 7) {
    cmdline.print_help();
    return EXIT_FAILURE;
  }
  
  // phoneme symbol to number mapping (Lee and Hon, 89)
  PhonemeSequence::load_phoneme_map(phonemes_filename, silence_symbol);
  
  // Initiate classifier
  Classifier classifier(frame_rate, min_phoneme_length, max_phoneme_length, C, beta1, beta2, beta3, 0.0);
  classifier.load(classifier_filename);
  classifier.load_phoneme_stats(phoneme_stats_filename);
  
  // begining of the training set
  Dataset test_dataset(scores_filelist, dists_filelist, phonemes_filelist, start_times_filelist);
  
  int num_boundaries = 0;
  int cummulative_loss = 0;
  int cum_loss_less_than[NUM_CUM_LOSS_RESOLUTIONS+1];
  for (uint t=1; t <= NUM_CUM_LOSS_RESOLUTIONS; t++)
    cum_loss_less_than[t] = 0;
  

  StringVector output_align_files;
  if (output_align != "") 
    output_align_files.read(output_align);

  ofstream output_confidence_ofs;
  if (output_confidence != "") {
    output_confidence_ofs.open(output_confidence.c_str());
    if (!output_confidence_ofs.good()) {
      cerr << "Error: unable to open " << output_confidence << "for writing." << endl;
    }
  }
  
  // Run over all dataset
  for (uint i=0; i <  test_dataset.size(); i++) {
    
    SpeechUtterance x;
    StartTimeSequence y;
    StartTimeSequence y_hat;
    
    cout << "==================================================================================" << endl;
    
    // read next example for dataset
    test_dataset.read(x, y, remove_silence);
    y_hat.resize(x.phonemes.size());
    
    // predict label 
    double confidence = classifier.predict(x, y_hat);
    cout << "phonemes=" << x.phonemes << endl;
    if (test_dataset.labels_given())
      cout << "alignment= " << y << endl;
    cout << "predicted= " << y_hat << endl;
    cout << "confidence= " << confidence << endl;

    if (output_align != "") {
      ofstream output_align_ofs(output_align_files[i].c_str());
      if (output_align_ofs.good()) { 
        for (uint j=0; j < y_hat.size(); j++) {
          output_align_ofs << y_hat[j] << endl;
        }
        output_align_ofs.close();
      }
    }
    if (output_confidence != "" && output_confidence_ofs.good()) 
      output_confidence_ofs << confidence << endl;
    
    // calculate the error
    if (test_dataset.labels_given()) {
      int file_loss = 0;
      int cur_loss;
      for (unsigned int j=0; j < y.size(); ++j) {
        if (y[j] > y_hat[j]) {
          cur_loss = y[j] - y_hat[j];
        } else {
          cur_loss = y_hat[j] - y[j];
        }
        file_loss += cur_loss;
        cummulative_loss += cur_loss;
        for (int t=1; t <= NUM_CUM_LOSS_RESOLUTIONS; t++)
          if ( cur_loss <= t ) cum_loss_less_than[t]++;
      }
      num_boundaries += y.size();
      cout << "File loss = " << file_loss/double(y.size()) << endl;
      cout << "Cum loss = " << cummulative_loss/double(num_boundaries) << endl;
      for (uint t = NUM_CUM_LOSS_RESOLUTIONS; t >= 1; t--) {
        cout << "% Boundaries (t <= " << t*frame_rate << "ms) = " 
        << 100.0*cum_loss_less_than[t]/double(num_boundaries) << "\n";
      }
      cout << endl;
    }
  }
  
  if (output_confidence != "" && output_confidence_ofs.good()) 
    output_confidence_ofs.close();

  cout << "Done." << endl;  
  
  return EXIT_SUCCESS;
  
}
// Example #2
// 0
/************************************************************************
 Function:     Dataset::read
 
 Description:  Read next instance and label, then advance current_file.
 Inputs:       SpeechUtterance &x     - filled by x.read() from the scores,
                                        distances and phonemes files
               StartTimeSequence &y   - read from the start-times file when
                                        read_labels is set; otherwise only
                                        resized to the number of phonemes
               bool remove_silence    - when true, drop the first and last
                                        phoneme and cut the scores/distances
                                        matrices and y accordingly
               bool enable_printouts  - when true, print the names of the
                                        files being read to stdout
 Output:       uint - always 0 (sil_offset is never updated).
 Comments:     Silence removal needs at least two start times and two
               phonemes; otherwise a warning is written to stderr and the
               utterance is returned untrimmed.
 ***********************************************************************/
uint Dataset::read(SpeechUtterance &x, StartTimeSequence &y, bool remove_silence, bool enable_printouts)
{
  uint sil_offset = 0;
  
  if (single_file_mode) {
    // Scores and distances always come from the first list entry here; only
    // the phonemes file advances with current_file.
    if (enable_printouts) {
      std::cout << "current file=" << current_file << std::endl;
      std::cout << scores_file_list[0] << std::endl;
      std::cout << dists_file_list[0] << std::endl;
      // log the phonemes file that is actually read below
      std::cout << phonemes_file_list[current_file] << std::endl;
    }
    x.read(scores_file_list[0], dists_file_list[0],
           phonemes_file_list[current_file],single_file_mode);
  }
  else {
    if (enable_printouts) {
      std::cout << "current file=" << current_file << std::endl;
      std::cout << scores_file_list[current_file] << std::endl;
      std::cout << dists_file_list[current_file] << std::endl;
      std::cout << phonemes_file_list[current_file] << std::endl;
    }
    x.read(scores_file_list[current_file], dists_file_list[current_file],
           phonemes_file_list[current_file],single_file_mode);
  }
  
  if (read_labels) {
    if (enable_printouts)
      std::cout << start_times_file_list[current_file] << std::endl;
    y.read(start_times_file_list[current_file]);
  }
  else {
    y.resize(x.phonemes.size());
  }
    
  ++current_file;
  
  
  if (remove_silence) {
    ///////////////////////////
    // the code below is used to remove the leading silence at the 
    // beginning of each utterance and the following silence at the end

    // y[1], y.size()-2 and x.phonemes.size()-2 below are only valid with at
    // least two entries (the sizes are unsigned and would wrap around), and
    // the new matrix height y[last]-y[1] must not be negative.
    if (y.size() < 2 || x.phonemes.size() < 2 || y[y.size()-1] < y[1]) {
      std::cerr << "Warning: cannot remove silence from utterance with "
                << x.phonemes.size() << " phonemes and " << y.size()
                << " start times; leaving it untrimmed." << std::endl;
      return sil_offset;
    }

    x.silence_offset = y[1];
    x.last_silence = y[y.size()-1];

    // keep y[last]-y[1] rows of the scores, starting at row y[1]
    // (presumably submatrix takes row, column, height, width -- TODO confirm)
    infra::matrix tmp1(x.scores);
    x.scores.resize(y[y.size()-1]-y[1],tmp1.width());
    x.scores = tmp1.submatrix(y[1],0,x.scores.height(),x.scores.width());
    
    // remove silences: shift the phonemes one place down, drop the last two
    for (uint i=0;i<x.phonemes.size()-2;++i) 
      x.phonemes[i] = x.phonemes[i+1];
    x.phonemes.resize(x.phonemes.size()-2);
    
    // same cut for the distances matrix
    infra::matrix tmp2(x.distances);
    x.distances.resize(y[y.size()-1]-y[1],tmp2.width());
    x.distances = tmp2.submatrix(y[1],0,x.distances.height(),x.distances.width());
    
    // re-base the remaining start times so that the first one becomes 0
    uint tmp3 = y[1];
    for (uint i=0;i<y.size()-2;++i)
      y[i] = y[i+1]-tmp3;
    y.resize(y.size()-2);
    ///////////////////////////
  }
  return sil_offset;
}