////////////////////////////////////////////////////////////////////////////////
// learn_errors
//
// Correct reads using a much stricter filter in order to count the nt->nt
// errors and learn the error probabilities.
//
// The reads file 'fqf' is processed in chunks: 'starts' holds the stream
// offset of each chunk and 'counts' the number of reads in it.  Every OpenMP
// thread opens its own stream on the file and repeatedly claims the next
// unprocessed chunk.  For every read with at least one untrusted kmer, a Read
// is built and corrected; each resulting correction at a position whose
// observed code is < 4 increments
//   ntnt_counts[quality][corrected nt][observed nt]
// and the shared sample counter.  A thread stops working on its current chunk
// once more than 200000 samples have been collected.  Finally the counts are
// handed to regress_probs() (which receives 'ntnt_prob') and output_model().
//
// 'trusted' is the trusted kmer structure queried with check(); 'prior_prob'
// is forwarded to Read::correct().
//
// NOTE(review): the 4th argument 'true' to Read::correct() presumably selects
// the stricter "learning" mode mentioned above -- confirm against Read.
// NOTE(review): characters not present in 'nts' make strchr() return NULL, so
// 'nti - nts' is meaningless for them; input is assumed to contain only
// characters from 'nts'.
////////////////////////////////////////////////////////////////////////////////
static void learn_errors(string fqf, bithash * trusted, vector<streampos> & starts, vector<unsigned long long> & counts, double ntnt_prob[Read::max_qual][4][4], double prior_prob[4]) {
  unsigned int ntnt_counts[Read::max_qual][4][4] = {0};
  unsigned int samples = 0;

  unsigned int chunk = 0;
#pragma omp parallel //shared(trusted)
  {
    unsigned int tchunk;
    string header,ntseq,strqual,corseq;
    int trim_length;
    char* nti;
    ifstream reads_in(fqf.c_str());

    while(true) {
      // Claim the next chunk atomically, then test the claimed index.
      // Testing the shared counter before claiming lets several threads
      // pass the test and run past the end of 'starts'.
#pragma omp critical
      tchunk = chunk++;

      if(tchunk >= threads*chunks_per_thread || tchunk >= starts.size() || tchunk >= counts.size())
        break;

      // A previous short read leaves failbit set, which would turn every
      // later seekg() on this stream into a no-op.
      reads_in.clear();
      reads_in.seekg(starts[tchunk]);

      unsigned long long tcount = 0;
      while(getline(reads_in, header)) {
        // get sequence
        getline(reads_in, ntseq);

        // convert ntseq to iseq (index of each character within 'nts')
        vector<unsigned int> iseq;
        iseq.reserve(ntseq.size());
        for(string::size_type i = 0; i < ntseq.size(); i++) {
          nti = strchr(nts, ntseq[i]);
          iseq.push_back(nti - nts);
        }

        // get quality values: the first line read here is the separator
        // line and is immediately overwritten by the quality string
        getline(reads_in,strqual);
        getline(reads_in,strqual);

        vector<int> untrusted;

        if(iseq.size() < trim_t)
          trim_length = 0;
        else {
          // record the start of every kmer that is not trusted
          for(int i = 0; i < iseq.size()-k+1; i++) {
            if(!trusted->check(&iseq[i])) {
              untrusted.push_back(i);
            }
          }

          trim_length = quick_trim(strqual, untrusted);
        }

        // fix error reads
        if(untrusted.size() > 0) {
          // correct; automatic storage so the Read is released on every path
          Read read(header, &iseq[0], strqual, untrusted, trim_length);
          corseq = read.correct(trusted, ntnt_prob, prior_prob, true);

          // only count if trimmed to long enough; without a trusted_read
          // there is no guarantee there was a correction
          if(corseq.size() >= trim_t && read.trusted_read != 0) {
            for(int c = 0; c < read.trusted_read->corrections.size(); c++) {
              const correction & cor = read.trusted_read->corrections[c];
              if(iseq[cor.index] < 4) {
                // P(obs=o|actual=a,a!=o) for Bayes
                // The table and the sample counter are shared by all
                // threads, so the increments must be atomic.
#pragma omp atomic
                ntnt_counts[strqual[cor.index]-Read::quality_scale][cor.to][iseq[cor.index]]++;

#pragma omp atomic
                samples++;
              }
            }
          }
        }

        // snapshot the shared counter without racing concurrent updates
        unsigned int samples_seen;
#pragma omp atomic read
        samples_seen = samples;

        if(++tcount == counts[tchunk] || samples_seen > 200000)
          break;
      }
    }
  }

  regress_probs(ntnt_prob, ntnt_counts);

  output_model(ntnt_prob, ntnt_counts, fqf);
}
////////////////////////////////////////////////////////////////////////////////
// correct_reads
//
// Correct the reads in the file 'fqf' using the data structure of trusted
// kmers 'trusted', matrix of nt->nt error rates 'ntnt_prob' and prior nt
// probabilities 'prior_prob'.  'starts' and 'counts' help openMP parallelize
// the read processing: 'starts' holds the stream offset of each chunk and
// 'counts' the number of reads in it.  If 'pe_code' is 0, the reads are not
// paired; if it's 1, this file is the first of a pair so print all reads and
// withhold combining; if it's 2, the file is the second of a pair so print all
// reads and then combine both 1 and 2.  ('pe_code' is only forwarded to
// output_read() here.)
//
// Output for chunk N is written to the file "N" inside a hidden directory
// named "." + the last path component of 'fqf', created in the current
// directory if it does not exist (the process exits if creation fails).  A
// chunk whose output file already exists is skipped unless 'overwrite_temp'
// is set.  When 'out_log' is set, a "N.log" stream is opened alongside and
// passed to output_read().
//
// Per-thread statistics are summed and written to a "stats.txt" file whose
// name is derived from 'fqf' by replacing everything after its last '.'.
//
// NOTE(review): characters not present in 'nts' make strchr() return NULL, so
// 'nti - nts' is meaningless for them; input is assumed to contain only
// characters from 'nts'.
////////////////////////////////////////////////////////////////////////////////
static void correct_reads(string fqf, int pe_code, bithash * trusted, vector<streampos> & starts, vector<unsigned long long> & counts, double ntnt_prob[Read::max_qual][4][4], double prior_prob[4]) {
  // output directory
  struct stat st_file_info;
  string path_suffix = split(fqf,'/').back();
  string out_dir("."+path_suffix);
  if(stat(out_dir.c_str(), &st_file_info) == 0) {
    cerr << "Hidden temporary directory " << out_dir << " already exists and will be used" << endl;
  } else {
    if(mkdir(out_dir.c_str(), S_IRWXU) == -1) {
      cerr << "Failed to create hidden temporary directory " << out_dir << endl;
      exit(EXIT_FAILURE);
    }
  }

  // collect stats: one slot per thread, owned by a vector so the storage is
  // released when the function returns
  const int max_threads = omp_get_max_threads();
  vector<stats> thread_stats(max_threads);

  unsigned int chunk = 0;
#pragma omp parallel //shared(trusted)
  {
    int tid = omp_get_thread_num();

    // input
    ifstream reads_in(fqf.c_str());

    unsigned int tchunk;
    string header,ntseq,mid,strqual,corseq;
    int trim_length;
    char* nti;

    // stat() writes into its buffer, so each thread needs its own rather
    // than sharing the one declared outside the parallel region
    struct stat chunk_file_info;

#pragma omp critical
    tchunk = chunk++;

    while(tchunk < starts.size()) {
      // A previous short read leaves failbit set, which would turn every
      // later seekg() on this stream into a no-op.
      reads_in.clear();
      reads_in.seekg(starts[tchunk]);

      // output file is named after the chunk index
      string toutf(out_dir+"/");
      stringstream tconvert;
      tconvert << tchunk;
      toutf += tconvert.str();

      // skip chunks already written by an earlier run unless overwriting
      if(overwrite_temp || stat(toutf.c_str(), &chunk_file_info) == -1) {
        ofstream reads_out(toutf.c_str());

        // output log
        string tlogf = toutf + ".log";
        ofstream corlog_out;
        if(out_log) {
          corlog_out.open(tlogf.c_str());
        }

        unsigned long long tcount = 0;
        while(getline(reads_in, header)) {
          // get sequence
          getline(reads_in, ntseq);

          // convert ntseq to iseq (index of each character within 'nts')
          vector<unsigned int> iseq;
          iseq.reserve(ntseq.size());
          for(string::size_type i = 0; i < ntseq.size(); i++) {
            nti = strchr(nts, ntseq[i]);
            iseq.push_back(nti - nts);
          }

          // get separator line and quality values
          getline(reads_in,mid);
          getline(reads_in,strqual);

          vector<int> untrusted;

          if(iseq.size() < trim_t)
            trim_length = 0;
          else {
            // record the start of every kmer that is not trusted
            for(int i = 0; i < iseq.size()-k+1; i++) {
              if(!trusted->check(&iseq[i])) {
                untrusted.push_back(i);
              }
            }

            trim_length = quick_trim(strqual, untrusted);
          }

          if(untrusted.size() > 0) {
            // fix error reads; automatic storage so the Read is released
            // on every path
            Read read(header, &iseq[0], strqual, untrusted, trim_length);
            corseq = read.correct(trusted, ntnt_prob, prior_prob);

            // output read w/ trim and corrections
            output_read(reads_out, corlog_out, pe_code, header, ntseq, mid, strqual, corseq, thread_stats[tid]);
          } else {
            // nothing to correct: output read as trimmed
            output_read(reads_out, corlog_out, pe_code, header, ntseq, mid, strqual, ntseq.substr(0,trim_length), thread_stats[tid]);
          }

          if(++tcount == counts[tchunk])
            break;
        }
        reads_out.close();
      }

#pragma omp critical
      tchunk = chunk++;
    }
  }

  // combine stats into slot 0
  for(int i = 1; i < max_threads; i++) {
    thread_stats[0].validated += thread_stats[i].validated;
    thread_stats[0].corrected += thread_stats[i].corrected;
    thread_stats[0].trimmed += thread_stats[i].trimmed;
    thread_stats[0].trimmed_only += thread_stats[i].trimmed_only;
    thread_stats[0].removed += thread_stats[i].removed;
  }

  // print stats: replace whatever follows the last '.' of 'fqf' with
  // "stats.txt", or append ".stats.txt" when there is no '.'
  const string::size_type suffix_index = fqf.rfind('.');
  string outf;
  if(suffix_index == string::npos) {
    outf = fqf+".stats.txt";
  } else {
    outf = fqf.substr(0,suffix_index+1) + "stats.txt";
  }
  ofstream stats_out(outf.c_str());
  stats_out << "Validated: " << thread_stats[0].validated << '\n';
  stats_out << "Corrected: " << thread_stats[0].corrected << '\n';
  stats_out << "Trimmed: " << thread_stats[0].trimmed << '\n';
  stats_out << "Trimmed only: " << thread_stats[0].trimmed_only << '\n';
  stats_out << "Removed: " << thread_stats[0].removed << '\n';
  stats_out.close();
}