vector<T> smoothenData(const vector<T> &rawData, int32 support) {
    // Smooths rawData by convolving it with a Gaussian kernel of width
    // 2*support + 1. The first and last `support` samples are mirrored at the
    // boundaries so the output has the same length as the input.
    //
    // @param rawData the input signal, one sample per element
    // @param support half-width of the Gaussian kernel; valid range is
    //                0 <= support <= rawData.size()
    // @return a vector of rawData.size() smoothed values

    std::vector<T> smoothData(rawData.size());

    // Guard against invalid support values: a negative support would request a
    // negative kernel size and wrap the tempData allocation, and a support
    // larger than the signal would make the mirror copies below read/write out
    // of bounds (undefined behaviour in the original). In either case return
    // the data unchanged.
    if (support < 0 || static_cast<size_t>(support) > rawData.size()) {
        std::copy(rawData.begin(), rawData.end(), smoothData.begin());
        return smoothData;
    }

    vector<float> gaussKernel = formGaussKernel(2*support + 1);

    //Create a temporary vector with mirrored boundaries so the kernel never
    //runs off the ends of the signal
    vector<T> tempData(rawData.size() + 2*support);
    std::copy(rawData.begin(), rawData.end(), tempData.begin() + support);
    std::reverse_copy(rawData.begin(), rawData.begin() + support, tempData.begin());
    std::reverse_copy(rawData.end() - support, rawData.end(), tempData.end() - support);

    // Discrete convolution of the padded signal with the Gaussian kernel
    for (size_t i = 0; i < smoothData.size(); i++) {
        T tempVec = initVal<T>();
        for (int j = -support; j <= support; j++) {
            // j + support >= 0, so the index is always in range
            tempVec += tempData[i + (j + support)] * gaussKernel[j + support];
        }
        smoothData[i] = tempVec;
    }
    return smoothData;
}
bool DTW::predict(MatrixDouble inputTimeSeries){

    // Classifies the input time series by computing the DTW distance to every
    // stored template, selecting the closest one, and (optionally) applying
    // null rejection. Updates predictedClassLabel, bestDistance, maxLikelihood,
    // classDistances and classLikelihoods as side effects.
    //
    // @param inputTimeSeries the time series to classify; must have
    //        numFeatures columns
    // @return true if a prediction was made, false if the model is untrained,
    //         the dimensions mismatch, or the rejection mode is unknown

    if( !trained ){ errorLog << "predict(Matrix<double> &inputTimeSeries) - The DTW templates have not been trained!" << endl; return false; }

    // Make sure the result buffers match the current number of templates
    if( classLikelihoods.size() != numTemplates ) classLikelihoods.resize(numTemplates);
    if( classDistances.size() != numTemplates ) classDistances.resize(numTemplates);

    // Reset the prediction state before computing the new result
    predictedClassLabel = 0;
    maxLikelihood = DEFAULT_NULL_LIKELIHOOD_VALUE;
    for(UINT k=0; k<classLikelihoods.size(); k++){
        classLikelihoods[k] = 0;
        classDistances[k] = DEFAULT_NULL_LIKELIHOOD_VALUE;
    }

    if( numFeatures != inputTimeSeries.getNumCols() ){
        errorLog << "predict(Matrix<double> &inputTimeSeries) - The number of features in the model (" << numFeatures << ") do not match that of the input time series (" << inputTimeSeries.getNumCols() << ")" << endl;
        return false;
    }

    //Perform any preprocessing if required. timeSeriesPtr always points at the
    //most recently processed version of the input.
    MatrixDouble *timeSeriesPtr = &inputTimeSeries;
    MatrixDouble processedTimeSeries;
    MatrixDouble tempMatrix;
    if(useScaling){
        scaleData(*timeSeriesPtr,processedTimeSeries);
        timeSeriesPtr = &processedTimeSeries;
    }

    //Normalize the data if needed
    if( useZNormalisation ){
        // NOTE(review): when useScaling is also set, source and destination are
        // the same matrix here - confirm znormData supports in-place operation.
        znormData(*timeSeriesPtr,processedTimeSeries);
        timeSeriesPtr = &processedTimeSeries;
    }

    //Smooth the data if required
    if( useSmoothing ){
        smoothData(*timeSeriesPtr,smoothingFactor,tempMatrix);
        timeSeriesPtr = &tempMatrix;
    }

    //Offset the timeseries if required
    if( offsetUsingFirstSample ){
        offsetTimeseries( *timeSeriesPtr );
    }

    //Make the prediction by finding the closest template
    double sum = 0;
    if( distanceMatrices.size() != numTemplates ) distanceMatrices.resize( numTemplates );
    if( warpPaths.size() != numTemplates ) warpPaths.resize( numTemplates );

    //Test the timeSeries against all the templates in the timeSeries buffer
    for(UINT k=0; k<numTemplates; k++){
        //Perform DTW
        classDistances[k] = computeDistance(templatesBuffer[k].timeSeries,*timeSeriesPtr,distanceMatrices[k],warpPaths[k]);
        classLikelihoods[k] = classDistances[k];
        sum += classLikelihoods[k];
    }

    //See which gave the min distance
    // NOTE(review): assumes numTemplates >= 1 - classDistances[0] is read
    // unconditionally.
    UINT closestTemplateIndex = 0;
    bestDistance = classDistances[0];
    for(UINT k=1; k<numTemplates; k++){
        if( classDistances[k] < bestDistance ){
            bestDistance = classDistances[k];
            closestTemplateIndex = k;
        }
    }

    //Normalize the class likelihoods and check which class has the maximum likelihood
    // Each likelihood is the fraction of the total distance NOT contributed by
    // that template, so the closest template receives the largest likelihood.
    // NOTE(review): if sum == 0 (all distances exactly zero) this divides by
    // zero and the likelihoods become NaN.
    UINT maxLikelihoodIndex = 0;
    maxLikelihood = 0;
    for(UINT k=0; k<numTemplates; k++){
        classLikelihoods[k] = (sum-classLikelihoods[k])/sum;
        if( classLikelihoods[k] > maxLikelihood ){
            maxLikelihood = classLikelihoods[k];
            maxLikelihoodIndex = k;
        }
    }

    if( useNullRejection ){
        switch( rejectionMode ){
            case TEMPLATE_THRESHOLDS:
                //Accept only if the best distance is within the per-template threshold
                if( bestDistance <= nullRejectionThresholds[ closestTemplateIndex ] ) predictedClassLabel = templatesBuffer[ closestTemplateIndex ].classLabel;
                else predictedClassLabel = GRT_DEFAULT_NULL_CLASS_LABEL;
                break;
            case CLASS_LIKELIHOODS:
                //Accept only if the normalized likelihood clears the fixed 0.99 cutoff
                if( maxLikelihood >= 0.99 ) predictedClassLabel = templatesBuffer[ maxLikelihoodIndex ].classLabel;
                else predictedClassLabel = GRT_DEFAULT_NULL_CLASS_LABEL;
                break;
            case THRESHOLDS_AND_LIKELIHOODS:
                //Require both the distance threshold AND the likelihood cutoff
                if( bestDistance <= nullRejectionThresholds[ closestTemplateIndex ] && maxLikelihood >= 0.99 ) predictedClassLabel = templatesBuffer[ closestTemplateIndex ].classLabel;
                else predictedClassLabel = GRT_DEFAULT_NULL_CLASS_LABEL;
                break;
            default:
                errorLog << "predict(Matrix<double> &timeSeries) - Unknown RejectionMode!" << endl;
                return false;
                break;
        }
    }else predictedClassLabel = templatesBuffer[ closestTemplateIndex ].classLabel;

    return true;
}
bool DTW::train_NDDTW(LabelledTimeSeriesClassificationData &trainingData,DTWTemplate &dtwTemplate,UINT &bestIndex){

    // Runs an all-pairs DTW over the examples of a single class to find the
    // example whose average distance to every other example is smallest; that
    // example becomes the class template. Also estimates the training
    // mean/sigma (used later for null-rejection thresholds) and the average
    // template length.
    //
    // @param trainingData the examples for one class
    // @param dtwTemplate  output: trainingMu, trainingSigma and
    //                     averageTemplateLength are filled in
    // @param bestIndex    output: index of the best (most central) example
    // @return always true
    //
    // NOTE(review): assumes numExamples >= 2 - with 0 examples results[0] is
    // read out of bounds, and with 1 example the average divides by zero.

    UINT numExamples = trainingData.getNumSamples();
    VectorDouble results(numExamples,0.0);
    MatrixDouble distanceResults(numExamples,numExamples);
    dtwTemplate.averageTemplateLength = 0;

    for(UINT m=0; m<numExamples; m++){

        MatrixDouble templateA; //The m'th template
        MatrixDouble templateB; //The n'th template
        dtwTemplate.averageTemplateLength += trainingData[m].getLength();

        //Smooth the data if required
        if( useSmoothing ) smoothData(trainingData[m].getData(),smoothingFactor,templateA);
        else templateA = trainingData[m].getData();

        if( offsetUsingFirstSample ){
            offsetTimeseries(templateA);
        }

        for(UINT n=0; n<numExamples; n++){
            if(m!=n){
                //Smooth the data if required
                if( useSmoothing ) smoothData(trainingData[n].getData(),smoothingFactor,templateB);
                else templateB = trainingData[n].getData();

                if( offsetUsingFirstSample ){
                    offsetTimeseries(templateB);
                }

                //Compute the distance between the two time series
                MatrixDouble distanceMatrix(templateA.getNumRows(),templateB.getNumRows());
                vector< IndexDist > warpPath;
                double dist = computeDistance(templateA,templateB,distanceMatrix,warpPath);

                trainingLog << "Template: " << m << " Timeseries: " << n << " Dist: " << dist << endl;

                //Update the results values
                distanceResults[m][n] = dist;
                results[m] += dist;
            }else distanceResults[m][n] = 0; //The distance is zero because the two timeseries are the same
        }
    }

    // Convert each row total into an average distance over the other examples
    for(UINT m=0; m<numExamples; m++) results[m]/=(numExamples-1);

    //Find the best average result, this is the result with the minimum value
    bestIndex = 0;
    double bestAverage = results[0];
    for(UINT m=1; m<numExamples; m++){
        if( results[m] < bestAverage ){
            bestAverage = results[m];
            bestIndex = m;
        }
    }

    if( numExamples > 2 ){
        //Work out the threshold value for the best template
        // Mu is the best example's average distance; sigma is the standard
        // deviation of its distances to the other examples (N-2 denominator
        // because one example is the template itself)
        dtwTemplate.trainingMu = results[bestIndex];
        dtwTemplate.trainingSigma = 0.0;
        for(UINT n=0; n<numExamples; n++){
            if(n!=bestIndex){
                dtwTemplate.trainingSigma += SQR( distanceResults[ bestIndex ][n] - dtwTemplate.trainingMu );
            }
        }
        dtwTemplate.trainingSigma = sqrt( dtwTemplate.trainingSigma / double(numExamples-2) );
    }else{
        warningLog << "_train_NDDTW(LabelledTimeSeriesClassificationData &trainingData,DTWTemplate &dtwTemplate,UINT &bestIndex - There are not enough examples to compute the trainingMu and trainingSigma for the template for class " << dtwTemplate.classLabel << endl;
        dtwTemplate.trainingMu = 0.0;
        dtwTemplate.trainingSigma = 0.0;
    }

    //Set the average length of the training examples
    dtwTemplate.averageTemplateLength = (UINT) (dtwTemplate.averageTemplateLength/double(numExamples));

    trainingLog << "AverageTemplateLength: " << dtwTemplate.averageTemplateLength << endl;

    //Flag that the training was successfull
    return true;
}
////////////////////////// TRAINING FUNCTIONS //////////////////////////

bool DTW::train(LabelledTimeSeriesClassificationData labelledTrainingData){

    // Trains one DTW template per class: optionally trims the training data,
    // applies scaling/z-normalisation, then for each class runs train_NDDTW to
    // pick the most representative example as that class's template. Finally
    // recomputes null-rejection thresholds and sizes the realtime prediction
    // buffers.
    //
    // @param labelledTrainingData the training set (taken by value; may be
    //        replaced internally by a trimmed copy)
    // @return true on success, false if there are no samples, a class has no
    //         examples, or template training fails

    UINT bestIndex = 0;

    //Cleanup Memory
    templatesBuffer.clear();
    classLabels.clear();
    trained = false;
    continuousInputDataBuffer.clear();

    if( trimTrainingData ){
        LabelledTimeSeriesClassificationSampleTrimmer timeSeriesTrimmer(trimThreshold,maximumTrimPercentage);
        LabelledTimeSeriesClassificationData tempData;
        tempData.setNumDimensions( labelledTrainingData.getNumDimensions() );

        // Keep only the samples that can be trimmed successfully
        for(UINT i=0; i<labelledTrainingData.getNumSamples(); i++){
            if( timeSeriesTrimmer.trimTimeSeries( labelledTrainingData[i] ) ){
                tempData.addSample(labelledTrainingData[i].getClassLabel(), labelledTrainingData[i].getData());
            }else{
                trainingLog << "Removing training sample " << i << " from the dataset as it could not be trimmed!" << endl;
            }
        }
        //Overwrite the original training data with the trimmed dataset
        labelledTrainingData = tempData;
    }

    if( labelledTrainingData.getNumSamples() == 0 ){
        errorLog << "_train(LabelledTimeSeriesClassificationData &labelledTrainingData) - Can't train model as there are no samples in training data!" << endl;
        return false;
    }

    //Assign
    numClasses = labelledTrainingData.getNumClasses();
    numTemplates = labelledTrainingData.getNumClasses(); // one template per class
    numFeatures = labelledTrainingData.getNumDimensions();

    templatesBuffer.resize( numClasses );
    classLabels.resize( numClasses );
    nullRejectionThresholds.resize( numClasses );
    averageTemplateLength = 0;

    //Need to copy the labelled training data incase we need to scale it or znorm it
    LabelledTimeSeriesClassificationData trainingData( labelledTrainingData );

    //Perform any scaling or normalisation
    rangesBuffer = trainingData.getRanges();
    if( useScaling ) scaleData( trainingData );
    if( useZNormalisation ) znormData( trainingData );

    //For each class, run a one-to-one DTW and find the template the best describes the data
    for(UINT k=0; k<numTemplates; k++){
        //Get the class label for the cth class
        UINT classLabel = trainingData.getClassTracker()[k].classLabel;
        LabelledTimeSeriesClassificationData classData = trainingData.getClassData( classLabel );
        UINT numExamples = classData.getNumSamples();
        bestIndex = 0;

        //Set the class label of this template
        templatesBuffer[k].classLabel = classLabel;

        //Set the kth class label
        classLabels[k] = classLabel;

        trainingLog << "Training Template: " << k << " Class: " << classLabel << endl;

        //Check to make sure we actually have some training examples
        if(numExamples<1){
            errorLog << "_train(LabelledTimeSeriesClassificationData &labelledTrainingData) - Can not train model: Num of Example is < 1! Class: " << classLabel << endl;
            return false;
        }

        if(numExamples==1){//If we have just one training example then we have to use it as the template
            bestIndex = 0;
            nullRejectionThresholds[k] = 0.0;//TODO-We need a better way of calculating this!
            warningLog << "_train(LabelledTimeSeriesClassificationData &labelledTrainingData) - Can't compute reject thresholds for class " << classLabel << " as there is only 1 training example" << endl;
        }else{
            //Search for the best training example for this class
            if( !train_NDDTW(classData,templatesBuffer[k],bestIndex) ){
                errorLog << "_train(LabelledTimeSeriesClassificationData &labelledTrainingData) - Failed to train template for class with label: " << classLabel << endl;
                return false;
            }
        }

        //Add the template with the best index to the buffer
        int trainingMethod = 0;
        if(useSmoothing) trainingMethod = 1;

        switch (trainingMethod) {
            case(0)://Standard Training
                templatesBuffer[k].timeSeries = classData[bestIndex].getData();
                break;
            case(1)://Training using Smoothing
                //Smooth the data, reducing its size by a factor set by smoothFactor
                smoothData(classData[ bestIndex ].getData(),smoothingFactor,templatesBuffer[k].timeSeries);
                break;
            default:
                cout<<"Can not train model: Unknown training method \n";
                return false;
                break;
        }

        if( offsetUsingFirstSample ){
            offsetTimeseries( templatesBuffer[k].timeSeries );
        }

        //Add the average length of the training examples for this template to the overall averageTemplateLength
        averageTemplateLength += templatesBuffer[k].averageTemplateLength;
    }

    //Flag that the models have been trained
    trained = true;
    // NOTE(review): the (UINT) cast binds to averageTemplateLength only, not to
    // the whole division; the result is a double that is then truncated on
    // assignment - confirm this matches the intended rounding.
    averageTemplateLength = (UINT) averageTemplateLength/double(numTemplates);

    //Recompute the null rejection thresholds
    recomputeNullRejectionThresholds();

    //Resize the prediction results to make sure it is setup for realtime prediction
    continuousInputDataBuffer.clear();
    continuousInputDataBuffer.resize(averageTemplateLength,vector<double>(numFeatures,0));
    classLikelihoods.resize(numTemplates,DEFAULT_NULL_LIKELIHOOD_VALUE);
    classDistances.resize(numTemplates,0);
    predictedClassLabel = GRT_DEFAULT_NULL_CLASS_LABEL;
    maxLikelihood = DEFAULT_NULL_LIKELIHOOD_VALUE;

    //Training complete
    return true;
}
void ElutionPeakDetection::detectElutionPeaks_(MassTrace& mt, std::vector<MassTrace>& single_mtraces)
{
    // Detects elution peaks within a single mass trace: smooths the trace,
    // finds local extrema, and then either keeps the whole trace (exactly one
    // maximum) or splits it at the local minima into sub-traces. Traces that
    // pass the optional peak-width and signal-to-noise filters are appended to
    // single_mtraces (inside an OpenMP critical section when built with
    // OpenMP).
    //
    // @param mt             the mass trace to analyse; its smoothed
    //                       intensities and max-RT are updated in place
    // @param single_mtraces output collection of accepted (sub-)traces

    //smooth data
    //std::vector<double> smoothed_data;
    // Size win_size = mt.getFWHMScansNum();
    double scan_time(mt.getAverageMS1CycleTime());
    // Smoothing window: number of scans covering the expected chromatographic FWHM
    Size win_size = std::ceil(chrom_fwhm_ / scan_time);

    // add smoothed data (original data is still accessible)
    smoothData(mt, static_cast<Int>(win_size));

    // debug intensities
    // Size i = 0;
    // std::cout << "*****" << std::endl;
    // for (MassTrace::const_iterator mt_it = mt.begin(); mt_it != mt.end(); ++mt_it)
    // {
    //     std::cout << mt_it->getIntensity() << " " << smoothed_data[i] << std::endl;
    //     ++i;
    // }
    //std::cout << "*****" << std::endl;

    std::vector<Size> maxes, mins;
    findLocalExtrema(mt, win_size / 2, maxes, mins);

    // if only one maximum exists: finished!
    if (maxes.size() == 1)
    {
        bool pw_ok = true;
        bool snr_ok = true;

        // check mass trace filter criteria (if enabled)
        if (pw_filtering_ == "fixed")
        {
            double act_fwhm(mt.estimateFWHM(true));

            // std::cout << "act_fwhm: " << act_fwhm << " ";

            if (act_fwhm < min_fwhm_ || act_fwhm > max_fwhm_)
            {
                pw_ok = false;
            }

            // std::cout << pw_ok << std::endl;
        }

        if (mt_snr_filtering_)
        {
            if (computeApexSNR(mt) < chrom_peak_snr_)
            {
                snr_ok = false;
            }
        }

        if (pw_ok && snr_ok)
        {
            mt.updateSmoothedMaxRT();

            if (pw_filtering_ != "fixed")
            {
                mt.estimateFWHM(true);
            }

            // check for minimum/maximum trace length
            // double mt_length(std::fabs(mt.rbegin()->getRT() - mt.begin()->getRT()));

            // if ((mt_length >= min_trace_length_) && (mt_length <= max_trace_length_))
            // if (mt_quality >= 1.2)
            // {
#ifdef _OPENMP
#pragma omp critical (OPENMS_ElutionPeakDetection_mtraces)
#endif
            single_mtraces.push_back(mt);
        }
    }
    else if (maxes.empty())
    {
        // no maxima at all: nothing to report for this trace
        return;
    }
    else // split mt to sub-traces
    {
        MassTrace::const_iterator cp_it = mt.begin();
        Size last_idx(0);

        // add last data point as last minimum (to grep the last chunk of the MT)
        mins.push_back(mt.getSize() - 1);

        for (Size min_idx = 0; min_idx < mins.size(); ++min_idx)
        {
            // copy sub-trace between cp_it and split point
            std::vector<PeakType> tmp_mt;
            std::vector<double> smoothed_tmp;

            // advance through the parent trace until the current split point
            // (inclusive), collecting raw peaks and their smoothed intensities
            while (last_idx <= mins[min_idx])
            {
                tmp_mt.push_back(*cp_it);
                smoothed_tmp.push_back(mt.getSmoothedIntensities()[last_idx]);
                ++cp_it;
                ++last_idx;
            }

            // check if
            // if (tmp_mt.size() >= win_size / 2)
            // {
            MassTrace new_mt(tmp_mt);

            // copy smoothed int's
            new_mt.setSmoothedIntensities(smoothed_tmp);

            // check filter criteria
            bool pw_ok = true;
            bool snr_ok = true;

            // check mass trace filter criteria (if enabled)
            if (pw_filtering_ == "fixed")
            {
                double act_fwhm(new_mt.estimateFWHM(true));

                // std::cout << "act_fwhm: " << act_fwhm << " ";

                if (act_fwhm < min_fwhm_ || act_fwhm > max_fwhm_)
                {
                    pw_ok = false;
                }

                // std::cout << pw_ok << std::endl;
            }

            if (mt_snr_filtering_)
            {
                // NOTE(review): this computes the SNR of the PARENT trace (mt),
                // not the sub-trace (new_mt) - confirm whether new_mt was
                // intended here.
                if (computeApexSNR(mt) < chrom_peak_snr_)
                {
                    snr_ok = false;
                }
            }

            if (pw_ok && snr_ok)
            {
                // set label of sub-trace
                new_mt.setLabel(mt.getLabel() + "." + String(min_idx + 1));

                //new_mt.updateWeightedMeanRT();
                new_mt.updateSmoothedMaxRT();
                //new_mt.updateSmoothedWeightedMeanRT();
                new_mt.updateWeightedMeanMZ();
                new_mt.updateWeightedMZsd();

                if (pw_filtering_ != "fixed")
                {
                    new_mt.estimateFWHM(true);
                }

                // double mt_quality(computeApexSNR(new_mt));
                // double new_mt_length(std::fabs(new_mt.rbegin()->getRT() - new_mt.begin()->getRT()));

                // if ((new_mt_length >= min_trace_length_) && (new_mt_length <= max_trace_length_))
                //{
#ifdef _OPENMP
#pragma omp critical (OPENMS_ElutionPeakDetection_mtraces)
#endif
                single_mtraces.push_back(new_mt);
            }
            // }
        }
    }
    return;
}