// converts a space separated quality string into a compressed quality string // NOTE: this function has horrible amounts of overhead, but lean and mean code that I had before // failed some of the unit tests. void CRegexUtilities::ConvertQualities(string& qualities, CMosaikString& compQualities) { string::iterator strIte = qualities.end() - 1; while ( *strIte == ' ' ) { qualities.erase( strIte ); strIte--; } vector<string> columns; vector<string>::const_iterator sIter; char* pQualities = (char*)qualities.c_str(); Chomp(pQualities); back_insert_iterator<vector<string> > backiter(columns); SplitString(backiter, " ", pQualities); const unsigned int numQualities = (unsigned int)columns.size(); compQualities.Reserve(numQualities); compQualities.SetLength(numQualities); unsigned char* pCompQualities = (unsigned char*)compQualities.Data(); for(sIter = columns.begin(); sIter != columns.end(); ++sIter, ++pCompQualities) { if(sIter->empty()) continue; *pCompQualities = GetUnsignedChar((char*)sIter->c_str()); } }
// extracts the genome assembly ID from a FASTA/FASTQ header void CRegexUtilities::ExtractGenomeAssemblyID(const string& line, CMosaikString& genomeAssemblyID) { #ifdef WIN32 cmatch results; if(!regex_search(line.c_str(), results, mGenomeAssemblyIDRegex)) { genomeAssemblyID.SetLength(0); return; } genomeAssemblyID = results[1].str().c_str(); #else // TODO: replace this with the TR1 regex above when it finally works in gcc. It doesn't work in gcc 4.3.3 // find the GA tag const string gaTag = "GA("; string::size_type gaPos = line.find(gaTag.c_str()); if(gaPos == string::npos) { genomeAssemblyID.SetLength(0); return; } // find the matching end parenthesis const unsigned int start = gaPos + gaTag.size(); unsigned int stop = start; const char* pBuffer = line.data(); unsigned int lineLen = line.size(); if(stop < lineLen) { while(pBuffer[stop] != ')') { stop++; if(stop == lineLen) break; } } if(start == stop) { cout << "ERROR: could not parse genome assembly ID from FASTA header." << endl; cout << " " << line << endl; exit(1); } genomeAssemblyID = line.substr(start, stop - start).c_str(); #endif }