Пример #1
0
// Converts a space separated quality string (e.g. "10 20 30") into a compressed
// quality string that holds one byte per column.
//
// qualities     - in/out: trailing spaces are removed and Chomp() is applied to its
//                 buffer, so the caller's string is modified.
// compQualities - out: resized to the number of columns returned by SplitString();
//                 byte i receives GetUnsignedChar() of column i.
//
// An empty or all-space input produces a zero-length result.
//
// NOTE: this function has horrible amounts of overhead, but the lean and mean code
//       that preceded it failed some of the unit tests.
void CRegexUtilities::ConvertQualities(string& qualities, CMosaikString& compQualities) {

	// strip trailing spaces. Searching for the last non-space character is safe on
	// empty and all-space strings, where stepping an iterator back from end() is not.
	const string::size_type lastNonSpace = qualities.find_last_not_of(' ');

	if(lastNonSpace == string::npos) {
		// nothing but spaces (or nothing at all): there are no qualities to convert
		qualities.clear();
		compQualities.SetLength(0);
		return;
	}

	qualities.erase(lastNonSpace + 1);

	// Chomp() is handed a mutable pointer, so give it the string's own writable
	// storage instead of casting away the constness of c_str(). The string is
	// known to be non-empty at this point.
	char* pQualities = &qualities[0];
	Chomp(pQualities);

	// split the buffer into one column per quality value
	vector<string> columns;
	back_insert_iterator<vector<string> > backiter(columns);
	SplitString(backiter, " ", pQualities);
	const unsigned int numQualities = (unsigned int)columns.size();

	compQualities.Reserve(numQualities);
	compQualities.SetLength(numQualities);

	unsigned char* pCompQualities = (unsigned char*)compQualities.Data();

	// column i maps to output byte i
	vector<string>::const_iterator colIter;
	for(colIter = columns.begin(); colIter != columns.end(); ++colIter, ++pCompQualities) {
		if(colIter->empty()) {
			// an empty column still occupies a slot; store an explicit zero rather
			// than leaving whatever the buffer previously contained
			*pCompQualities = 0;
		} else {
			*pCompQualities = GetUnsignedChar((char*)colIter->c_str());
		}
	}
}
Пример #2
0
// Extracts the genome assembly ID from a FASTA/FASTQ header.
//
// line             - the header line to search.
// genomeAssemblyID - out: receives the extracted ID, or is set to length 0 when the
//                    header carries no genome assembly tag.
//
// On WIN32 the ID is the first capture group of mGenomeAssemblyIDRegex. Elsewhere it
// is the text between "GA(" and the next ')'; if no ')' follows, the remainder of the
// line is used. In the non-WIN32 path an empty ID (nothing between the tag and the
// ')' or the end of the line) is treated as fatal: an error is printed to stdout and
// the process exits with status 1.
void CRegexUtilities::ExtractGenomeAssemblyID(const string& line, CMosaikString& genomeAssemblyID) {
#ifdef WIN32

	cmatch results;
	if(!regex_search(line.c_str(), results, mGenomeAssemblyIDRegex)) {
		genomeAssemblyID.SetLength(0);
		return;
	}
	genomeAssemblyID = results[1].str().c_str();

#else

	// TODO: replace this with the TR1 regex above when it finally works in gcc. It doesn't work in gcc 4.3.3

	// find the GA tag
	const string gaTag = "GA(";
	const string::size_type gaPos = line.find(gaTag);

	if(gaPos == string::npos) {
		genomeAssemblyID.SetLength(0);
		return;
	}

	// find the matching end parenthesis. Positions stay in string::size_type so
	// they are never truncated; a missing ')' means the ID runs to the end of the line.
	const string::size_type start = gaPos + gaTag.size();
	string::size_type stop = line.find(')', start);
	if(stop == string::npos) stop = line.size();

	// nothing between the tag and the ')' (or the end of the line)
	if(start == stop) {
		cout << "ERROR: could not parse genome assembly ID from FASTA header." << endl;
		cout << "       " << line << endl;
		exit(1);
	}

	genomeAssemblyID = line.substr(start, stop - start).c_str();

#endif
}