int main(int argc, char* argv[]) { string barcodeFileName, insertFileName, outputFileName; if (argc != 4) { cout << "usage: makeBarcodeDatabase insert.fasta barcodes.fasta output.fasta" << endl; exit(1); } insertFileName = argv[1]; barcodeFileName = argv[2]; outputFileName = argv[3]; FASTAReader barcodeReader, insertReader; barcodeReader.Initialize(barcodeFileName); insertReader.Initialize(insertFileName); ofstream barcodedOut; CrucialOpen(outputFileName, barcodedOut, std::ios::out); vector<FASTASequence> forwardBarcodes, reverseBarcodes; FASTASequence barcodeSequence, reverseBarcodeSequence; while(barcodeReader.GetNext(barcodeSequence)) { forwardBarcodes.push_back(barcodeSequence); barcodeSequence.MakeRC(reverseBarcodeSequence); reverseBarcodes.push_back(reverseBarcodeSequence); } FASTASequence insert; insertReader.GetNext(insert); int i; for (i = 0; i < forwardBarcodes.size(); i++) { FASTASequence barcodedInsert; barcodedInsert.Resize(forwardBarcodes[i].length * 2 + insert.length); stringstream titleStrm; titleStrm << insert.title << "|ff|" << forwardBarcodes[i].title; barcodedInsert.CopyTitle(titleStrm.str()); memcpy(&barcodedInsert.seq[0], &forwardBarcodes[i].seq[0], forwardBarcodes[i].length); memcpy(&barcodedInsert.seq[forwardBarcodes[i].length], insert.seq, insert.length); memcpy(&barcodedInsert.seq[forwardBarcodes[i].length + insert.length], forwardBarcodes[i].seq, forwardBarcodes[i].length); barcodedInsert.PrintSeq(barcodedOut); titleStrm.str(""); titleStrm << insert.title << "|fr|" << forwardBarcodes[i].title; barcodedInsert.CopyTitle(titleStrm.str()); memcpy(&barcodedInsert.seq[0], &forwardBarcodes[i].seq[0], forwardBarcodes[i].length); memcpy(&barcodedInsert.seq[forwardBarcodes[i].length], insert.seq, insert.length); memcpy(&barcodedInsert.seq[forwardBarcodes[i].length + insert.length], reverseBarcodes[i].seq, reverseBarcodes[i].length); barcodedInsert.PrintSeq(barcodedOut); titleStrm.str(""); titleStrm << insert.title << "|rf|" << forwardBarcodes[i].title; barcodedInsert.CopyTitle(titleStrm.str()); memcpy(&barcodedInsert.seq[0], &reverseBarcodes[i].seq[0], reverseBarcodes[i].length); memcpy(&barcodedInsert.seq[reverseBarcodes[i].length], insert.seq, insert.length); memcpy(&barcodedInsert.seq[reverseBarcodes[i].length + insert.length], forwardBarcodes[i].seq, forwardBarcodes[i].length); barcodedInsert.PrintSeq(barcodedOut); titleStrm.str(""); titleStrm << insert.title << "|rr|" << forwardBarcodes[i].title; barcodedInsert.CopyTitle(titleStrm.str()); memcpy(&barcodedInsert.seq[0], &reverseBarcodes[i].seq[0], reverseBarcodes[i].length); memcpy(&barcodedInsert.seq[reverseBarcodes[i].length], insert.seq, insert.length); memcpy(&barcodedInsert.seq[reverseBarcodes[i].length + insert.length], reverseBarcodes[i].seq, reverseBarcodes[i].length); barcodedInsert.PrintSeq(barcodedOut); } }
GenomeLength FASTAReader::ReadAllSequencesIntoOne(FASTASequence &seq, SequenceIndexDatabase<FASTASequence> *seqDBPtr) { seq.Free(); GenomeLength p = curPos; AdvanceToTitleStart(p); CheckValidTitleStart(p); ReadTitle(p, seq); if (seq.title == NULL) { cout << "ERROR, sequence must have a nonempty title." << endl; exit(1); } if (seqDBPtr != NULL) { seqDBPtr->growableName.push_back(seq.title); } GenomeLength seqLength; seqLength = fileSize - p; GenomeLength memorySize = seqLength+padding+1; if (memorySize > UINT_MAX) { cout << "ERROR! Reading fasta files greater than 4Gbytes is not supported." << endl; exit(1); } seq.Resize(memorySize); GenomeLength i; i = 0L; for (; p < fileSize; p++, i++ ) { seq.seq[i] = filePtr[p]; } i = p = 0; while (p < seqLength) { // // If this is the beginning of another read, add an 'N' // to delineate spaces between reads. // while (p < seqLength and (seq.seq[p] == ' ' or seq.seq[p] == '\n' or seq.seq[p] == '\t' or seq.seq[p] == '\r')) { p++; } if (p < seqLength and seq.seq[p] == '>') { seq.seq[i] = 'N'; GenomeLength titleStartPos = p+1; i++; while (p < seqLength and seq.seq[p] != '\n') p++; if (seqDBPtr != NULL and p < seqLength) { string title; GenomeLength tp; for (tp = titleStartPos; tp < p; tp++) { title.push_back(seq.seq[tp]); } seqDBPtr->growableName.push_back(title); seqDBPtr->growableSeqStartPos.push_back(i); int nSeq = seqDBPtr->growableSeqStartPos.size(); if (nSeq > 1 and computeMD5) { string md5Str; MakeMD5((const char*) &seq.seq[seqDBPtr->growableSeqStartPos[nSeq-2]], seqDBPtr->growableSeqStartPos[nSeq-1] - seqDBPtr->growableSeqStartPos[nSeq-2] - 1, md5Str); seqDBPtr->md5.push_back(md5Str); } } } else if (p < seqLength) { // Otherwise, p may be at whitespace // advance past that as well. seq.seq[i] = convMat[seq.seq[p]]; i++; p++; } } if (i > UINT_MAX) { cout << "ERROR! Sequences greater than 4Gbase are not supported." << endl; exit(1); } // // Append an 'N' at the end of the last sequence for consistency // between different orderings of reference input. // seq.seq[i] = 'N'; i++; seq.length = i; // fill padding. for (; i < memorySize; i++ ){ seq.seq[i] = 0; } seq.deleteOnExit = true; if (seqDBPtr != NULL) { seqDBPtr->growableSeqStartPos.push_back(seq.length); int nSeq = seqDBPtr->growableSeqStartPos.size(); if (nSeq > 1 and computeMD5) { string md5Str; MakeMD5((const char*) &seq.seq[seqDBPtr->growableSeqStartPos[nSeq-2]], seqDBPtr->growableSeqStartPos[nSeq-1] - seqDBPtr->growableSeqStartPos[nSeq-2] - 1, md5Str); seqDBPtr->md5.push_back(md5Str); } seqDBPtr->Finalize(); } return seq.length; }