Example #1
0
int main(int argc, char* argv[]) {
  string barcodeFileName, insertFileName, outputFileName;
  if (argc != 4) {
    cout << "usage: makeBarcodeDatabase insert.fasta barcodes.fasta output.fasta" << endl;
    exit(1);
  }
  insertFileName = argv[1];
  barcodeFileName = argv[2];
  outputFileName  = argv[3];

  FASTAReader barcodeReader, insertReader;
  barcodeReader.Initialize(barcodeFileName);
  insertReader.Initialize(insertFileName);
  
  ofstream barcodedOut;
  CrucialOpen(outputFileName, barcodedOut, std::ios::out);

  vector<FASTASequence> forwardBarcodes, reverseBarcodes;
  FASTASequence barcodeSequence, reverseBarcodeSequence;
  while(barcodeReader.GetNext(barcodeSequence)) {
    forwardBarcodes.push_back(barcodeSequence);
    barcodeSequence.MakeRC(reverseBarcodeSequence);
    reverseBarcodes.push_back(reverseBarcodeSequence);
  }
  
  FASTASequence insert;
  insertReader.GetNext(insert);
  
  int i;
  for (i = 0; i < forwardBarcodes.size(); i++) {
    FASTASequence barcodedInsert;
    barcodedInsert.Resize(forwardBarcodes[i].length * 2 + insert.length);
    stringstream titleStrm;
    titleStrm << insert.title << "|ff|" << forwardBarcodes[i].title;
    barcodedInsert.CopyTitle(titleStrm.str());
    memcpy(&barcodedInsert.seq[0], &forwardBarcodes[i].seq[0], forwardBarcodes[i].length);
    memcpy(&barcodedInsert.seq[forwardBarcodes[i].length], insert.seq, insert.length);
    memcpy(&barcodedInsert.seq[forwardBarcodes[i].length + insert.length], forwardBarcodes[i].seq, forwardBarcodes[i].length);
    barcodedInsert.PrintSeq(barcodedOut);

    titleStrm.str("");
    titleStrm << insert.title << "|fr|" << forwardBarcodes[i].title;
    barcodedInsert.CopyTitle(titleStrm.str());
    memcpy(&barcodedInsert.seq[0], &forwardBarcodes[i].seq[0], forwardBarcodes[i].length);
    memcpy(&barcodedInsert.seq[forwardBarcodes[i].length], insert.seq, insert.length);
    memcpy(&barcodedInsert.seq[forwardBarcodes[i].length + insert.length], reverseBarcodes[i].seq, reverseBarcodes[i].length);
    barcodedInsert.PrintSeq(barcodedOut);


    titleStrm.str("");
    titleStrm << insert.title << "|rf|" << forwardBarcodes[i].title;
    barcodedInsert.CopyTitle(titleStrm.str());
    memcpy(&barcodedInsert.seq[0], &reverseBarcodes[i].seq[0], reverseBarcodes[i].length);
    memcpy(&barcodedInsert.seq[reverseBarcodes[i].length], insert.seq, insert.length);
    memcpy(&barcodedInsert.seq[reverseBarcodes[i].length + insert.length], forwardBarcodes[i].seq, forwardBarcodes[i].length);
    barcodedInsert.PrintSeq(barcodedOut);


    titleStrm.str("");
    titleStrm << insert.title << "|rr|" << forwardBarcodes[i].title;
    barcodedInsert.CopyTitle(titleStrm.str());
    memcpy(&barcodedInsert.seq[0], &reverseBarcodes[i].seq[0], reverseBarcodes[i].length);
    memcpy(&barcodedInsert.seq[reverseBarcodes[i].length], insert.seq, insert.length);
    memcpy(&barcodedInsert.seq[reverseBarcodes[i].length + insert.length], reverseBarcodes[i].seq, reverseBarcodes[i].length);
    barcodedInsert.PrintSeq(barcodedOut);
  }
}
Example #2
0
//
// Read the rest of the FASTA file, starting at the reader's current
// position, into the single sequence 'seq'.
//
// The first record's title becomes seq.title.  All following bytes are
// copied into seq.seq and then compacted in place: whitespace is dropped,
// every later title line ('>' up to the next newline) is replaced by a
// single 'N', and every other character is mapped through convMat.  One
// more 'N' is appended after the last base, and the remainder of the
// buffer is zero filled.
//
// When seqDBPtr is not NULL, each title and the start offset recorded for
// each later record are appended to it (plus an MD5 string per sequence
// when computeMD5 is set), and Finalize() is called on it before returning.
//
// Returns seq.length.  Exits with status 1 if the first title is missing
// or one of the UINT_MAX size checks fails.
//
GenomeLength FASTAReader::ReadAllSequencesIntoOne(FASTASequence &seq, SequenceIndexDatabase<FASTASequence> *seqDBPtr) {
    seq.Free();

    // Find and read the title of the first record.
    GenomeLength filePos = curPos;
    AdvanceToTitleStart(filePos);
    CheckValidTitleStart(filePos);
    ReadTitle(filePos, seq);

    if (seq.title == NULL) {
        cout << "ERROR, sequence must have a nonempty title." << endl;
        exit(1);
    }
    if (seqDBPtr != NULL) {
        seqDBPtr->growableName.push_back(seq.title);
    }

    // Number of raw bytes left in the file, and the buffer needed to hold
    // them plus the padding and one extra slot.
    GenomeLength seqLength = fileSize - filePos;
    GenomeLength memorySize = seqLength + padding + 1;

    if (memorySize > UINT_MAX) {
        cout << "ERROR! Reading fasta files greater than 4Gbytes is not supported." << endl;
        exit(1);
    }
    seq.Resize(memorySize);

    // Stage 1: copy the raw remainder of the file into the buffer.
    GenomeLength rawPos = 0;
    while (filePos < fileSize) {
        seq.seq[rawPos] = filePtr[filePos];
        rawPos++;
        filePos++;
    }

    // Stage 2: compact the buffer in place.  'src' scans the raw bytes and
    // 'dst' is where the next kept base is written.
    GenomeLength src = 0;
    GenomeLength dst = 0;
    while (src < seqLength) {
        // Step over any run of whitespace.
        for (; src < seqLength; src++) {
            if (seq.seq[src] != ' ' &&
                seq.seq[src] != '\n' &&
                seq.seq[src] != '\t' &&
                seq.seq[src] != '\r') {
                break;
            }
        }
        if (src >= seqLength) {
            break;
        }

        if (seq.seq[src] != '>') {
            // Ordinary sequence character: convert it and keep it.
            seq.seq[dst] = convMat[seq.seq[src]];
            dst++;
            src++;
            continue;
        }

        //
        // Start of another record: emit a single 'N' so adjacent
        // sequences stay separated, then skip to the end of its title line.
        //
        seq.seq[dst] = 'N';
        dst++;
        GenomeLength titleStartPos = src + 1;
        while (src < seqLength && seq.seq[src] != '\n') {
            src++;
        }

        // The record is only registered when a database was supplied and
        // the title line was terminated by a newline.
        if (seqDBPtr == NULL || src >= seqLength) {
            continue;
        }

        string title;
        for (GenomeLength titlePos = titleStartPos; titlePos < src; titlePos++) {
            title.push_back(seq.seq[titlePos]);
        }
        seqDBPtr->growableName.push_back(title);
        seqDBPtr->growableSeqStartPos.push_back(dst);

        // The sequence that just ended lies between the last two recorded
        // start positions, minus the separating 'N'.
        int startCount = seqDBPtr->growableSeqStartPos.size();
        if (startCount > 1 && computeMD5) {
            string digest;
            MakeMD5((const char*) &seq.seq[seqDBPtr->growableSeqStartPos[startCount-2]],
                    seqDBPtr->growableSeqStartPos[startCount-1] - seqDBPtr->growableSeqStartPos[startCount-2] - 1,
                    digest);
            seqDBPtr->md5.push_back(digest);
        }
    }

    if (dst > UINT_MAX) {
        cout << "ERROR! Sequences greater than 4Gbase are not supported." << endl;
        exit(1);
    }

    //
    // Terminate the final sequence with an 'N' as well, so the result is
    // consistent between different orderings of the reference input.
    //
    seq.seq[dst] = 'N';
    dst++;
    seq.length = dst;

    // Zero the rest of the buffer (the padding).
    for (GenomeLength padPos = dst; padPos < memorySize; padPos++) {
        seq.seq[padPos] = 0;
    }
    seq.deleteOnExit = true;

    if (seqDBPtr != NULL) {
        // Record the end of the last sequence and, if requested, its MD5.
        seqDBPtr->growableSeqStartPos.push_back(seq.length);
        int startCount = seqDBPtr->growableSeqStartPos.size();
        if (startCount > 1 && computeMD5) {
            string digest;
            MakeMD5((const char*) &seq.seq[seqDBPtr->growableSeqStartPos[startCount-2]],
                    seqDBPtr->growableSeqStartPos[startCount-1] - seqDBPtr->growableSeqStartPos[startCount-2] - 1,
                    digest);
            seqDBPtr->md5.push_back(digest);
        }
        seqDBPtr->Finalize();
    }
    return seq.length;
}