Exemple #1
0
int main(int argc, char* argv[])
{
    std::string seqInName, seqOutName, dotOutName;
    if (argc < 4) {
        std::cout << "usage: exciseRepeats inName repMaskOutFile outName" << std::endl;
        std::exit(EXIT_FAILURE);
    }

    seqInName = argv[1];
    dotOutName = argv[2];
    seqOutName = argv[3];
    FASTAReader reader;
    reader.Initialize(seqInName);
    FASTASequence origSeq;
    reader.GetNext(origSeq);

    std::ifstream dotOutFile;
    CrucialOpen(dotOutName, dotOutFile);
    std::ofstream seqOutFile;
    std::ofstream seqOut;
    CrucialOpen(seqOutName, seqOut, std::ios::out);
    std::string dotOutLine;
    getline(dotOutFile, dotOutLine);
    getline(dotOutFile, dotOutLine);
    getline(dotOutFile, dotOutLine);
    while (getline(dotOutFile, dotOutLine)) {
        std::stringstream lineStrm(dotOutLine);
        int swScore;
        float pctDiv, pctDel, pctIns;
        std::string query;
        DNALength qPosBegin, qPosEnd;
        std::string left;
        char strand;
        std::string matchingRepeat;
        std::string repClass;
        std::string repPos, repEnd, repLeft;
        int id;
        lineStrm >> swScore >> pctDiv >> pctDel >> pctIns >> query >> qPosBegin >> qPosEnd >>
            left >> strand >> matchingRepeat >> repClass >> repPos >> repEnd >> repLeft >> id;
        for (DNALength seqPos = qPosBegin; seqPos < qPosEnd; seqPos++) {
            origSeq.seq[seqPos] = 'X';
        }
    }

    DNALength seqPos, unexPos;
    unexPos = 0;
    for (seqPos = 0; seqPos < origSeq.length; seqPos++) {
        if (origSeq.seq[seqPos] != 'X') {
            origSeq.seq[unexPos] = origSeq.seq[seqPos];
            unexPos++;
        }
    }
    origSeq.length = unexPos;

    origSeq.PrintSeq(seqOut);
    return 0;
}
int main(int argc, char* argv[]) {

  string alignmentFileName, geneDBFileName;

  if (argc < 3) {
    cout << "usage: printDuplicates alignemntFile genedb " << endl;
    exit(1);
  }

  alignmentFileName = argv[1];
  geneDBFileName    = argv[2];

  GeneDB genedb;
  genedb.Read(geneDBFileName);
  genedb.IndexChromosomes();

  ifstream alignmentIn;
  CrucialOpen(alignmentFileName, alignmentIn, std::ios::in);
  
  while(alignmentIn) {
    string line;
    getline(alignmentIn, line);
    if (line == "###") {
      // found the end of an entry
      

    }
    else {
      string chrName, genome, source;
      int start, end, identity;
      char a, strand, b;
      string annotationString;
      stringstream strm(line);
      strm >> chrName >> genome >> source >> start >> end >> a >> strand >> b >> annotationString;
      vector<string> annotations;
      if (source != "exon") {
        continue;
      }
      ParseSeparatedList(annotationString, annotations, ';');

      GeneDBChromosome *chromosomePtr;
      chromosomePtr = genedb.Find(chrName);
      if (chromosomePtr == NULL) {
        cout << "chromosome " << chromosomePtr << " not found in database." << endl;
        continue;
      }
      int exonIndex;
      if ( chromosomePtr->LookupIndexByStart(start, exonIndex) ) {
        if (chromosomePtr->exons[exonIndex].start <= start and 
            chromosomePtr->exons[exonIndex].end   >= end) {
          chromosomePtr->exons[exonIndex].Print(cout);
        }
      }
      else {
        
      }
    }
  }
}
bool SAMReader<T_ReferenceSequence, T_ReadGroup, T_SAMAlignment>::Initialize(std::string samFileName) {
  // The literal name "stdin" selects standard input; any other name is
  // opened as a file through CrucialOpen. Either way samFilePtr ends up
  // pointing at the stream to read from, and the call reports success.
  if (samFileName == "stdin") {
    samFilePtr = &std::cin;
    return true;
  }

  CrucialOpen(samFileName, samFile, std::ios::in);
  samFilePtr = &samFile;
  return true;
}
// Appends every non-empty line of the file `fofnFileName` to `fofnList`.
// Existing elements of `fofnList` are kept; empty lines are skipped.
void FileOfFileNames::FOFNToList(std::string &fofnFileName, std::vector<std::string> &fofnList)
{
    std::ifstream fofnIn;
    CrucialOpen(fofnFileName, fofnIn);

    std::string entry;
    while (std::getline(fofnIn, entry)) {
        if (entry.empty()) {
            continue;
        }
        fofnList.push_back(entry);
    }
}
int main(int argc, char *argv[]) {
	string sequencesInName, sequencesOutName;
	if (argc <3){ 
		cout << "usage: scramble in out" << endl;
		exit(1);
	}
	sequencesInName = argv[1];
	sequencesOutName= argv[2];
	vector<FASTASequence*> sequences;
	vector<int> sequenceIndices;

	FASTAReader reader;
	reader.Init(sequencesInName);
	ofstream out;
	CrucialOpen(sequencesOutName, out, std::ios::out);
	

	FASTASequence read;
	FASTASequence*readPtr;
	while(reader.GetNext(read)) {
		readPtr = new FASTASequence;
		*readPtr = read;
		sequences.push_back(readPtr);
	}

	int i;
	for (i = 0; i < sequences.size(); i++) {
		sequenceIndices.push_back(i);
	}

	for (i = 0; i < 10*sequences.size(); i++ ){
		//
		// shuffle indices.
		//
		int idx1;
		int idx2;
		idx1 = RandomInt(sequences.size());
		idx2 = RandomInt(sequences.size());
		int tmp;
		tmp  = sequenceIndices[idx1];
		sequenceIndices[idx1] = sequenceIndices[idx2];
		sequenceIndices[idx2] = tmp;
	}
	
	for (i = 0; i < sequenceIndices.size(); i++ ){
		sequences[sequenceIndices[i]]->PrintSeq(out);
	}
	return 0;
}
// Serializes this compressed sequence to the binary file `outFileName`.
//
// Layout, in order: hasTitle, hasIndex, [titleLength, title bytes] when
// hasTitle is set, length, the `length` sequence bytes, and finally the
// index (written by index.Write) when hasIndex is set.
void CompressedSequence<T_Sequence>::Write(std::string outFileName) {
    std::ofstream out;
    // Open for output. Requesting std::ios::in on an output file stream
    // yields a read/write open, which fails when the file does not exist yet
    // and does not truncate an existing one.
    CrucialOpen(outFileName, out, std::ios::binary | std::ios::out);
    out.write((char*) &hasTitle, sizeof(int));
    out.write((char*) &hasIndex, sizeof(int));
    if (hasTitle) {
        out.write((char*)&titleLength, sizeof(int));
        out.write((char*)title, titleLength);
    }
    out.write((char*) &length, sizeof(int));
    out.write((char*) seq, sizeof(char) * length);
    if (hasIndex) {
        index.Write(out);
    }
    out.close();
}
Exemple #7
0
	// Loads this object from the binary file `inName`.
	//
	// The fields are read in a fixed order: the BWT sequence, the charCount
	// table (CharCountSize entries), firstCharPos, the useDebugData flag,
	// the optional suffix-array copy, then the occ and pos members. After
	// loading, occ is initialized against the BWT sequence.
	//
	// Always returns 1; the success of the individual reads is not checked.
	int Read(std::string inName) {
		std::ifstream bwtIn;
		CrucialOpen(inName, bwtIn, std::ios::binary|std::ios::in);
		bwtSequence.Read(bwtIn);
		bwtIn.read((char*)charCount, sizeof(DNALength)*CharCountSize);
		bwtIn.read((char*)&firstCharPos, sizeof(DNALength));
		bwtIn.read((char*)&useDebugData, sizeof(useDebugData));
		if (useDebugData) {
			// The debug copy holds one entry fewer than the BWT sequence.
			// NOTE(review): assumes bwtSequence.length >= 2 here; a shorter
			// sequence would make &saCopy[0] index an empty vector -- confirm.
			saCopy.resize(bwtSequence.length-1);
			bwtIn.read((char*)&saCopy[0], (bwtSequence.length-1) * sizeof(DNALength));
		}
		occ.Read(bwtIn, useDebugData);
		pos.Read(bwtIn);
		occ.InitializeBWT(bwtSequence);
		return 1;
	}
Exemple #8
0
// Reads every record of the GFF file `gffFileName` and appends one GFFEntry
// per record to `entries`.
//
// A record is a whitespace separated line of the form
//   name source type start end score strand frame attributes
// e.g. a sample record in adapterGffFile:
//   ref000001   .   adapter 10955   10999   0.00    +   .   xxxx
//
// Blank lines are skipped and no entry is produced for the end of file.
// Fields missing from a short record keep the defaults set below instead of
// indeterminate values.
void GFFFile::ReadAll(std::string & gffFileName) {
  std::fstream gffIn;
  CrucialOpen(gffFileName, gffIn, std::ios::in);
  std::string line;
  while (getline(gffIn, line)) {
    std::stringstream linestrm(line);
    std::string name, source, type;
    UInt start = 0, end = 0;
    char strand = '.';
    float score = 0;
    std::string frame, attributes;

    // A line without even a name column carries no record.
    if (!(linestrm >> name)) {
      continue;
    }
    linestrm >> source >> type
             >> start >> end >> score
             >> strand >> frame >> attributes;
    entries.push_back(GFFEntry(
      name, source, type, start, end,
      score, strand, frame, attributes));
  }
  gffIn.close();
}
// Loads a compressed sequence from the binary file `inFileName`, replacing
// whatever this object held before.
//
// Layout, in order: hasTitle, hasIndex, [title length, title bytes] when
// hasTitle is set, length, the `length` sequence elements, and finally the
// index (read by index.Read) when hasIndex is set. The sequence buffer is
// allocated here, so deleteOnExit is set.
void CompressedSequence<T_Sequence>::Read(std::string inFileName) {
    Free(); //Free before reusing this object.
    std::ifstream in;
    CrucialOpen(inFileName, in, std::ios::binary | std::ios::in);
    // step 1, read in the options.
    in.read((char*) &hasTitle, sizeof(int));
    in.read((char*) &hasIndex, sizeof(int));
    if (hasTitle) {
        int inTitleLength = 0;
        in.read((char*) &inTitleLength, sizeof(int));
        char * inTitle = ProtectedNew<char>(inTitleLength+1);
        in.read((char*) inTitle, inTitleLength);
        // Terminate right after the bytes just read. The member titleLength
        // does not describe this temporary buffer.
        inTitle[inTitleLength] = '\0';
        CopyTitle(inTitle, inTitleLength);
        delete [] inTitle;
    }
    // step 2, read in the sequence itself.
    in.read((char*) &length, sizeof(DNALength));
    seq = ProtectedNew<Nucleotide>(length);
    in.read((char*) seq, length * sizeof(Nucleotide));
    if (hasIndex) {
        index.Read(in);
    }
    deleteOnExit = true;
}
Exemple #10
0
//
// testseqdb: load a sequence index database and query it.
//
// Optionally prints one line per stored sequence (-p), then looks up the
// position given with -i and, when the lookup result is non-negative,
// prints the matching database slot.
//
int main(int argc, char* argv[]) {
	string indexDBName;
	bool   printIndex = false;
	// NOTE(review): left unset unless the parser fills it in; presumably the
	// trailing `true` passed for "i" marks the option as mandatory -- confirm.
	int    searchIndex;

	//
	// Describe the command line.
	//
	CommandLineParser clp;
	clp.SetProgramName("testseqdb");
	clp.SetProgramSummary("test the sequence db.\n");
	clp.RegisterStringOption("indexdb", &indexDBName, "The index to test.");
	clp.RegisterPreviousFlagsAsHidden();
	clp.RegisterFlagOption("p", &printIndex, "Print the start position of each read.");
	clp.RegisterIntOption("i", &searchIndex, "The index to search for", CommandLineParser::NonNegativeInteger, true);

	clp.ParseCommandLine(argc, argv);

	//
	// Load the database.
	//
	ifstream in;
	CrucialOpen(indexDBName, in, std::ios::in | std::ios::binary);
	SequenceIndexDatabase<FASTASequence> seqDB;
	seqDB.ReadDatabase(in);

	if (printIndex) {
		// Slot n of names is paired with slot n + 1 of seqStartPos.
		for (int slot = 0; slot < seqDB.nSeqPos - 1; slot++) {
			cout << slot << " " << seqDB.seqStartPos[slot+1] << " " << seqDB.names[slot] << endl;
		}
	}

	const int dbPos = seqDB.SearchForIndex(searchIndex);
	if (dbPos < 0) {
		return 0;
	}
	// NOTE(review): names[dbPos-1] is out of range when dbPos is 0 -- verify
	// the value range returned by SearchForIndex.
	cout << "searchIndex: " << searchIndex << " " << dbPos << " " << seqDB.seqStartPos[dbPos] << " " << seqDB.names[dbPos-1] << endl;
	return 0;
}
/**
 * Collects output samples, keyed by reference sequence context, from the
 * alignments of a cmp.h5 file and writes them to a binary sample file.
 *
 * Positional arguments: argv[1] is the cmp.h5 input, argv[2] the output.
 * Options (after the positional arguments):
 *   -contextLength N   length of the reference context used as key
 *   -minSamples N      parsed, see note at the declaration
 *   -maxSamples N      parsed, see note at the declaration
 *   -onlyMaxLength     record one subread length per movie/hole instead of
 *                      one per alignment
 *
 * Exits with EXIT_FAILURE on a bad command line or when the cmp.h5 file
 * cannot be opened; returns 0 otherwise.
 */
int main(int argc, char* argv[])
{
    std::string outFileName;
    unsigned contextLength = 5;
    // NOTE(review): minSamples and maxSamples are accepted on the command
    // line but are not consulted anywhere else in this function.
    int minSamples = 500;
    int maxSamples = 1000;
    if (argc < 3) {
        PrintUsage();
        std::exit(EXIT_FAILURE);
    }

    int argi = 1;
    std::string cmpH5FileName;
    cmpH5FileName = argv[argi++];
    outFileName = argv[argi++];
    bool onlyMaxLength = false;

    while (argi < argc) {
        const char* option = argv[argi];

        // Options that consume the following argument must actually have
        // one; otherwise argv[argc] (a null pointer) would reach atoi.
        const bool takesValue = strcmp(option, "-contextLength") == 0 or
                                strcmp(option, "-minSamples") == 0 or
                                strcmp(option, "-maxSamples") == 0;
        if (takesValue and argi + 1 >= argc) {
            PrintUsage();
            std::cout << "ERROR, option " << option << " requires a value." << std::endl;
            std::exit(EXIT_FAILURE);
        }

        if (strcmp(option, "-contextLength") == 0) {
            contextLength = atoi(argv[++argi]);
        } else if (strcmp(option, "-minSamples") == 0) {
            minSamples = atoi(argv[++argi]);
        } else if (strcmp(option, "-maxSamples") == 0) {
            maxSamples = atoi(argv[++argi]);
        } else if (strcmp(option, "-onlyMaxLength") == 0) {
            onlyMaxLength = true;
        } else {
            PrintUsage();
            std::cout << "ERROR, bad option: " << argv[argi] << std::endl;
            std::exit(EXIT_FAILURE);
        }
        ++argi;
    }
    std::map<std::string, ScoredLength> maxLengthMap;
    OutputSampleListSet samples(contextLength);
    SMRTSequence read;

    std::ofstream sampleOut;
    CrucialOpen(outFileName, sampleOut, std::ios::out | std::ios::binary);

    // NOTE(review): `reader` is not used below; it is kept because side
    // effects of its construction cannot be ruled out from this file.
    ReaderAgglomerate reader;
    samples.keyLength = contextLength;
    HDFCmpFile<CmpAlignment> cmpReader;
    cmpReader.IncludeField("QualityValue");
    cmpReader.IncludeField("DeletionQV");
    cmpReader.IncludeField("InsertionQV");
    cmpReader.IncludeField("SubstitutionQV");
    cmpReader.IncludeField("SubstitutionTag");
    cmpReader.IncludeField("DeletionTag");
    cmpReader.IncludeField("PulseIndex");
    cmpReader.IncludeField("WidthInFrames");
    cmpReader.IncludeField("PreBaseFrames");

    if (cmpReader.Initialize(cmpH5FileName, H5F_ACC_RDWR) == 0) {
        std::cout << "ERROR, could not open the cmp file." << std::endl;
        std::exit(EXIT_FAILURE);
    }
    std::cout << "Reading cmp file." << std::endl;

    CmpFile cmpFile;

    cmpReader.ReadAlignmentDescriptions(cmpFile);
    cmpReader.ReadStructure(cmpFile);
    std::cout << "done reading structure." << std::endl;
    int alignmentIndex;
    int nAlignments = cmpReader.alnInfoGroup.GetNAlignments();
    std::vector<int> alignmentToBaseMap;

    // Walk the alignments until all are consumed or the sample set reports
    // that it holds enough samples.
    for (alignmentIndex = 0; alignmentIndex < nAlignments and !samples.Sufficient();
         alignmentIndex++) {
        //
        // For ease of use, store the length of the alignment to make another model.
        //

        ByteAlignment alignmentArray;
        cmpReader.ReadAlignmentArray(alignmentIndex, alignmentArray);
        Alignment alignment;
        ByteAlignmentToAlignment(alignmentArray, alignment);
        std::string readSequence, refSequence;
        readSequence.resize(alignmentArray.size());
        refSequence.resize(alignmentArray.size());
        DNASequence readDNA, refDNA;

        ByteAlignmentToQueryString(&alignmentArray[0], alignmentArray.size(), &readSequence[0]);
        ByteAlignmentToRefString(&alignmentArray[0], alignmentArray.size(), &refSequence[0]);
        RemoveGaps(readSequence, readSequence);
        RemoveGaps(refSequence, refSequence);

        // Non-owning views onto the gap-free strings built above.
        readDNA.seq = (Nucleotide*)readSequence.c_str();
        readDNA.length = readSequence.size();
        refDNA.seq = (Nucleotide*)refSequence.c_str();
        refDNA.length = refSequence.size();
        CmpAlignment cmpAlignment;

        cmpReader.ImportReadFromCmpH5(alignmentIndex, cmpAlignment, read);

        CreateAlignmentToSequenceMap(alignmentArray, alignmentToBaseMap);

        if (read.length < contextLength) {
            // Release the imported read exactly as the normal path does at
            // the end of the iteration.
            read.Free();
            continue;
        }
        int subreadLength = (cmpFile.alnInfo.alignments[alignmentIndex].GetQueryEnd() -
                             cmpFile.alnInfo.alignments[alignmentIndex].GetQueryStart());
        if (onlyMaxLength == false) {
            samples.lengths.push_back(subreadLength);
        } else {
            // Remember one scored length per "<movie id>_<hole number>";
            // only the first alignment seen for a name is recorded.
            int score = (cmpAlignment.GetNMatch() - cmpAlignment.GetNMismatch() -
                         cmpAlignment.GetNInsertions() - cmpAlignment.GetNDeletions());
            std::stringstream nameStrm;
            nameStrm << cmpAlignment.GetMovieId() << "_" << cmpAlignment.GetHoleNumber();
            std::string nameStr = nameStrm.str();
            if (maxLengthMap.find(nameStr) == maxLengthMap.end()) {
                maxLengthMap[nameStr] = ScoredLength(score, subreadLength);
            }
        }

        int sampleEnd = alignmentArray.size() - contextLength / 2;
        int a;
        for (a = contextLength / 2; a < sampleEnd; a++) {

            // Make sure the context begins on a real nucleotide.
            while (a < sampleEnd and ((RefChar[alignmentArray[a]] == ' '))) {
                a++;
            }

            //
            // Move ab back to an index where there are contextLength/2 non-gap
            // characters, counted by nb
            //
            int ab;  //num bases
            int ae;  //alignment end
            ab = a - 1;
            int nb = 0, ne = 0;
            while (true) {
                if (RefChar[alignmentArray[ab]] != ' ') {
                    nb++;
                }
                if (ab == 0 or nb == static_cast<int>(contextLength) / 2) break;
                ab--;
            }

            //
            // Advance ae to an index where there are contextLength/2 non-gap
            // characters, counted by ne.
            //
            ae = a + 1;
            while (ae < static_cast<int>(alignmentArray.size()) and
                   ne < static_cast<int>(contextLength) / 2) {
                if (RefChar[alignmentArray[ae]] != ' ') {
                    ne++;
                }
                ae++;
            }

            //
            // Make sure there are no edge effects that prevent a context of the correct length from being assigned.
            //
            if (nb + ne + 1 != static_cast<int>(contextLength)) {
                continue;
            }

            // The context is the run of non-gap reference characters in
            // [ab, ae).
            int ai;
            std::string context;
            for (ai = ab; ai < ae; ai++) {
                if (RefChar[alignmentArray[ai]] != ' ') {
                    context.push_back(RefChar[alignmentArray[ai]]);
                }
            }
            assert(context.size() == contextLength);
            //
            // Now create the context.
            //
            OutputSample sample;

            //
            // Default to a deletion; the classification below overrides it.
            //
            sample.type = OutputSample::Deletion;

            //
            // This context is either an insertion or substitution
            //
            // Look to see if the previous aligned position was an
            // insertion, and move back as far as the insertion extends.
            int aq = a - 1;
            int sampleLength;

            if (QueryChar[alignmentArray[a]] == ' ') {
                sample.type = OutputSample::Deletion;
                sampleLength = 0;
            } else if (RefChar[alignmentArray[aq]] == ' ') {

                while (aq > 0 and RefChar[alignmentArray[aq]] == ' ' and
                       QueryChar[alignmentArray[aq]] != ' ') {
                    aq--;
                }
                sample.type = OutputSample::Insertion;
                sampleLength = a - aq;
            } else if (QueryChar[alignmentArray[a]] == RefChar[alignmentArray[aq]]) {
                // NOTE(review): compares the query at `a` with the reference
                // at `aq` (a - 1), not at `a` -- confirm this is intended.
                sample.type = OutputSample::Match;
                sampleLength = 1;
            } else {
                sample.type = OutputSample::Substitution;
                sampleLength = 1;
            }

            sample.Resize(sampleLength);
            if (sampleLength > 0) {
                int seqPos = alignmentToBaseMap[aq];
                if (seqPos < static_cast<int>(read.length)) {
                    sample.CopyFromSeq(read, seqPos, sampleLength);
                    // Sanity check: copied bases must be plain nucleotides.
                    for (size_t n = 0; n < sample.nucleotides.size(); n++) {
                        char c = sample.nucleotides[n];
                        assert(c == 'A' or c == 'T' or c == 'G' or c == 'C');
                    }
                }
            }
            samples.AppendOutputSample(context, sample);
        }
        read.Free();
    }

    if (onlyMaxLength) {
        // Emit the one recorded length per movie/hole.
        std::map<std::string, ScoredLength>::iterator maxScoreIt;
        for (maxScoreIt = maxLengthMap.begin(); maxScoreIt != maxLengthMap.end(); ++maxScoreIt) {
            std::cout << maxScoreIt->second.length << std::endl;
            samples.lengths.push_back(maxScoreIt->second.length);
        }
    }

    samples.Write(sampleOut);

    return 0;
}
Exemple #12
0
	void Write(std::string outName) {
		std::ofstream bwtOut;
		CrucialOpen(outName, bwtOut, std::ios::binary|std::ios::out);
		Write(bwtOut);
	}
Exemple #13
0
int main(int argc, char* argv[]) {
    string program = "samtom4";
    string versionString = VERSION;
    AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString);

    string samFileName, refFileName, outFileName;
    bool printHeader = false;
    bool parseSmrtTitle = false;
    bool useShortRefName = false;

    CommandLineParser clp;
    clp.SetProgramName(program);
    clp.SetVersion(versionString);
    clp.SetProgramSummary("Converts a SAM file generated by blasr to M4 format.");
    clp.RegisterStringOption("in.sam",        &samFileName,
                             "Input SAM file, which is produced by blasr.");
    clp.RegisterStringOption("reference.fasta", &refFileName,
                             "Reference used to generate file.sam.");
    clp.RegisterStringOption("out.m4",          &outFileName,
                             "Output in blasr M4 format.");
    clp.RegisterPreviousFlagsAsHidden();
    clp.RegisterFlagOption("header",            &printHeader,
                           "Print M4 header.");
    clp.RegisterFlagOption("useShortRefName",   &useShortRefName, 
                           "Use abbreviated reference names obtained "
                           "from file.sam instead of using full names "
                           "from reference.fasta.");
    //clp.SetExamples(program + " file.sam reference.fasta out.m4");

    clp.ParseCommandLine(argc, argv);

    ostream * outFilePtr = &cout;
	ofstream outFileStrm;
	if (outFileName != "") {
		CrucialOpen(outFileName, outFileStrm, std::ios::out);
		outFilePtr = &outFileStrm;
	}

    SAMReader<SAMFullReferenceSequence, SAMReadGroup, SAMAlignment> samReader;
    FASTAReader fastaReader;

    //
    // Initialize samReader and fastaReader.
    //
    samReader.Initialize(samFileName);
    fastaReader.Initialize(refFileName);

    //
    // Configure the file log.
    //
    string command;
    CommandLineParser::CommandLineToString(argc, argv, command);

    //
    // Read necessary input.
    //
    vector<FASTASequence> references;
    fastaReader.ReadAllSequences(references);

    AlignmentSet<SAMFullReferenceSequence, SAMReadGroup, SAMAlignment> alignmentSet;
    samReader.ReadHeader(alignmentSet); 

    //
    // The order of references in vector<FASTASequence> references and
    // AlignmentSet<, , >alignmentSet.references can be different.
    // Rearrange alignmentSet.references such that it is ordered in
    // exactly the same way as vector<FASTASequence> references.
    //
    alignmentSet.RearrangeReferences(references);

    //
    // Map short names for references obtained from file.sam to 
    // full names obtained from reference.fasta
    //
    map<string, string> shortRefNameToFull;
    map<string, string>::iterator it;
    assert(references.size() == alignmentSet.references.size());
    if (!useShortRefName) {
        for (size_t i = 0; i < references.size(); i++) {
            string shortRefName = alignmentSet.references[i].GetSequenceName();
            string fullRefName(references[i].title); 
            if (shortRefNameToFull.find(shortRefName) != shortRefNameToFull.end()) {
                cout << "ERROR, Found more than one reference " << shortRefName << "in sam header" << endl;
                exit(1);
            } 
            shortRefNameToFull[shortRefName] = fullRefName;
            alignmentSet.references[i].sequenceName = fullRefName;
        }
    }

    // Map reference name obtained from SAM file to indices
    map<string, int> refNameToIndex;
    for (size_t i = 0; i < references.size(); i++) {
        string refName = alignmentSet.references[i].GetSequenceName();
        refNameToIndex[refName] = i;
    }

    //
    // Store the alignments.
    //
    SAMAlignment samAlignment;
    size_t alignIndex = 0; 

    //
    // For 150K, each chip produces about 300M sequences 
    // (not including quality values and etc.).
    // Let's assume that the sam file and reference data can 
    // fit in the memory. 
    // Need to scale for larger sequal data in the future.
    //
    if (printHeader)
        IntervalOutput::PrintHeader(*outFilePtr);

    // The socre matrix does not matter because we will use the 
    // aligner's score from SAM file anyway.
    DistanceMatrixScoreFunction<DNASequence, DNASequence> distScoreFn;

    while (samReader.GetNextAlignment(samAlignment)) {
        if (samAlignment.rName == "*") {
            continue;
        }

        if (!useShortRefName) {
            //convert shortRefName to fullRefName
            it = shortRefNameToFull.find(samAlignment.rName);
            if (it == shortRefNameToFull.end()) {
                cout << "ERROR, Could not find " << samAlignment.rName << " in the reference repository." << endl;
                exit(1);
            }
            samAlignment.rName = (*it).second;
        }

        // The padding character 'P' is not supported
        if (samAlignment.cigar.find('P') != string::npos) {
            cout << "WARNING. Could not process sam record with 'P' in its cigar string."
                 << endl;
            continue;
        }

        vector<AlignmentCandidate<> > convertedAlignments;

        //
        // Keep reference as forward.
        // So if IsReverseComplement(sam.flag)==true, then qStrand is reverse
        // and tStrand is forward.
        //
        bool keepRefAsForward = false;

        SAMAlignmentsToCandidates(samAlignment, references, refNameToIndex,
                                  convertedAlignments, parseSmrtTitle, 
                                  keepRefAsForward);

        if (convertedAlignments.size() > 1) {
            cout << "WARNING. Ignore an alignment which has multiple segments." << endl;
            continue;
        }

        //all alignments are unique single-ended alignments.
        for (int i = 0; i < 1; i++) {
            AlignmentCandidate<> & alignment = convertedAlignments[i];

            ComputeAlignmentStats(alignment, alignment.qAlignedSeq.seq, 
                                  alignment.tAlignedSeq.seq, distScoreFn);

            // Use aligner's score from SAM file anyway.
            alignment.score = samAlignment.as;
            alignment.mapQV = samAlignment.mapQV;

            // Since SAM only has the aligned sequence, many info of the 
            // original query (e.g. the full length) is missing. 
            // Overwrite alignment.qLength (which is length of the query
            // in the SAM alignment) with xq (which is the length of the 
            // original query sequence saved by blasr) right before printing 
            // the output so that one can reconstruct a blasr m4 record from 
            // a blasr sam alignment.
            if (samAlignment.xq!=0)
                alignment.qLength = samAlignment.xq;

            IntervalOutput::PrintFromSAM(alignment, *outFilePtr);

            alignment.FreeSubsequences();
        }
        ++alignIndex;
    }

	if (outFileName != "") {
		outFileStrm.close();
	}
    return 0;
}
int main(int argc, char* argv[])
{
    std::string inFileName, readsFileName;
    DNALength readLength;
    float coverage = 0;
    bool noRandInit = false;
    int numReads = -1;
    CommandLineParser clp;
    int qualityValue = 20;
    bool printFastq = false;
    int stratify = 0;
    std::string titleType = "pacbio";
    std::string fastqType = "illumina";  // or "sanger"
    clp.RegisterStringOption("inFile", &inFileName, "Reference sequence", 0);
    clp.RegisterPreviousFlagsAsHidden();
    clp.RegisterIntOption("readLength", (int*)&readLength,
                          "The length of reads to simulate.  The length is fixed.",
                          CommandLineParser::PositiveInteger, 0);
    clp.RegisterFloatOption("coverage", &coverage,
                            "Total coverage (from which the number of reads is calculated",
                            CommandLineParser::PositiveFloat, 0);
    clp.RegisterFlagOption("nonRandInit", &noRandInit,
                           "Skip initializing the random number generator with time.");
    clp.RegisterIntOption("nReads", &numReads,
                          "Total number of reads (from which coverage is calculated)",
                          CommandLineParser::PositiveInteger, 0);
    clp.RegisterStringOption("readsFile", &readsFileName, "Reads output file", 0);
    clp.RegisterFlagOption("fastq", &printFastq,
                           "Fake fastq output with constant quality value (20)");
    clp.RegisterIntOption("quality", &qualityValue, "Value to use for fastq quality",
                          CommandLineParser::PositiveInteger);
    clp.RegisterIntOption("stratify", &stratify,
                          "Sample a read every 'stratify' bases, rather than randomly.",
                          CommandLineParser::PositiveInteger);
    clp.RegisterStringOption("titleType", &titleType,
                             "Set the name of the title: 'pacbio'|'illumina'");
    clp.RegisterStringOption("fastqType", &fastqType, "Set the type of fastq: 'illumina'|'sanger'");
    std::vector<std::string> leftovers;
    clp.ParseCommandLine(argc, argv, leftovers);

    if (!noRandInit) {
        InitializeRandomGeneratorWithTime();
    }

    FASTAReader inReader;
    inReader.Init(inFileName);
    std::vector<FASTASequence> reference;

    inReader.ReadAllSequences(reference);
    std::ofstream readsFile;
    if (readsFileName == "") {
        std::cout << "ERROR.  You must specify a reads file." << std::endl;
        std::exit(EXIT_FAILURE);
    }
    CrucialOpen(readsFileName, readsFile, std::ios::out);

    std::ofstream sangerFastqFile;
    if (fastqType == "sanger") {
        std::string sangerFastqFileName = readsFileName + ".fastq";
        CrucialOpen(sangerFastqFileName, sangerFastqFile, std::ios::out);
    }

    DNALength refLength = 0;
    for (size_t i = 0; i < reference.size(); i++) {
        refLength += reference[i].length;
    }
    if (numReads == -1 and coverage == 0 and stratify == 0) {
        std::cout << "ERROR, you must specify either coverage, nReads, or stratify." << std::endl;
        std::exit(EXIT_FAILURE);
    } else if (numReads == -1) {
        numReads = (refLength / readLength) * coverage;
    }

    if (stratify) {
        if (!readLength) {
            std::cout << "ERROR. If you are using stratification, a read length must be specified."
                      << std::endl;
            std::exit(EXIT_FAILURE);
        }
    }

    DNASequence sampleSeq;
    sampleSeq.length = readLength;
    int maxRetry = 10000000;
    int retryNumber = 0;
    DNALength seqIndex, seqPos;
    if (stratify) {
        seqIndex = 0;
        seqPos = 0;
    }
    DNALength origReadLength = readLength;
    for (int i = 0; stratify or i < numReads; i++) {
        if (stratify == 0) {
            FindRandomPos(reference, seqIndex, seqPos, readLength);
        } else {
            //
            // find the next start pos, or bail if done
            //
            if (seqPos >= reference[seqIndex].length) {
                if (seqIndex == reference.size() - 1) {
                    break;
                } else {
                    seqIndex = seqIndex + 1;
                    seqPos = 0;
                    continue;
                }
            }
            readLength = std::min(reference[seqIndex].length - seqPos, origReadLength);
        }
        sampleSeq.seq = &reference[seqIndex].seq[seqPos];
        int j;
        int gappedRead = 0;
        std::string title;
        std::stringstream titleStrm;
        if (titleType == "pacbio") {
            titleStrm << i << "|" << reference[seqIndex].GetName() << "|" << seqPos << "|"
                      << seqPos + readLength;
        } else if (titleType == "illumina") {
            titleStrm << "SE_" << i << "_0@" << seqPos << "-" << seqPos + readLength << "/1";
        } else {
            std::cout << "ERROR. Bad title type " << titleType << std::endl;
            std::exit(EXIT_FAILURE);
        }
        title = titleStrm.str();
        sampleSeq.length = readLength;
        if (!printFastq) {
            readsFile << ">" << title << std::endl;
            sampleSeq.PrintSeq(readsFile);
        } else {
            FASTQSequence fastqSampleSeq;
            fastqSampleSeq.CopyTitle(title);
            fastqSampleSeq.seq = sampleSeq.seq;
            fastqSampleSeq.length = sampleSeq.length;
            fastqSampleSeq.qual.data = new unsigned char[sampleSeq.length];
            std::fill(fastqSampleSeq.qual.data, fastqSampleSeq.qual.data + sampleSeq.length,
                      qualityValue);
            if (fastqType == "illumina") {
                fastqSampleSeq.PrintFastq(readsFile, fastqSampleSeq.length + 1);
            } else {
                fastqSampleSeq.PrintSeq(readsFile);
                fastqSampleSeq.PrintQual(sangerFastqFile);
            }
            delete[] fastqSampleSeq.qual.data;
            delete[] fastqSampleSeq.title;
        }

        if (stratify) {
            seqPos += readLength;
        }
    }
    return 0;
}
void AfgBasWriter::Initialize(std::string _afgFileName){
    afgFileName = _afgFileName;
    CrucialOpen(afgFileName, afgOut);
}
int main(int argc, char* argv[]) {
  string barcodeFileName, insertFileName, outputFileName;
  if (argc != 4) {
    cout << "usage: makeBarcodeDatabase insert.fasta barcodes.fasta output.fasta" << endl;
    exit(1);
  }
  insertFileName = argv[1];
  barcodeFileName = argv[2];
  outputFileName  = argv[3];

  FASTAReader barcodeReader, insertReader;
  barcodeReader.Initialize(barcodeFileName);
  insertReader.Initialize(insertFileName);
  
  ofstream barcodedOut;
  CrucialOpen(outputFileName, barcodedOut, std::ios::out);

  vector<FASTASequence> forwardBarcodes, reverseBarcodes;
  FASTASequence barcodeSequence, reverseBarcodeSequence;
  while(barcodeReader.GetNext(barcodeSequence)) {
    forwardBarcodes.push_back(barcodeSequence);
    barcodeSequence.MakeRC(reverseBarcodeSequence);
    reverseBarcodes.push_back(reverseBarcodeSequence);
  }
  
  FASTASequence insert;
  insertReader.GetNext(insert);
  
  int i;
  for (i = 0; i < forwardBarcodes.size(); i++) {
    FASTASequence barcodedInsert;
    barcodedInsert.Resize(forwardBarcodes[i].length * 2 + insert.length);
    stringstream titleStrm;
    titleStrm << insert.title << "|ff|" << forwardBarcodes[i].title;
    barcodedInsert.CopyTitle(titleStrm.str());
    memcpy(&barcodedInsert.seq[0], &forwardBarcodes[i].seq[0], forwardBarcodes[i].length);
    memcpy(&barcodedInsert.seq[forwardBarcodes[i].length], insert.seq, insert.length);
    memcpy(&barcodedInsert.seq[forwardBarcodes[i].length + insert.length], forwardBarcodes[i].seq, forwardBarcodes[i].length);
    barcodedInsert.PrintSeq(barcodedOut);

    titleStrm.str("");
    titleStrm << insert.title << "|fr|" << forwardBarcodes[i].title;
    barcodedInsert.CopyTitle(titleStrm.str());
    memcpy(&barcodedInsert.seq[0], &forwardBarcodes[i].seq[0], forwardBarcodes[i].length);
    memcpy(&barcodedInsert.seq[forwardBarcodes[i].length], insert.seq, insert.length);
    memcpy(&barcodedInsert.seq[forwardBarcodes[i].length + insert.length], reverseBarcodes[i].seq, reverseBarcodes[i].length);
    barcodedInsert.PrintSeq(barcodedOut);


    titleStrm.str("");
    titleStrm << insert.title << "|rf|" << forwardBarcodes[i].title;
    barcodedInsert.CopyTitle(titleStrm.str());
    memcpy(&barcodedInsert.seq[0], &reverseBarcodes[i].seq[0], reverseBarcodes[i].length);
    memcpy(&barcodedInsert.seq[reverseBarcodes[i].length], insert.seq, insert.length);
    memcpy(&barcodedInsert.seq[reverseBarcodes[i].length + insert.length], forwardBarcodes[i].seq, forwardBarcodes[i].length);
    barcodedInsert.PrintSeq(barcodedOut);


    titleStrm.str("");
    titleStrm << insert.title << "|rr|" << forwardBarcodes[i].title;
    barcodedInsert.CopyTitle(titleStrm.str());
    memcpy(&barcodedInsert.seq[0], &reverseBarcodes[i].seq[0], reverseBarcodes[i].length);
    memcpy(&barcodedInsert.seq[reverseBarcodes[i].length], insert.seq, insert.length);
    memcpy(&barcodedInsert.seq[reverseBarcodes[i].length + insert.length], reverseBarcodes[i].seq, reverseBarcodes[i].length);
    barcodedInsert.PrintSeq(barcodedOut);
  }
}
Exemple #17
0
// samFilter: reads a SAM file and the reference FASTA it was aligned against,
// drops alignments that fail the registered filter criteria (minimum aligned
// length, percent similarity, percent accuracy, score cutoff, optional hole
// number ranges, optional adapter-only filter), applies the hit policy to the
// alignments of each query, and prints the surviving records, sorted by
// reference name and query name, preceded by the original header plus a @PG
// line for this run.
//
// Output goes to the file named by the "out.sam" option, or to stdout when
// that name is empty.  Exits with status 1 on unparsable options or titles.
int main(int argc, char* argv[]) {
#ifdef USE_GOOGLE_PROFILER
    char *profileFileName = getenv("CPUPROFILE");
    if (profileFileName != NULL) {
      ProfilerStart(profileFileName);
    }
    else {
      ProfilerStart("google_profile.txt");
    }
#endif

    // Register inputs and outputs.
    string samFileName, refFileName, outFileName;

    CommandLineParser clp;
    clp.RegisterStringOption("file.sam", &samFileName,
                             "Input SAM file.");
    clp.RegisterStringOption("reference.fasta", &refFileName,
                             "Reference used to generate reads.");
    clp.RegisterStringOption("out.sam", &outFileName,
                             "Output SAM file.");
    clp.RegisterPreviousFlagsAsHidden();

    // Register filter criteria options.
    int minAlnLength = 50;
    float minPctSimilarity = 70, minPctAccuracy = 70;
    string hitPolicyStr = "randombest";
    bool useScoreCutoff = false;
    int  scoreCutoff = INF_INT;
    int  scoreSignInt = -1;
    RegisterFilterOptions(clp, minAlnLength, minPctSimilarity, 
                          minPctAccuracy, hitPolicyStr, useScoreCutoff,
                          scoreSignInt, scoreCutoff);

    int seed = 1; 
    clp.RegisterIntOption("seed", &seed,
            "(1)  Seed for random number generator.\n"
            "If seed is 0, then use current time as seed.",
            CommandLineParser::Integer);

    string holeNumberStr;
    Ranges holeNumberRanges;
    clp.RegisterStringOption("holeNumbers", &holeNumberStr,
            "A string of comma-delimited hole number ranges to output hits, "
            "such as '1,2,10-12'. "
            "This requires hit titles to be in SMRT read title format.");

    bool parseSmrtTitle = false;
    clp.RegisterFlagOption("smrtTitle", &parseSmrtTitle,
            "Use this option when filtering alignments generated by "
            "programs other than blasr, e.g. bwa-sw or gmap. "
            "  Parse read coordinates from the SMRT read title. " 
            "The title is in the format /name/hole/coordinates, where"
            " coordinates are in the format \\d+_\\d+, and represent "
            "the interval of the read that was aligned.");
    /* This experimental option can be useful for metagenomics, in which case
     * there are hundreds of sequences in the target, of which many titles are
     * long and may contain white spaces (e.g., ' ', '\t'). 
     * In order to save disc space and avoid the (possibly) none unique mapping
     * between full and short reference names, one may call blasr with 
     * -titleTable option to represent all target sequences in the output
     * by their indices in the title table.*/

    string titleTableName = "";
    clp.RegisterStringOption("titleTable", &titleTableName,
            "Use this experimental option when filtering alignments generated by "
            "blasr with -titleTable titleTableName, in which case "
            "reference titles in SAM are represented by their "
            "indices (e.g., 0, 1, 2, ...) in the title table.");

    string adapterGffFileName = "";
    clp.RegisterStringOption("filterAdapterOnly", &adapterGffFileName,
            "Use this option to remove reads which can only map to adapters " 
            "specified in the GFF file.");

    bool verbose = false;
    clp.RegisterFlagOption("v", &verbose, "Be verbose.");

    clp.SetExamples(
            "Because SAM has optional tags that have different meanings"
            " in different programs, careful usage is required in order "
            "to have proper output.  The \"xs\" tag in bwa-sw is used to "
            "show the suboptimal score, but in PacBio SAM (blasr) it is "
            "defined as the start in the query sequence of the alignment.\n"
            "When \"-smrtTitle\" is specified, the xs tag is ignored, but "
            "when it is not specified, the coordinates given by the xs and "
            "xe tags are used to define the interval of a read that is "
            "aligned.  The CIGAR string is relative to this interval.");

    clp.ParseCommandLine(argc, argv);

    // Set random number seed. 
    if (seed == 0) {
        srand(time(NULL));
    } else {
        srand(seed);
    }
    
    scoreSign = (scoreSignInt == -1)?ScoreSign::NEGATIVE:ScoreSign::POSITIVE;
    Score s(static_cast<float>(scoreCutoff), scoreSign);
    FilterCriteria filterCriteria(minAlnLength, minPctSimilarity, 
                                  minPctAccuracy, true, s);
    filterCriteria.Verbose(verbose);
    HitPolicy hitPolicy(hitPolicyStr, scoreSign);
                                  
    string errMsg;
    if (not filterCriteria.MakeSane(errMsg)) {
        cout << errMsg << endl;
        exit(1);
    }

    // Parse hole number ranges. 
    if (holeNumberStr.size() != 0) {
        if (not holeNumberRanges.setRanges(holeNumberStr)) {
            cout << "Could not parse hole number ranges: "
                 << holeNumberStr << "." << endl;
            exit(1);
        } 
    }

    // Open output file.  All SAM output below goes through *outFilePtr so
    // that it lands on stdout when no output file name was given, instead of
    // being written to a never-opened ofstream.
    ostream * outFilePtr = &cout;
    ofstream outFileStrm;
    if (outFileName != "") {
        CrucialOpen(outFileName, outFileStrm, std::ios::out);
        outFilePtr = &outFileStrm;
    }
    
    GFFFile adapterGffFile;
    if (adapterGffFileName != "")
        adapterGffFile.ReadAll(adapterGffFileName);
    
    SAMReader<SAMFullReferenceSequence, SAMReadGroup, SAMAlignment> samReader;
    FASTAReader fastaReader;

    //
    // Initialize samReader and fastaReader.
    //
    samReader.Initialize(samFileName);
    fastaReader.Initialize(refFileName);

    //
    // Program name and version reported in the @PG header line.
    //
    string program = "samFilter";
    string versionString = VERSION;
    AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString);

    //
    // Read necessary input.
    //
    vector<FASTASequence> references;
    fastaReader.ReadAllSequences(references);

    // If the SAM file is generated by blasr with -titleTable,
    // then references in the SAM are represented by 
    // their corresponding indices in the title table.
    // In that case, we need to convert reference titles in fasta file
    // to their corresponding indices in the title table, such that
    // references in both SAM and fasta files are represented
    // by title table indices and therefore can match.
    if (titleTableName != "") {
        ConvertTitlesToTitleTableIndices(references, titleTableName);
    }
 
    AlignmentSet<SAMFullReferenceSequence, SAMReadGroup, SAMAlignment> alignmentSet;
    vector<string> allHeaders = samReader.ReadHeader(alignmentSet); 

    // Process SAM Header: echo the input header and append our own @PG line.
    string commandLineString;
    clp.CommandLineToString(argc, argv, commandLineString);
    allHeaders.push_back("@PG\tID:SAMFILTER\tVN:" + versionString +
                         "\tCL:" + program + " " + commandLineString);
    for (size_t i = 0; i < allHeaders.size(); i++) {
        *outFilePtr << allHeaders[i] << endl;
    }

    //
    // The order of references in vector<FASTASequence> references and
    // AlignmentSet<, , >alignmentSet.references can be different.
    // Rearrange alignmentSet.references such that they are ordered in
    // exactly the same way as vector<FASTASequence> references.
    //
    alignmentSet.RearrangeReferences(references);

    // Map reference name obtained from SAM file to indices
    map<string, int> refNameToIndex;
    for (size_t i = 0; i < references.size(); i++) {
        string refName = alignmentSet.references[i].GetSequenceName();
        refNameToIndex[refName] = static_cast<int>(i);
    }

    //
    // Store the alignments.
    //
    SAMAlignment samAlignment;

    //
    // For 150K, each chip produces about 300M sequences 
    // (not including quality values and etc.).
    // Let's assume that the sam file and reference data can 
    // fit in the memory. 
    // Need to scale for larger sequal data in the future.
    //
    vector<SAMAlignment> allSAMAlignments;
    while (samReader.GetNextAlignment(samAlignment)) {
        // Unmapped records carry no reference and cannot be filtered.
        if (samAlignment.rName == "*") {
            continue;
        }

        if (parseSmrtTitle and holeNumberStr.size() != 0) {
            string movieName;
            int thisHoleNumber;
            if (not ParsePBIReadName(samAlignment.qName, 
                                     movieName, 
                                     thisHoleNumber)) {
                cout << "ERROR, could not parse SMRT title: "
                     << samAlignment.qName << "." << endl;
                exit(1);
            }
            if (not holeNumberRanges.contains(UInt(thisHoleNumber))) {
                if (verbose) 
                    cout << thisHoleNumber << " is not in range." << endl; 
                continue;
            }
        }

        if (samAlignment.cigar.find('P') != string::npos) {
            cout << "WARNING. Could not process SAM record with 'P' in "
                 << "its cigar string." << endl;
            continue;
        }

        vector<AlignmentCandidate<> > convertedAlignments;
        SAMAlignmentsToCandidates(samAlignment, 
                references, refNameToIndex,
                convertedAlignments, parseSmrtTitle, false);
        
        if (convertedAlignments.size() > 1) {
            cout << "WARNING. Ignore multiple segments." << endl;
            continue;
        }

        // Nothing was converted: there is no candidate to evaluate, and
        // indexing element 0 below would be out of bounds.
        if (convertedAlignments.empty()) {
            if (verbose)
                cout << "WARNING. No alignment candidate for "
                     << samAlignment.qName << "." << endl;
            continue;
        }

        AlignmentCandidate<> & alignment = convertedAlignments[0];

        //score func does not matter
        DistanceMatrixScoreFunction<DNASequence, DNASequence> distFunc; 
        ComputeAlignmentStats(alignment, alignment.qAlignedSeq.seq, 
                              alignment.tAlignedSeq.seq, distFunc);

        bool keepAlignment = true;

        // Check whether this alignment can only map to adapters in 
        // the adapter GFF file.
        if (adapterGffFileName != "" and 
            CheckAdapterOnly(adapterGffFile, alignment, refNameToIndex)) {
            if (verbose)
                cout << alignment.qName << " filter adapter only."
                     << endl;
            keepAlignment = false;
        }

        if (keepAlignment) {
            // Assign score to samAlignment.
            samAlignment.score = samAlignment.as;
            keepAlignment = filterCriteria.Satisfy(
                    static_cast<AlignmentCandidate<> *>(&alignment));
        }

        if (keepAlignment) {
            allSAMAlignments.push_back( samAlignment ); 
        }

        // Release the candidate's subsequences whether or not the record
        // was kept; rejected records used to skip this step.
        alignment.FreeSubsequences();
    }

    // Sort all SAM alignments by qName, score and target position.
    sort(allSAMAlignments.begin(), allSAMAlignments.end(), 
         byQNameScoreTStart);

    unsigned int groupBegin = 0;
    unsigned int groupEnd = -1;
    vector<SAMAlignment> filteredSAMAlignments;
    while(groupBegin < allSAMAlignments.size()) {
        // Get the next group of SAM alignments which have the same qName
        // from allSAMAlignments[groupBegin ... groupEnd)
        GetNextSAMAlignmentGroup(allSAMAlignments, groupBegin, groupEnd);
        vector<unsigned int> hitIndices = ApplyHitPolicy(
                hitPolicy, allSAMAlignments, groupBegin, groupEnd);
        for(unsigned int i = 0; i < hitIndices.size(); i++) {
            filteredSAMAlignments.push_back(allSAMAlignments[hitIndices[i]]);
        }
        groupBegin = groupEnd;
    }

    // Sort all SAM alignments by reference name and query name
    sort(filteredSAMAlignments.begin(), filteredSAMAlignments.end(), 
         byRNameQName);

    for(size_t i = 0; i < filteredSAMAlignments.size(); i++) {
        filteredSAMAlignments[i].PrintSAMAlignment(*outFilePtr);
    }

    if (outFileName != "") {
        outFileStrm.close();
    }
#ifdef USE_GOOGLE_PROFILER
  ProfilerStop();
#endif
    return 0;
}