コード例 #1
0
int main(int argc, char* argv[]) {

	string seqFileName;
	TupleMetrics tm;
	string outFileName;
	if (argc < 3) {
		cout << "usage: storeTuplePosList seqFile tupleSize outFile" << endl;
		return 0;
	}
	seqFileName = argv[1];
	tm.tupleSize = atoi(argv[2]);
	outFileName = argv[3];
	
	ofstream outFile;
	//	CrucialOpen(outFileName, outFile, std::ios::out| std::ios::binary);

	FASTAReader reader;
	reader.Init(seqFileName);
	FASTASequence seq;
	reader.GetNext(seq);
	//	vector<PositionDNATuple> 
	TupleList<PositionDNATuple>tuplePosList;
	tuplePosList.SetTupleMetrics(tm);
	//	StoreTuplePosList(seq, tm, tuplePosList);
	SequenceToTupleList(seq, tm, tuplePosList);
	tuplePosList.Sort();
	tuplePosList.WriteToFile(outFileName); //WriteTuplePosList(tuplePosList, tm.tupleSize, outFile);
	outFile.close();
	return 0;
}
コード例 #2
0
int main(int argc, char* argv[]) {
  if (argc < 3) {
    cout << "usage: testRandomSequence genome.fa ntries " << endl;
    exit(0);
  }
  string inFile = argv[1];
  int nSamples = atoi(argv[2]);

  if (nSamples == 0) {
    return 0;
  }

  FASTAReader reader;
  reader.Initialize(inFile);
  vector<FASTASequence> genome;
  reader.ReadAllSequences(genome);
  
  int i;
  cout << "title pos" << endl;
  for (i = 0; i < nSamples; i++) {
    DNALength chrIndex, chrPos;
    FindRandomPos(genome, chrIndex, chrPos);
    cout << genome[chrIndex].title << " " << chrPos << endl;
  }

  return 0;
}
コード例 #3
0
ファイル: FileGenome.cpp プロジェクト: asifemon/blasr
int main(int argc, char* argv[]) {
  string inFileName, outFileName;
  int length;
  inFileName = argv[1];
  outFileName = argv[2];
  length = atoi(argv[3]);

  int argi = 4;
  int stride = 0;
  float coverage = 0;
  while (argi < argc) {
    if (strcmp(argv[argi], "-stride")) {
      stride = atoi(argv[++argi]);
    }
    else if (strcmp(argv[argi], "-coverage")) {
      coverage = atof(argv[++argi]);
    }
    ++argi;
  }

  FASTAReader reader;
  reader.Initialize(inFileName);
  FASTASequence genome;
  reader.GetNext(genome);
  if (stride == 0 and coverage == 0) {
    cout << "ERROR, must provide stride or coverage. " << endl;
    exit(1);
  }
  if (stride == 0) {
    stride = genome.length * coverage / length;
  }
コード例 #4
0
ファイル: TestBuildOccBins.cpp プロジェクト: EichlerLab/blasr
int main(int argc, char* argv[]) {
	if (argc < 3) {
		cout << "usage: testBuildOccBins genomeFileName suffixArray" << endl;
		exit(0);
	}
	string genomeFileName      = argv[1];
	string suffixArrayFileName = argv[2];
	
	FASTAReader reader;
	reader.Init(genomeFileName);
	FASTASequence seq;
	reader.GetNext(seq);
	

	DNASuffixArray suffixArray;
	
	suffixArray.Read(suffixArrayFileName);

	Bwt<PackedDNASequence, FASTASequence> bwt;
	//bwt.InitializeFromSuffixArray(seq, suffixArray.index);

	bwt.InitializeBWTStringFromSuffixArray(seq, suffixArray.index);
	bwt.occ.Initialize(bwt.bwtSequence, 4096, 64);
	bwt.occ.PrintBins(cout);
}
コード例 #5
0
int main(int argc, char* argv[]) {

	if (argc < 4) {
		PrintUsage();
		exit(1);
	}
	int argi = 1;
	string saInFile = argv[argi++];
	string genomeFileName = argv[argi++];
	string saOutFile = argv[argi++];
	vector<string> inFiles;
	
	int doBLT = 0;
	int doBLCP = 0;
	int bltPrefixLength = 0;
	int lcpLength = 0;
	int parsingOptions = 0;
	
	while (argi < argc) {
		if (strcmp(argv[argi], "-blt") == 0) {
			doBLT = 1;
			bltPrefixLength = atoi(argv[++argi]);
		}
		else if (strcmp(argv[argi], "-blcp") == 0) {
			doBLCP = 1;
				lcpLength = atoi(argv[++argi]);
		}
		else {
			PrintUsage();
			cout << "Bad option: " << argv[argi] << endl;
			exit(1);
		}
		++argi;
	}

	//
	// Read the suffix array to modify.
	//

	DNASuffixArray  sa;
	sa.Read(saInFile);

	FASTAReader reader;
	reader.Initialize(genomeFileName);
	FASTASequence seq;
	reader.ReadAllSequencesIntoOne(seq);

	
	if (doBLT) {
		sa.BuildLookupTable(seq.seq, seq.length, bltPrefixLength);
	}

	if (doBLCP) {
		cout << "LCP Table not yet implemented." << endl;
	}

	sa.Write(saOutFile);

}
コード例 #6
0
int main(int argc, char* argv[]) {
	if (argc < 3) {
		cout << "usage: bwtLocateList bwtName querySeqFile" << endl;
		exit(1);
	}
	string bwtFileName      = argv[1];
	string querySeqFileName = argv[2];
	bool doPrintResults = false;
	int maxCount = 0;
	int argi = 3;
	bool countOnly = false;
	while(argi < argc) {
		if (strcmp(argv[argi], "-print") == 0) {
			doPrintResults = true;
		}
		else if (strcmp(argv[argi], "-max") == 0) {
			maxCount = atoi(argv[++argi]);
		}
		else if (strcmp(argv[argi], "-count") == 0) {
			countOnly = true;
		}
		else {
			cout << "bad option: " << argv[argi] << endl;
		}
		++argi;
	}

 	Bwt<PackedDNASequence, FASTASequence> bwt;
	bwt.Read(bwtFileName);

	FASTAReader queryReader;
	queryReader.Init(querySeqFileName);
	FASTASequence seq;
	int seqIndex = 0;
	vector<DNALength> positions;
	while(queryReader.GetNext(seq)) {
		positions.clear();
		if (countOnly == false) {
			bwt.Locate(seq, positions, maxCount);
		}
		else {
			DNALength sp,ep;
			bwt.Count(seq, sp, ep);
		}
		//		cout << "matched " << positions.size() << " positions." << endl;
		if (doPrintResults) {
			int i;
			for (i = 0; i < positions.size(); i++ ){
				cout << positions[i] << " ";
			}
			cout << endl;
		}
		++seqIndex;
	}
		//	float wordCountsPerLookup = (bwt.bwtSequence.nCountInWord *1.0) / bwt.bwtSequence.nCountNuc;
		//	cout << "word counts per lookup: " << wordCountsPerLookup << endl;
	return 0;
}
コード例 #7
0
ファイル: ExciseRepeats.cpp プロジェクト: Jiason2017/blasr
int main(int argc, char* argv[])
{
    std::string seqInName, seqOutName, dotOutName;
    if (argc < 4) {
        std::cout << "usage: exciseRepeats inName repMaskOutFile outName" << std::endl;
        std::exit(EXIT_FAILURE);
    }

    seqInName = argv[1];
    dotOutName = argv[2];
    seqOutName = argv[3];
    FASTAReader reader;
    reader.Initialize(seqInName);
    FASTASequence origSeq;
    reader.GetNext(origSeq);

    std::ifstream dotOutFile;
    CrucialOpen(dotOutName, dotOutFile);
    std::ofstream seqOutFile;
    std::ofstream seqOut;
    CrucialOpen(seqOutName, seqOut, std::ios::out);
    std::string dotOutLine;
    getline(dotOutFile, dotOutLine);
    getline(dotOutFile, dotOutLine);
    getline(dotOutFile, dotOutLine);
    while (getline(dotOutFile, dotOutLine)) {
        std::stringstream lineStrm(dotOutLine);
        int swScore;
        float pctDiv, pctDel, pctIns;
        std::string query;
        DNALength qPosBegin, qPosEnd;
        std::string left;
        char strand;
        std::string matchingRepeat;
        std::string repClass;
        std::string repPos, repEnd, repLeft;
        int id;
        lineStrm >> swScore >> pctDiv >> pctDel >> pctIns >> query >> qPosBegin >> qPosEnd >>
            left >> strand >> matchingRepeat >> repClass >> repPos >> repEnd >> repLeft >> id;
        for (DNALength seqPos = qPosBegin; seqPos < qPosEnd; seqPos++) {
            origSeq.seq[seqPos] = 'X';
        }
    }

    DNALength seqPos, unexPos;
    unexPos = 0;
    for (seqPos = 0; seqPos < origSeq.length; seqPos++) {
        if (origSeq.seq[seqPos] != 'X') {
            origSeq.seq[unexPos] = origSeq.seq[seqPos];
            unexPos++;
        }
    }
    origSeq.length = unexPos;

    origSeq.PrintSeq(seqOut);
    return 0;
}
コード例 #8
0
ファイル: readmapper.cpp プロジェクト: hcondori/readmapper
/*
* Wrapper for FASTAReader
*/
bool read_seqs(FASTAReader &reader1, FASTAReader &reader2, 
                    Buffer<int16_t> *seqs1, Buffer<int16_t> *seqs2,
                    int *seqs1_len, int *seqs2_len, 
                    std::vector<std::string> *seqs1_ids,
                    std::vector<std::string> *seqs2_ids)
{
    bool ans;
    
    #pragma omp critical
    ans = reader1.next(seqs1, seqs1_len, seqs1_ids) && 
        reader2.next(seqs2, seqs2_len, seqs2_ids);
    return ans; 
}
コード例 #9
0
ファイル: ExtractRepeat.cpp プロジェクト: jcombs1/blasr
int main(int argc, char* argv[]) {
	string genomeFileName, subseqFileName;
	if (argc != 3) {
		cout << "usage: extractRepeats genome repeat" << endl;
		exit(0);
	}

	genomeFileName = argv[1];
	subseqFileName = argv[2];

	FASTASequence genome, sub;
	FASTAReader reader;
	reader.Init(genomeFileName);
	reader.GetNext(genome);
	reader.Init(subseqFileName);
	reader.GetNext(sub);

	genome.ToUpper();
	sub.ToUpper();	
	DNALength genomePos;
	FASTASequence genomeSub;
	int kband = (int) (0.15) * sub.length;
	vector<int> scoreMat;
	vector<Arrow> pathMat;
	int readIndex = 0;
	cout << "starting extraction" << endl;
	for (genomePos = 0; genomePos < genome.length - sub.length + 1; genomePos++) {
		genomeSub.seq = &genome.seq[genomePos];
		genomeSub.length = sub.length;
		int alignScore;
		Alignment alignment;
		alignScore = SWAlign(genomeSub, sub,
												 EditDistanceMatrix, 1, //1,kband,
												 scoreMat, pathMat,
												 alignment, QueryFit);
												 
		if (alignScore < 0.25 * sub.length) {
			stringstream titlestrm;
			titlestrm << readIndex << "|" 
								 << genomePos << "|"
								<< genomePos + sub.length << " " << alignScore/ (1.0*sub.length);
			FASTASequence subcopy;
			subcopy.CopyTitle(titlestrm.str());
			subcopy.seq = &genome.seq[genomePos];
			subcopy.length = sub.length;
			subcopy.PrintSeq(std::cout);
			genomePos += sub.length;
		}
	}
}
コード例 #10
0
int main(int argc, char *argv[]) {
	string sequencesInName, sequencesOutName;
	if (argc <3){ 
		cout << "usage: scramble in out" << endl;
		exit(1);
	}
	sequencesInName = argv[1];
	sequencesOutName= argv[2];
	vector<FASTASequence*> sequences;
	vector<int> sequenceIndices;

	FASTAReader reader;
	reader.Init(sequencesInName);
	ofstream out;
	CrucialOpen(sequencesOutName, out, std::ios::out);
	

	FASTASequence read;
	FASTASequence*readPtr;
	while(reader.GetNext(read)) {
		readPtr = new FASTASequence;
		*readPtr = read;
		sequences.push_back(readPtr);
	}

	int i;
	for (i = 0; i < sequences.size(); i++) {
		sequenceIndices.push_back(i);
	}

	for (i = 0; i < 10*sequences.size(); i++ ){
		//
		// shuffle indices.
		//
		int idx1;
		int idx2;
		idx1 = RandomInt(sequences.size());
		idx2 = RandomInt(sequences.size());
		int tmp;
		tmp  = sequenceIndices[idx1];
		sequenceIndices[idx1] = sequenceIndices[idx2];
		sequenceIndices[idx2] = tmp;
	}
	
	for (i = 0; i < sequenceIndices.size(); i++ ){
		sequences[sequenceIndices[i]]->PrintSeq(out);
	}
	return 0;
}
コード例 #11
0
ファイル: BuildSequenceDB.cpp プロジェクト: EichlerLab/blasr
int main(int argc, char* argv[]) {

	CommandLineParser clp;
	string fastaFileName, indexFileName;
	vector<string> fastaFileNames;
	vector<string> opts;
	clp.SetProgramName("bsdb");
	clp.SetProgramSummary("Build an index database on a file of sequences.\n"
												" The index is used to map to reads given alignment positions.\n");
	clp.RegisterStringOption("fasta", &fastaFileName, "A file with sequences to build an index.");
	clp.RegisterStringOption("index", &indexFileName, "The index file.");
	clp.RegisterPreviousFlagsAsHidden();

	clp.ParseCommandLine(argc, argv, opts);

	ifstream fastaIn;
	ofstream indexOut;

	if (FileOfFileNames::IsFOFN(fastaFileName)) {
		FileOfFileNames::FOFNToList(fastaFileName, fastaFileNames);
	}
	else {
		fastaFileNames.push_back(fastaFileName);
	}

	CrucialOpen(indexFileName, indexOut, std::ios::out | std::ios::binary);
	SequenceIndexDatabase<FASTASequence> seqDB;
		
	int fileNameIndex;
	for (fileNameIndex = 0; fileNameIndex < fastaFileNames.size(); fileNameIndex++){ 
		FASTAReader reader;
		FASTASequence seq;
		reader.Init(fastaFileNames[fileNameIndex]);
		int i = 0;
		while (reader.GetNext(seq)) {
			seqDB.AddSequence(seq);
			i++;
		}
	}
	seqDB.Finalize();
	seqDB.WriteDatabase(indexOut);
	return 0;
}
コード例 #12
0
ファイル: SplitContigs.cpp プロジェクト: asifemon/blasr
int main(int argc, char* argv[]) {
		if (argc < 4) {
			cout << "usage: splitContigs in.fa contiglength out" << endl;
			exit(1);
		}
		string inFileName, outFileName;
		inFileName = argv[1];
		int contigLength = atoi(argv[2]);		
		outFileName = argv[3];

		ofstream seqOut;
		CrucialOpen(outFileName, seqOut, std::ios::out);
		FASTAReader reader;
		reader.Init(inFileName);
		FASTASequence seq;
		DNALength curOffset;
		
		while(reader.GetNext(seq)) {
			FASTASequence subseq;
			int i;
			curOffset = 0;
			for (i =0 ; i < seq.length / contigLength + 1; i++ ) {
				subseq.seq = &seq.seq[curOffset];
				subseq.title = seq.title;
				if (curOffset + contigLength > seq.length) {
					subseq.length = seq.length - curOffset;
				}
				else {
					subseq.length = contigLength;
				}
				subseq.PrintSeq(seqOut);
				curOffset += contigLength;
			}
		}
		return 0;
}
コード例 #13
0
ファイル: Hammer.cpp プロジェクト: EichlerLab/blasr
int main(int argc, char* argv[]) {
	string refFileName, queryFileName;
	int maxHammingDistance;
	if (argc < 4) {
		cout << "usage: hammer ref query maxHam " << endl;
		exit(1);
	}
	refFileName = argv[1];
	queryFileName = argv[2];
	maxHammingDistance = atoi(argv[3]);

	FASTAReader reader;
	reader.Initialize(refFileName);
	FASTASequence ref, refRC;
	reader.GetNext(ref);
	ref.MakeRC(refRC);
	
	FASTAReader queryReader;
	queryReader.Initialize(queryFileName);
	FASTASequence query;
	queryReader.GetNext(query);
	DNALength p;
	for(p=0; p < ref.length-query.length-1; p++ ){
		DNASequence subseq;
		subseq.seq = &ref.seq[p];
		subseq.length = query.length;
		//		cout << "t "; subseq.PrintSeq(cout);
		//		cout << "q "; ((DNASequence*)&query)->PrintSeq(cout);
		if (HammingDistance(&subseq.seq[0], &query.seq[0], query.length) < maxHammingDistance) {
			cout << ">" << p << endl;
			subseq.PrintSeq(cout);
		}
		int i;
		for (i =0; i < query.length; i++) {
			subseq.seq[i] = toupper(subseq.seq[i]);
		}
	}

	for(p=0; p < ref.length-query.length-1; p++ ){
		DNASequence subseq;
		subseq.seq = &refRC.seq[p];
		subseq.length = query.length;
		if (HammingDistance(&subseq.seq[0], &query.seq[0], query.length) < maxHammingDistance) {
			cout << ">" << p << "rc" << endl;
			subseq.PrintSeq(cout);
		}
		int i;
		for (i =0; i < query.length; i++) {
			subseq.seq[i] = toupper(subseq.seq[i]);
		}
	}

}
コード例 #14
0
int main(int argc, char* argv[]) {


	string refFileName, notNormalFileName, normalFileName;

	if (argc < 4) {
		cout << "usage: normalizeGCContent ref source dest " << endl
				 << "       flips the C/Gs in source randomly until they are the same gc content as ref." << endl;
		exit(1);
	}
		
	refFileName = argv[1];
	notNormalFileName = argv[2];
	normalFileName = argv[3];


	FASTAReader reader;
	FASTAReader queryReader;
	FASTASequence ref;
	vector<FASTASequence> querySequences;
	int queryTotalLength;
	reader.Initialize(refFileName);
	reader.ReadAllSequencesIntoOne(ref);

	queryReader.Initialize(notNormalFileName);
	int refCounts[5], queryCounts[5];
	int s;
	refCounts[0] = refCounts[1] =refCounts[2] = refCounts[3] = refCounts[4] = 0;
	queryCounts[0] = queryCounts[1] =queryCounts[2] = queryCounts[3] = queryCounts[4] = 0;
	
	queryReader.ReadAllSequences(querySequences);
	ofstream normOut;
	CrucialOpen(normalFileName, normOut);

	CountNucs(ref, refCounts);
	
	float refGC = (1.0*refCounts[TwoBit['c']] + refCounts[TwoBit['g']]) / (refCounts[TwoBit['a']] + refCounts[TwoBit['c']] + refCounts[TwoBit['g']] + refCounts[TwoBit['t']]);

	int q;
	for (q = 0; q < querySequences.size(); q++) {
		CountNucs(querySequences[q], queryCounts);
	}

	float queryGC = (1.0*queryCounts[TwoBit['c']] + queryCounts[TwoBit['g']]) / (queryCounts[TwoBit['a']] + queryCounts[TwoBit['c']] + queryCounts[TwoBit['g']] + queryCounts[TwoBit['t']]);

	
	float gcToat = 0.0;
	float atTogc = 0.0;
	if (refGC > queryGC) {
		atTogc = (refGC - queryGC);
	}
	else {
		gcToat = (queryGC - refGC);
	}

	
	DNALength queryGenomeLength = queryCounts[0] +  queryCounts[1] + queryCounts[2] + queryCounts[3] + queryCounts[4];

	DNALength unmaskedQueryLength = queryCounts[0] +  queryCounts[1] + queryCounts[2] + queryCounts[3];

	DNALength ngc2at = unmaskedQueryLength * gcToat;
	DNALength nat2gc = unmaskedQueryLength * atTogc;
	cout << refGC << " " << queryGC << " " << gcToat << " " << atTogc << " " << ngc2at << " " << nat2gc << endl;

	vector<FASTASequence> normalized;

	normalized.resize(querySequences.size());
	vector<DNALength> cumLengths;
	
	cumLengths.resize(normalized.size()+1);
	cumLengths[0] = 0;
	for (q = 0; q < querySequences.size(); q++) {
		normalized[q]   = querySequences[q];
		cumLengths[q+1] = cumLengths[q] + querySequences[q].length;
	}
	
	DNALength i;

																
	for (i = 0; i < ngc2at; i+=2) {
		DNALength pos, chr;
		FindRandomNuc(normalized, queryGenomeLength, cumLengths, 'G', chr, pos);
		normalized[chr].seq[pos] = 'A';
		FindRandomNuc(normalized, queryGenomeLength, cumLengths, 'C', chr, pos);
		normalized[chr].seq[pos] = 'T';		
	}
	
	for (i = 0; i < nat2gc; i+=2) {
		DNALength pos, chr;
		FindRandomNuc(normalized, queryGenomeLength, cumLengths, 'A', chr, pos);
		normalized[chr].seq[pos] = 'g';
		FindRandomNuc(normalized, queryGenomeLength, cumLengths, 'T', chr, pos);
		normalized[chr].seq[pos] = 'c';		
	}

	for (q = 0; q < normalized.size(); q++ ){
		normalized[q].PrintSeq(normOut);
	}

}
コード例 #15
0
ファイル: Evolve.cpp プロジェクト: EichlerLab/blasr
int main(int argc, char* argv[]) {
	CommandLineParser clp;

	string refGenomeName;
	string mutGenomeName;
  string gffFileName;
	float insRate = 0;
	float delRate = 0;
	float mutRate = 0;
  bool  lower = false;
  gffFileName = "";
	clp.RegisterStringOption("refGenome", &refGenomeName, "Reference genome.", true);
	clp.RegisterStringOption("mutGenome", &mutGenomeName, "Mutated genome.", true);
	clp.RegisterPreviousFlagsAsHidden();
  clp.RegisterStringOption("gff", &gffFileName, "GFF file describing the modifications made to the genome.");
	clp.RegisterFloatOption("i", &insRate, "Insertion rate: (0-1].", 
													CommandLineParser::NonNegativeFloat, false);
	clp.RegisterFloatOption("d", &delRate, "Deletion rate: (0-1]", 
													CommandLineParser::NonNegativeFloat, false);
	clp.RegisterFloatOption("m", &mutRate, "Mutation rate, even across all nucleotides: (0-1]", 
													CommandLineParser::NonNegativeFloat, false);
  clp.RegisterFlagOption("lower", &lower, "Make mutations in lower case", false);
	vector<string> leftovers;
	clp.ParseCommandLine(argc, argv, leftovers);
  
	FASTAReader reader;
	FASTASequence refGenome;

	reader.Init(refGenomeName);
	ofstream mutGenomeOut;
	CrucialOpen(mutGenomeName, mutGenomeOut, std::ios::out);
  ofstream gffOut;
  if (gffFileName != "") {
    CrucialOpen(gffFileName, gffOut, std::ios::out);
  }

	vector<int> insIndices, delIndices, subIndices;
	int readIndex = 0;
	InitializeRandomGeneratorWithTime();
	while (reader.GetNext(refGenome)) {
		insIndices.resize(refGenome.length);
		delIndices.resize(refGenome.length);
    subIndices.resize(refGenome.length);
		std::fill(insIndices.begin(), insIndices.end(), false);
		std::fill(delIndices.begin(), delIndices.end(), false);
    std::fill(subIndices.begin(), subIndices.end(), 0);

		enum ChangeType { Ins, Del, Mut, None};
		float changeProb[4];
		changeProb[Ins] = insRate;
		changeProb[Del] = changeProb[Ins] + delRate;
		changeProb[Mut] = changeProb[Del] + mutRate;
		changeProb[None] = 1;

		if (changeProb[Mut] > 1) {
			cout << "ERROR! The sum of the error probabilities must be less than 1" << endl;
			exit(1);
		}
		DNALength pos;
		float randomNumber;
		int numIns = 0;
		int numDel = 0;
		int numMut = 0;
		for (pos =0 ; pos < refGenome.length; pos++) { 
			randomNumber = Random();
			if (randomNumber < changeProb[Ins]) {
				insIndices[pos] = true;
				numIns++;
			}
			else if (randomNumber < changeProb[Del]) {
				delIndices[pos] = true;
				numDel++;
			}
			else if (randomNumber < changeProb[Mut]){ 
				Nucleotide newNuc = TwoBitToAscii[RandomInt(4)];
				int maxIts = 100000;
				int it = 0;
				while (newNuc == refGenome.seq[pos]) {
					newNuc = TwoBitToAscii[RandomInt(4)];
					if (it == maxIts) {
						cout << "ERROR, something is wrong with the random number generation, it took too many tries to generate a new nucleotide" << endl;
						exit(1);
					}
				}
        subIndices[pos] = refGenome[pos];
				refGenome.seq[pos] = ToLower(newNuc,lower);
				++numMut;
			}
		}
		//		cout << readIndex << " m " << numMut << " i " << numIns << " d " << numDel << endl;
		if (readIndex % 100000 == 0 && readIndex > 0) {
			cout << readIndex << endl;
		}
		// 
		// Now add the insertions and deletions.
		//
		FASTASequence newSequence;
		DNALength   newPos;
		if (numIns - numDel + refGenome.length < 0) {
			cout << "ERROR, the genome has been deleted to nothing." << endl;
			exit(1);
		}
		ResizeSequence(newSequence, refGenome.length + (numIns - numDel));
		newPos = 0;
		pos = 0;
		for (pos = 0; pos < refGenome.length; pos++) {
			assert(newPos < newSequence.length or delIndices[pos] == true);
      if (subIndices[pos] != 0 and gffFileName != "") {
        gffOut << refGenome.GetName() << "	.	SNV	" << newPos << " " << newPos <<" 0.00	.	.	reference=" << (char)subIndices[pos] << ";confidence=10;Name=" << newPos << (char)subIndices[pos] << ">" << refGenome.seq[pos] <<";coverage=10;variantseq=" << refGenome.seq[pos] << endl;
      }
        
			if (insIndices[pos] == true) {
				newSequence.seq[newPos] = ToLower(TwoBitToAscii[RandomInt(4)], lower);
				newPos++;
				newSequence.seq[newPos] = refGenome.seq[pos];
        
				assert(newSequence.seq[newPos] != '1');
				assert(newSequence.seq[newPos] != 1);
        if (gffFileName != "") {
          gffOut << refGenome.GetName() << "	.	deletion	" << newPos << " " << newPos << " 0.00	.	.	reference=" << newSequence.seq[newPos] << ";length=1;confidence=10;coverage=0;Name="<< newPos << "del" << newSequence.seq[newPos] << endl;
        }
				newPos++;
			}
			else if (delIndices[pos] == true) {
				// no-op, skip
        if (gffFileName != "") {
          gffOut << refGenome.GetName() << "	.	insertion	" << newPos << " " << newPos << " 0.00	.	.	confidence=10;Name=" << newPos << "_ins" << refGenome.seq[pos] << ";reference=.;length=1;coverage=0;variantseq=" << refGenome.seq[newPos] << endl;
//ref000001	.	deletion	20223	20223	0.00	.	.	reference=T;length=1;confidence=0;coverage=0;Name=20222delT
        }
			}
			else {
				newSequence.seq[newPos] = refGenome.seq[pos];
				newPos++;
			}
		}
		stringstream titlestrm;
		titlestrm << " mutated ins " << insRate << " del " << delRate << " mut " << mutRate;
		newSequence.CopyTitle(refGenome.title);
		newSequence.AppendToTitle(titlestrm.str());
		newSequence.PrintSeq(mutGenomeOut);
    newSequence.Free();
		readIndex++;
	}
}
コード例 #16
0
ファイル: WordCounter.cpp プロジェクト: csuliweilong/blasr
int main(int argc, char* argv[]) {
  FASTAReader reader;
  if (argc < 5) {
	cout << "usage: wordCounter seqFile tupleSize tupleOutputFile posOutputFile" << endl;
	exit(1);
  }

  string fileName = argv[1];
  int    tupleSize = atoi(argv[2]);
  string tupleListName = argv[3];
	string posOutName    = argv[4];
  
	TupleMetrics tm;
  tm.Initialize(tupleSize);
  reader.Init(fileName);

  FASTASequence seq;
  reader.GetNext(seq);

  vector<CountedDNATuple> tupleList;
  CountedDNATuple tuple;
  DNALength i;
  for (i = 0; i < seq.length - tm.tupleSize + 1; i++ ) {
		if (tuple.FromStringRL((Nucleotide*) (seq.seq + i), tm)) {
			tuple.count = i;
			tupleList.push_back(tuple);
		}
  }

  std::sort(tupleList.begin(), tupleList.end());

  int t;
  int t2;
  int numTuples = tupleList.size();
  t = t2 = 0;
  int numUnique = 0;
  while (t < numTuples) {
	t2 = t;
	t2++;
	while (t2 < numTuples and tupleList[t] == tupleList[t2]) {
	  t2++;
	}
	++numUnique;
	t = t2;
  }

  ofstream countedTupleListOut;
  countedTupleListOut.open(tupleListName.c_str(), ios_base::binary);

	ofstream posOut;
	posOut.open(posOutName.c_str(), ios_base::binary);

  countedTupleListOut.write((const char*) &numUnique, sizeof(int));
  countedTupleListOut.write((const char*) &tm.tupleSize, sizeof(int));

  posOut.write((const char*) &numUnique, sizeof(int));

	//
	// Write out the tuple+counts to a file.
	//
  t = t2 = 0;
  CountedDNATuple countedTuple;
	int numMultOne = 0;
  while (t < numTuples) {
		t2 = t;
		t2++;
		while (t2 < numTuples and tupleList[t] == tupleList[t2]) {
			t2++;
		}
		countedTuple.tuple = tupleList[t].tuple;
		countedTuple.count = t2 - t;
		if (countedTuple.count == 1) ++numMultOne;
		countedTupleListOut.write((const char*) &countedTuple,sizeof(CountedDNATuple));
		
		posOut.write((char*)&countedTuple.count, sizeof(int));
		
		int tc;
		for (tc = t; tc < t2; tc++) {
			posOut.write((char*) &tupleList[tc].count, sizeof(int));
		}
		t = t2;
  }

	//
	// Write out the positions of the tuples to a file.
	//
	
	posOut.close();
	countedTupleListOut.close();

	//  cout << "found " << numUnique << " distinct " << DNATuple::TupleSize << "-mers." << endl;
	cout << numMultOne << endl;
  return 0;
}
コード例 #17
0
int main(int argc, char* argv[]) {

	string cmpFileName;
	string refFileName;
	string readsFileName;
  string mapqvTrackName;
	if (argc < 2) {
		cout << "  printMapqvTrack: print a gff file of the average mapping quality value" << endl;
		exit(1);
	}
	vector<int> refPositions;
	cmpFileName = argv[1];
	refFileName = argv[2];
  mapqvFileName = argv[3];

	CmpFile cmpFile;
	FASTASequence ref;
	FASTAReader reader;

	reader.Initialize(refFileName);
	reader.GetNext(ref);

	HDFBasReader basReader;

	SMRTSequence seq, *seqPtr;

	vector<int> refCoverage;
	refCoverage.resize(ref.length);
	std::fill(refCoverage.begin(), refCoverage.end(), 0);
	/*
	 * These guys pull information from the same pls file.
	 */
	HDFCmpReader<CmpAlignment> cmpReader;


	if (cmpReader.Initialize(cmpFileName) == 0) {
		cout << "ERROR, could not open the cmp file." << endl;
		exit(1);
	}
	
	
	cmpReader.Read(cmpFile);
	UInt alignmentIndex;

	//	movieIndexSets.resize(nMovies);
	for (alignmentIndex = 0; alignmentIndex < cmpFile.alnInfo.alignments.size(); alignmentIndex++) {
		int refSeqId    = cmpFile.alnInfo.alignments[alignmentIndex].GetRefSeqId();
		int readGroupId = cmpFile.alnInfo.alignments[alignmentIndex].GetReadGroupId();
		int refSeqIdIndex;
		if (cmpFile.refSeqTable.GetIndexOfId(refSeqId, refSeqIdIndex) == false) {
			//
			// Sanity check -- we're only looking at alignments to references in the cmp file.
			//
			cout << "ERROR, ref seq id: " << refSeqId << " should exist in the cmp file but it does not." << endl;
			assert(0);
		}

		int readGroupIdIndex;
		cmpFile.readGroupTable.GetIndexOfId(readGroupId, readGroupIdIndex);
		
		string readGroupPath    = cmpFile.readGroupTable.names[readGroupIdIndex];
		string readGroup        = cmpReader.readGroupPathToReadGroup[readGroupPath];
		int readGroupArrayIndex = cmpReader.refAlignGroups[refSeqIdIndex]->experimentNameToIndex[readGroup];
		vector<char> alignedSequence, alignedTarget;

		//
		// This read overlaps one of the ref positions.
		
		UInt offsetEnd, offsetBegin;
				
		offsetEnd   = cmpFile.alnInfo.alignments[alignmentIndex].GetOffsetEnd();
		offsetBegin = cmpFile.alnInfo.alignments[alignmentIndex].GetOffsetBegin();
		vector<unsigned char> byteAlignment;
		int alignedSequenceLength = offsetEnd - offsetBegin;
		if (alignedSequenceLength >= 0) {
			alignedSequence.resize(alignedSequenceLength);
			alignedTarget.resize(alignedSequenceLength);
			byteAlignment.resize(alignedSequenceLength);
		}

		cmpReader.refAlignGroups[refSeqIdIndex]->readGroups[readGroupArrayIndex]->alignmentArray.Read(offsetBegin, offsetEnd, &byteAlignment[0]);
		UInt refStart = cmpFile.alnInfo.alignments[alignmentIndex].GetRefStart();
		UInt refEnd   = cmpFile.alnInfo.alignments[alignmentIndex].GetRefEnd();
		UInt readStart= cmpFile.alnInfo.alignments[alignmentIndex].GetQueryStart();
		UInt readEnd  = cmpFile.alnInfo.alignments[alignmentIndex].GetQueryEnd();
		//
		// Read the alignment string.
		//
		if (refSeqIdIndex > 0) continue;


		

		
		//
		// Convert to something we can compare easily.
		//
		alignedSequence[alignedSequence.size()-1]= '\0';
		ByteAlignmentToQueryString(&byteAlignment[0], byteAlignment.size(), &alignedSequence[0]);
		ByteAlignmentToRefString(&byteAlignment[0], byteAlignment.size(), &alignedTarget[0]);
		int gi, i;
		gi = 0;
		int refStrand =  cmpFile.alnInfo.alignments[alignmentIndex].GetRCRefStrand();
		if (refStrand == 1) {
			// revcomp the ref strand
			vector<char> rcAlignedTarget, rcAlignedQuery;
			int t;
			rcAlignedTarget.resize(alignedTarget.size());
			rcAlignedQuery.resize(alignedSequence.size());
			for (t = 0; t < alignedTarget.size(); t++) {
				if (alignedTarget[t] == ' ') {
					rcAlignedTarget[alignedTarget.size() - t - 1] = ' ';
				}
				else {
					rcAlignedTarget[alignedTarget.size() - t - 1] = ReverseComplementNuc[alignedTarget[t]];
				}
				if (alignedSequence[t] == ' '){ 
					rcAlignedQuery[alignedTarget.size()  - t - 1] = ' ';
				}
				else {
					rcAlignedQuery[alignedTarget.size() - t - 1] = ReverseComplementNuc[alignedTarget[t]];
				}
			}
			alignedTarget = rcAlignedTarget;
			alignedSequence = rcAlignedQuery;
		}
		
		int holeNumber = cmpFile.alnInfo.alignments[alignmentIndex].GetHoleNumber();
		int ri = readStart;

		gi = refStart;

		for (i = 0; i < alignedTarget.size(); i++, gi++, ri++ ) {
			while(i < alignedTarget.size() and alignedTarget[i] == ' ') { 
				i++; 
			}
			if (alignedSequence[i] != ' ') {
				refCoverage[gi]++;
			}
		}
	} // end looping over regions

// Now compute the number of gaps.
	UInt pos;
	int numNotCovered = 0;
	for (pos = 0; pos < refCoverage.size(); pos++ ){
		if (refCoverage[pos] < 1) { numNotCovered++;}
	}
	if (numNotCovered > 100) {
		cout << "TOO Many!!!" << endl;
	}
	else {
		for (pos = 0; pos < refCoverage.size(); pos++ ){
			//		cout << refCoverage[pos] << endl;
			if (refCoverage[pos] < 1) {
				int left, right;
				left = right = -1;
				if (pos > 0) { left = refCoverage[pos-1];}
				if (pos < refCoverage.size()-1) {right = refCoverage[pos+1];}
				cout << pos << " " << left << " " << right << endl;
			}
		}
	}

}
コード例 #18
0
int main(int argc, char* argv[]) {
	string rgInName, rgOutName;
	int minPathLength;
	string vertexSequenceFileName;
	if (argc < 5) {
		cout << "usage: trimShortEnds in.rg  vertexSequences minPathLength out.rg" << endl;
		exit(1);
	}

	rgInName      = argv[1];
	vertexSequenceFileName = argv[2];
	minPathLength = atoi(argv[3]);
	rgOutName     = argv[4];

	ofstream rgOut;
	CrucialOpen(rgOutName, rgOut, std::ios::out);
	FASTAReader vertexSequenceReader;
	vertexSequenceReader.Init(vertexSequenceFileName);

	RepeatGraph<string> rg;
	vector<FASTASequence> vertexSequences;
	rg.ReadGraph(rgInName);
	vertexSequenceReader.ReadAllSequences(vertexSequences);

	VectorIndex vertexIndex;
	VectorIndex outEdgeIndex;
	VectorIndex edgeIndex;
	
	if (rg.edges.size() == 0) {
		cout << "LIKELY INVALID GRAPH. There are no edges." << endl;
		return 0;
	}
	//
	// At first, any edge that exists is connected to a vertex. This
	// will change as low coverage edges are deleted and replaced by
	// high coverage edges from the end of the array.
	//
	for (edgeIndex = 0; edgeIndex < rg.edges.size(); edgeIndex++) {
		rg.edges[edgeIndex].connected = true;
	}
	set<std::pair<VectorIndex, VectorIndex> > srcDestToRemove;
	
	for (vertexIndex = 0; vertexIndex < rg.vertices.size(); vertexIndex++) {
		if (rg.vertices[vertexIndex].inEdges.size() == 0 and
				rg.vertices[vertexIndex].outEdges.size() == 1) {
			//
			// This is a source.  Traverse this until a branching vertex or the end is found.
			//
			vector<VectorIndex> path;
			path.push_back(vertexIndex);
			int pathLength = 0;
			VectorIndex pathVertex;
			VectorIndex pathEdge;
			pathEdge = rg.vertices[vertexIndex].outEdges[0];
			pathVertex = rg.edges[pathEdge].dest;
			while (rg.vertices[pathVertex].inEdges.size() == 1 and
						 rg.vertices[pathVertex].outEdges.size() == 1) {
				path.push_back(pathVertex);
				pathEdge   =  rg.vertices[pathVertex].outEdges[0];
				pathVertex =  rg.edges[pathEdge].dest;
				pathLength += vertexSequences[pathVertex/2].length;
			}
			pathLength += vertexSequences[pathVertex/2].length;
			path.push_back(pathVertex);
			if (pathLength < minPathLength and path.size() < 3) {
				//
				// Remove this path, it is too short.
				// Also remove the complement.
				//
				cout << "trimming path of " << path.size() << " is of sequence length " << pathLength << endl;

				VectorIndex pathIndex;
				for (pathIndex = 0; pathIndex < path.size() - 1; pathIndex++) {
					srcDestToRemove.insert(pair<VectorIndex, VectorIndex>(path[pathIndex], path[pathIndex+1]));
					srcDestToRemove.insert(pair<VectorIndex, VectorIndex>(2*(path[pathIndex+1]/2) + !(path[pathIndex+1]%2),
																																2*(path[pathIndex]/2) + !(path[pathIndex]%2)));
				}
			}
		}
	}

	MarkEdgePairsForRemoval(srcDestToRemove, rg.vertices, rg.edges);
	RemoveUnconnectedEdges(rg.vertices, rg.edges);

	rg.WriteGraph(rgOut);
	return 0;
}
コード例 #19
0
int main(int argc, char* argv[1]) {
	if (argc < 3) {
		cout << "Usage: findUnique genome.fasta query.fasta effective_k [options]" << endl;
		cout << "  genome.fasta.sa must exist." << endl;
		cout << "  Finds sequences at least effective_k in length that are unique." << endl;
		cout << "  -max m       Allow up to m matches" << endl;
		cout << "  -minLength l Ensure the length of the match is at least this." << endl;
		cout << "  -prefix p n  Allow up to n matches across a prefix of length p" << endl;
		cout << "  -suffix s n  Allow up to n matches across a suffix of length s" << endl;
		cout << "               Prefix and suffix options override max." << endl;
		cout << "  -out file    Print queries to this output file (query.fasta.queries)" << endl;
		exit(0);
	}

	DNASuffixArray sarray;
	
	string genomeFileName = argv[1];
	string suffixArrayFileName = genomeFileName + ".sa";
	
	FASTAReader reader;
	FASTASequence genome;

	int maxN = 0;

	int prefix = 0;
	int suffix = 0;
	int prefixN = 0;
	int suffixN = 0;
	int argi = 4;
	string outputFileName = "";
	int minLength = 0;
	while (argi < argc) {
		if (strcmp(argv[argi], "-max") == 0) {
			++argi;
			maxN = atoi(argv[argi]);
		}
		else if (strcmp(argv[argi], "-prefix") == 0) {
			++argi;
			prefix = atoi(argv[argi]);
			++argi;
			prefixN = atoi(argv[argi]);
		}
		else if (strcmp(argv[argi], "-suffix") == 0) {
			++argi;
			suffix = atoi(argv[argi]);
			++argi;
			suffixN = atoi(argv[argi]);
		}
		else if (strcmp(argv[argi], "-out") == 0) {
			++argi;
			outputFileName = argv[argi];
		}
		else if (strcmp(argv[argi], "-minLength") == 0) {
			++argi;
			minLength = atoi(argv[argi]);
		}
		++argi;
	}

	reader.Initialize(genomeFileName);
	reader.ReadAllSequencesIntoOne(genome);
	sarray.Read(suffixArrayFileName);

	FASTAReader queryReader;
	FASTASequence querySequence;
	string queryFileName = argv[2];
	int maxLength = atoi(argv[3]);
	string summaryTableFileName = queryFileName + ".summary";
	if (outputFileName == "") {
		outputFileName = queryFileName + ".queries";
	}
		
	
	ofstream summaryTable(summaryTableFileName.c_str());
	ofstream outputFile(outputFileName.c_str());

	queryReader.Initialize(queryFileName);

	while (queryReader.GetNext(querySequence)) {
		int i;
		cerr << "searching " << querySequence.title << endl;
		if (querySequence.length < maxLength) {
			continue;
		}

		int nMatches = 0;
		querySequence.ToUpper();
		int localMax;
		for (i = 0; i < querySequence.length - maxLength + 1; i++) {
			if ((i + 1) % 100000 == 0) {
				cerr << "processed: " << i + 1 << endl;
			}

			int lcpLength;
			vector<SAIndex> lcpLeftBounds, lcpRightBounds;
			vector<SAIndex> rclcpLeftBounds, rclcpRightBounds;
			localMax = maxN;
			if (i < prefix) {
				localMax = prefixN;
			}
			if (i >= querySequence.length - suffix) {
				localMax = suffixN;
			}
			if (querySequence.length - i <= maxLength) {
				continue;
			}
			if (querySequence.seq[i] == 'N') {
				continue;
			}
			lcpLength = sarray.StoreLCPBounds(genome.seq, genome.length, // The string which the suffix array is built on.
																				&querySequence.seq[i], querySequence.length-i,
																				true,
																				maxLength,
																				lcpLeftBounds, lcpRightBounds,
																				false);
			if (lcpLength < minLength) {
				continue;
			}
			if (lcpLength < maxLength or 
					lcpRightBounds.size() == 0 or 
					(lcpRightBounds.size() > 0 and 
					 lcpLeftBounds.size() > 0 and  
					 lcpRightBounds[lcpRightBounds.size() - 1] - lcpLeftBounds[lcpLeftBounds.size()-1] <= localMax)) {

				FASTASequence rc;
				DNASequence subseq;
				subseq.ReferenceSubstring(querySequence, i, maxLength);
				subseq.MakeRC(rc);
				int rclcpLength;
				int numForwardMatches;
				if (lcpLength == 0) {
					numForwardMatches = 0;
				}
				else {
					numForwardMatches = lcpRightBounds[lcpRightBounds.size() - 1] - lcpLeftBounds[lcpLeftBounds.size()-1];
				}
				rclcpLength = sarray.StoreLCPBounds(genome.seq, genome.length, // The string which the suffix array is built on.
																						rc.seq, maxLength,
																						true,
																						rclcpLength,
																						rclcpLeftBounds, rclcpRightBounds,
																						false);

				string rcstr((const char*)rc.seq, rc.length);

				if (rclcpLength < maxLength or 
						rclcpRightBounds.size() == 0 or
						(numForwardMatches + 
						 rclcpRightBounds[rclcpRightBounds.size() - 1] -
						 rclcpLeftBounds[rclcpLeftBounds.size()-1] <= localMax)) 
					{
						char* substr = new char[maxLength+1];
						substr[maxLength] = '\0';
						memcpy(substr, &querySequence.seq[i], maxLength);

						//						string substr = string((const char*) querySequence.seq, i, maxLength);
						
						outputFile << querySequence.title << "\t" << substr << "\t" << i << endl;

						++nMatches;
						delete[] substr;
						//					}
					}
				rc.Free();
			}

		}
		summaryTable << querySequence.title << "\t" << nMatches << endl;
		querySequence.Free();
	}
	outputFile.close();
	genome.Free();
}
コード例 #20
0
ファイル: SamToCmpH5.cpp プロジェクト: macmanes/blasr
int main(int argc, char* argv[]) {
  string program = "samtoh5";
  string versionString = VERSION;
  AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString);
  string samFileName, cmpFileName, refFileName;
  bool parseSmrtTitle = false;
  bool useShortRefName = false;
  CommandLineParser clp;
  string readType = "standard";
  int verbosity = 0;

  clp.SetProgramName(program);
  clp.SetProgramSummary("Converts in.sam file to out.cmp.h5 file.");
  clp.SetVersion(versionString);

  clp.RegisterStringOption("in.sam", &samFileName, 
                           "Input SAM file.", true);
  clp.RegisterStringOption("reference.fasta", &refFileName, 
                           "Reference used to generate reads.", true);
  clp.RegisterStringOption("out.cmp.h5", &cmpFileName, 
                           "Output cmp.h5 file.", true);
  clp.RegisterPreviousFlagsAsHidden();
  clp.RegisterFlagOption("smrtTitle", &parseSmrtTitle, 
                         "Use this option when converting alignments "
                         "generated from reads produced by the "
                         "pls2fasta from bas.h5 files by parsing read "
                         "coordinates from the SMRT read title.  The title " 
                         "is in the format /name/hole/coordinates, where "
                         "coordinates are in the format \\d+_\\d+, and "
                         "represent the interval of the read that was "
                         "aligned.");
  clp.RegisterStringOption("readType", &readType, 
                         "Set the read type: 'standard', 'strobe', 'CCS', "
                         "or 'cDNA'");
  clp.RegisterIntOption("verbosity", &verbosity, 
                         "Set desired verbosity.", 
                         CommandLineParser::PositiveInteger);
  clp.RegisterFlagOption("useShortRefName", &useShortRefName, 
                         "Use abbreviated reference names obtained "
                         "from file.sam instead of using full names "
                         "from reference.fasta.");
  string description = ("Because SAM has optional tags that have different "
    "meanings in different programs, careful usage is required in order to "
    "have proper output. The \"xs\" tag in bwa-sw is used to show the "
    "suboptimal score, but in PacBio SAM (blasr) it is defined as the start "
    "in the query sequence of the alignment.\nWhen \"-smrtTitle\" is "
    "specified, the xs tag is ignored, but when it is not specified, the "
    "coordinates given by the xs and xe tags are used to define the interval "
    "of a read that is aligned. The CIGAR string is relative to this interval.");
  clp.SetExamples(description);

  clp.ParseCommandLine(argc, argv);

  if (readType != "standard" and readType != "strobe" and 
      readType != "cDNA" and readType != "CCS") {
    cout << "ERROR. Read type '" << readType 
         << "' must be one of either 'standard', 'strobe', 'cDNA' or 'CCS'." 
         << endl;
    exit(1);
  }
    
  cerr << "[INFO] " << GetTimestamp() << " [" << program << "] started." << endl;

  SAMReader<SAMFullReferenceSequence, SAMReadGroup, SAMPosAlignment> samReader;
  FASTAReader fastaReader;
  HDFCmpFile<AlignmentCandidate<FASTASequence, FASTASequence> > cmpFile;

  //
  // Initialize input/output files.
  //
  samReader.Initialize(samFileName);
  fastaReader.Initialize(refFileName);
  cmpFile.Create(cmpFileName);

  //
  // Configure the file log.
  //
  string command;
  CommandLineParser::CommandLineToString(argc, argv, command);
  string log = "Convert sam to cmp.h5";
  cmpFile.fileLogGroup.AddEntry(command, log, program, GetTimestamp(), versionString);

  //
  // Set the readType
  //
  cmpFile.SetReadType(readType);

  //
  // Read necessary input.
  //

  vector<FASTASequence> references;
  fastaReader.ReadAllSequences(references);
  
  //
  // This should probably be handled by the alignmentSetAdapter, but
  // time constraints...
  //
  AlignmentSet<SAMFullReferenceSequence, SAMReadGroup, SAMPosAlignment> alignmentSet;
  samReader.ReadHeader(alignmentSet);
 
  //
  // The order of references in vector<FASTASequence> references and
  // AlignmentSet<, , >alignmentSet.references can be different.
  // Rearrange alignmentSet.references such that it is ordered in
  // exactly the same way as vector<FASTASequence> references.
  //
  alignmentSet.RearrangeReferences(references);

  //
  // Always recompute the MD5 values even if they exist in the input
  // sam file. Because MD5 is defined differently in sam and cmp.h5 files.
  // The SAM convention uppercases and normalizes before computing the MD5. 
  // For cmp.h5, we compute the MD5 on the sequence 'as is'.
  // 
  for(int i = 0; i < alignmentSet.references.size(); i++) {
      MakeMD5((const char*)&references[i].seq[0], 
              (unsigned int)references[i].length, alignmentSet.references[i].md5);
  }
 
  //
  // Map short names for references obtained from file.sam to full names obtained from reference.fasta
  //
  map<string, string> shortRefNameToFull;
  map<string, string>::iterator it;
  assert(references.size() == alignmentSet.references.size());
  if (!useShortRefName) {
      for (int i = 0; i < references.size(); i++) {
          string shortRefName = alignmentSet.references[i].GetSequenceName();
          string fullRefName(references[i].title); 
          if (shortRefNameToFull.find(shortRefName) != shortRefNameToFull.end()) {
              cout << "ERROR, Found more than one reference " << shortRefName << "in sam header" << endl;
              exit(1);
          } 
          shortRefNameToFull[shortRefName] = fullRefName;
          alignmentSet.references[i].sequenceName = fullRefName;
      }
  }

  //
  // Start setting up the cmp.h5 file.
  //
  AlignmentSetToCmpH5Adapter<HDFCmpFile<AlignmentCandidate<FASTASequence, FASTASequence> > > alignmentSetAdapter;
  alignmentSetAdapter.Initialize();
  alignmentSetAdapter.StoreReferenceInfo(alignmentSet.references, cmpFile);
  
  //
  // Store the alignments.
  //
  SAMAlignment samAlignment;
  int alignIndex = 0;
  while (samReader.GetNextAlignment(samAlignment)) {
    if (samAlignment.rName == "*") {
      continue;
    }
    if (!useShortRefName) {
        //convert shortRefName to fullRefName
        it = shortRefNameToFull.find(samAlignment.rName);
        if (it == shortRefNameToFull.end()) {
            cout << "ERROR, Could not find " << samAlignment.rName << " in the reference repository." << endl;
            exit(1);
        }
        samAlignment.rName = (*it).second;
    }
    vector<AlignmentCandidate<> > convertedAlignments;
    if (verbosity > 0) {
      cout << "Storing alignment for " << samAlignment.qName << endl;
    }
    SAMAlignmentsToCandidates(samAlignment, 
                              references, alignmentSetAdapter.refNameToIndex,
                              convertedAlignments, parseSmrtTitle, false);

    alignmentSetAdapter.StoreAlignmentCandidateList(convertedAlignments, cmpFile, alignIndex);
    int a;
    for (a = 0; a < convertedAlignments.size(); a++) {
      convertedAlignments[a].FreeSubsequences();
    }
    ++alignIndex;
    /*    if (alignIndex == 100) {
      return 0;
      }*/
  }

  cerr << "[INFO] " << GetTimestamp() << " [" << program << "] ended." << endl;
  return 0;
}
コード例 #21
0
int main(int argc, char* argv[]) {
  string gencodeGffFileName, genomeFileName, genesOutFileName;
  string geneType = "protein_coding";
  bool randomSplicing = false;
  int numRandomSplicing = 1;
  float pSkip = 0.5;
  if (argc < 4) {
    cout << "Usage: extractGenes gencodeGTFFile genomeFile genesOutFileName [-geneType type (protein_coding)] [-randomSplicing] [-numRandomSplicing n] [-pSkip prob (0-1, default:0.5)]" << endl;
    exit(1);
  }

  gencodeGffFileName = argv[1];
  genomeFileName     = argv[2];
  genesOutFileName   = argv[3];

  int argi = 4;
  string coordinatesFileName;

  while (argi < argc) {
    if (strcmp(argv[argi], "-geneType") == 0) {
      geneType = argv[++argi];
    }
    else if (strcmp(argv[argi], "-randomSplicing") == 0) {
      randomSplicing = true;
    }
    else if (strcmp(argv[argi], "-numRandomSplicing") == 0) {
      numRandomSplicing = atoi(argv[++argi]);
    }
    else if (strcmp(argv[argi], "-pSkip") == 0) {
      pSkip = atof(argv[++argi]);
    }
    else {
      cout << "ERROR, bad option  " << argv[argi] << endl;
      exit(1);
    }
    ++argi;
  }

  coordinatesFileName = genesOutFileName;
  coordinatesFileName.append(".pos");
  FASTAReader reader;
  reader.Initialize(genomeFileName);

  ofstream outFile, coordsFile;
  CrucialOpen(genesOutFileName, outFile, std::ios::out);

  string coordsFileName = genesOutFileName + ".coords";
  CrucialOpen(coordsFileName, coordsFile, std::ios::out);

  vector<FASTASequence> referenceSequences;
  reader.ReadAllSequences(referenceSequences);
  int i;
  map<string, int> titleToIndex;
  for (i = 0; i < referenceSequences.size(); i++) {
    titleToIndex[referenceSequences[i].title] = i;
  }

  GencodeGFFFile gencodeFile;
  gencodeFile.ReadAll(gencodeGffFileName);
  
  vector<GencodeGFFGene> genes;
  IndexGencodeGenes(gencodeFile, genes, geneType);

  for (i = 0; i < genes.size(); i++) {
    genes[i].OrderExonsByStart();
  }

  int e;
  for (i = 0; i < genes.size(); i++) {
    FASTASequence geneSequence;
    geneSequence.CopyTitle(genes[i].geneName);
    if (titleToIndex.find(genes[i].chromosome) == titleToIndex.end()) {
      continue;
    }
    int chrIndex = titleToIndex[genes[i].chromosome];
    string sequence = "";
    //
    // Do nothing with 0 length exons.
    //
    if (genes[i].exons.size() == 0) {
      continue;
    }
    vector<FASTASequence> geneSequences;
    vector<GeneCoordinates> geneCoordinates;
    genes[i].GenerateGeneSequences(referenceSequences[chrIndex], geneSequences, geneCoordinates, randomSplicing);
    int gi;
    for (gi = 0; gi < geneSequences.size(); gi++) {
      if (genes[i].GetStrand() == '+') {
        geneSequences[gi].PrintSeq(outFile);
      }
      else {
        FASTASequence rc;
        geneSequences[gi].MakeRC(rc);
        rc.PrintSeq(outFile);
        rc.Free();
      }
      coordsFile << geneSequences[gi].title << " " << geneCoordinates[gi].chromosome << " " << geneCoordinates[gi].exonCoordinates.size() << " " << geneCoordinates[gi].strand;
      int i;
      for (i = 0; i < geneCoordinates[gi].exonCoordinates.size(); i++) {
        coordsFile << " " 
                   << geneCoordinates[gi].exonCoordinates[i].start << " "  
                   << geneCoordinates[gi].exonCoordinates[i].end << " ";
      }
      coordsFile << endl;
      geneSequences[gi].Free();
    }
    // 
    // No need to free the seq, since it is controlled by the string.
    //
  }
  coordsFile.close();
  
}
コード例 #22
0
ファイル: PrintWordCount.cpp プロジェクト: EichlerLab/blasr
int main(int argc, char* argv[]) {
	string genomeFileName;
	string suffixArrayFileName;
	if (argc < 4) {
		cout << "Usage: printWordCount genome suffixArray k [k2 k3 k4...]" << endl;
		exit(1);
	}
	genomeFileName = argv[1];
	suffixArrayFileName = argv[2];
	int argi = 3;
	vector<DNALength> k;
	while (argi < argc) {
		k.push_back(atoi(argv[argi]));
		argi++;
	}

	// Get the ref sequence.
	FASTAReader reader;
	reader.Init(genomeFileName);
	FASTASequence seq;
  //	reader.GetNext(seq);
  reader.ReadAllSequencesIntoOne(seq);
	seq.ToUpper();
	// Get the suffix array.
	DNASuffixArray sarray;
	sarray.Read(suffixArrayFileName);
	
	int ki;
  char *word;
  cout << "wordlen word nword" << endl;
	for (ki = 0; ki < k.size(); ki++) {
    word = new char[k[ki]+1];
    word[k[ki]] = '\0';
		DNALength i;
		DNALength numUnique = 0;
		for (i = 0; i < seq.length - k[ki] - 1; ) {
			DNALength j = i + 1;
      bool seqAtN = false;
      int si;
      for(si = 0; si < k[ki]; si++) {
        if (seq.seq[sarray.index[i] + si] == 'N') {
          seqAtN = true;
          break;
        }
      }
      if (seqAtN) {
        i++;
        continue;
      }
			while (j < seq.length - k[ki] and 
						 seq.length - sarray.index[i] >= k[ki] and
						 seq.length - sarray.index[j] >= k[ki] and 
						 strncmp((const char*) &seq.seq[sarray.index[i]], (const char*) &seq.seq[sarray.index[j]], k[ki]) == 0) {
				j++;
			}
      if (seq.length - sarray.index[i] >= k[ki]) {
        for(si = 0; si < k[ki]; si++) {
          word[si] = seq.seq[sarray.index[i]+si];
        }
        cout << k[ki] << " " << word << " " << j - i + 1 << endl;
        if (j == i + 1) { 
          ++numUnique;
        }
      }
			i = j;
		}
	}
}
コード例 #23
0
int main(int argc, char* argv[]) {
    string inFileName, readsFileName;
    DNALength readLength;
    float coverage = 0;
    bool noRandInit = false;
    int numReads = -1;
    CommandLineParser clp;
    int qualityValue = 20;
    bool printFastq = false;
    int stratify = 0;
    string titleType = "pacbio";
    string fastqType = "illumina"; // or "sanger"
    clp.RegisterStringOption("inFile", &inFileName, "Reference sequence", 0);
    clp.RegisterPreviousFlagsAsHidden();
    clp.RegisterIntOption("readLength", (int*) &readLength, "The length of reads to simulate.  The length is fixed.",
                          CommandLineParser::PositiveInteger, "Length of every read.", 0);
    clp.RegisterFloatOption("coverage", &coverage, "Total coverage (from which the number of reads is calculated",
                            CommandLineParser::PositiveFloat, 0);
    clp.RegisterFlagOption("nonRandInit", &noRandInit, "Skip initializing the random number generator with time.");
    clp.RegisterIntOption("nReads", &numReads, "Total number of reads (from which coverage is calculated)", CommandLineParser::PositiveInteger, 0);
    clp.RegisterStringOption("readsFile", &readsFileName, "Reads output file", 0);
    clp.RegisterFlagOption("fastq", &printFastq, "Fake fastq output with constant quality value (20)");
    clp.RegisterIntOption("quality", &qualityValue, "Value to use for fastq quality", CommandLineParser::PositiveInteger);
    clp.RegisterIntOption("stratify", &stratify, "Sample a read every 'stratify' bases, rather than randomly.", CommandLineParser::PositiveInteger);
    clp.RegisterStringOption("titleType", &titleType, "Set the name of the title: 'pacbio'|'illumina'");
    clp.RegisterStringOption("fastqType", &fastqType, "Set the type of fastq: 'illumina'|'sanger'");
    vector<string> leftovers;
    clp.ParseCommandLine(argc, argv, leftovers);

    if (!noRandInit) {
        InitializeRandomGeneratorWithTime();
    }

    FASTAReader inReader;
    inReader.Init(inFileName);
    vector<FASTASequence> reference;

    inReader.ReadAllSequences(reference);
    ofstream readsFile;
    if (readsFileName == "") {
        cout << "ERROR.  You must specify a reads file." << endl;
        exit(0);
    }
    CrucialOpen(readsFileName, readsFile, std::ios::out);

    ofstream sangerFastqFile;
    if (fastqType == "sanger") {
        string sangerFastqFileName = readsFileName + ".fastq";
        CrucialOpen(sangerFastqFileName, sangerFastqFile, std::ios::out);
    }

    DNALength refLength = 0;
    int i;
    for (i = 0; i < reference.size(); i++) {
        refLength += reference[i].length;
    }
    if (numReads == -1 and coverage == 0 and stratify == 0) {
        cout << "Error, you must specify either coverage, nReads, or stratify." << endl;
        exit(1);
    }
    else if (numReads == -1) {
        numReads = (refLength / readLength) * coverage;
    }

    if (stratify) {
        if (!readLength) {
            cout << "ERROR. If you are using stratification, a read length must be specified." << endl;
            exit(1);
        }
    }

    DNASequence sampleSeq;
    sampleSeq.length = readLength;
    int maxRetry = 10000000;
    int retryNumber = 0;
    DNALength seqIndex, seqPos;
    if (stratify) {
        seqIndex = 0;
        seqPos   = 0;
    }
    DNALength origReadLength = readLength;
    for (i = 0; stratify or i < numReads; i++) {
        if (stratify == 0) {
            FindRandomPos(reference, seqIndex, seqPos, readLength );
        }
        else {
            //
            // find the next start pos, or bail if done
            //
            if (seqPos >= reference[seqIndex].length) {
                if (seqIndex == reference.size() - 1) {
                    break;
                }
                else {
                    seqIndex = seqIndex + 1;
                    seqPos   = 0;
                    continue;
                }
            }
            readLength = min(reference[seqIndex].length - seqPos, origReadLength);
        }
        sampleSeq.seq = &reference[seqIndex].seq[seqPos];
        int j;
        int gappedRead = 0;
        string title;
        stringstream titleStrm;
        if (titleType == "pacbio") {
            titleStrm << i << "|"<< reference[seqIndex].GetName() << "|" << seqPos << "|" << seqPos + readLength;
        }
        else if (titleType == "illumina") {
            titleStrm << "SE_" << i << "_0@" << seqPos << "-"<<seqPos+readLength <<"/1";
        }
        else {
            cout << "ERROR. Bad title type " << titleType << endl;
            exit(0);
        }
        title = titleStrm.str();
        sampleSeq.length = readLength;
        if (!printFastq) {
            readsFile << ">" << title << endl;
            sampleSeq.PrintSeq(readsFile);
        }
        else {
            FASTQSequence fastqSampleSeq;
            fastqSampleSeq.CopyTitle(title);
            fastqSampleSeq.seq = sampleSeq.seq;
            fastqSampleSeq.length = sampleSeq.length;
            fastqSampleSeq.qual.data = new unsigned char[sampleSeq.length];
            fill(fastqSampleSeq.qual.data, fastqSampleSeq.qual.data + sampleSeq.length, qualityValue);
            if (fastqType == "illumina") {
                fastqSampleSeq.PrintFastq(readsFile, fastqSampleSeq.length+1);
            }
            else {
                fastqSampleSeq.PrintSeq(readsFile);
                fastqSampleSeq.PrintQual(sangerFastqFile);
            }
            delete[] fastqSampleSeq.qual.data;
            delete[] fastqSampleSeq.title;
        }

        if (stratify) {
            seqPos += readLength;
        }

    }
    return 0;
}
コード例 #24
0
ファイル: PrintScaffolds.cpp プロジェクト: jcombs1/blasr
int main(int argc, char* argv[]) {
	
	if (argc < 4) {
		PrintUsage();
		exit(0);
	}

	string rgFileName, vertexSeqFileName, scaffoldDirName;
	
	rgFileName         = argv[1];
	vertexSeqFileName  = argv[2];
	scaffoldDirName    = argv[3];

	string repeatFileName = "";
	bool printRepeatsSeparately = false;
	int argi = 4;
	bool printSeparate=false;
	while (argi < argc) {
		if (strcmp(argv[argi], "-separate") == 0) {
			printSeparate=true;
		}
		else if (strcmp(argv[argi], "-repeats") == 0) {
			printRepeatsSeparately = true;
			repeatFileName = argv[++argi];
		}
		else {
			cout << "bad option: " << argv[argi] << endl;
			PrintUsage();
			exit(1);
		}
		++argi;
	}
	
	FASTAReader vertexSequenceReader;
	vertexSequenceReader.Init(vertexSeqFileName);

	//
	// Input necessary data
	//
	vector<FASTASequence> vertexSequences;
	vertexSequenceReader.ReadAllSequences(vertexSequences);
	RepeatGraph<string> rg;
	rg.ReadGraph(rgFileName);

	vector<FASTASequence> vertexRCSequences;
	VectorIndex vertexIndex;	
	vertexRCSequences.resize(vertexSequences.size());
	for (vertexIndex = 0; vertexIndex < vertexSequences.size(); vertexIndex++ ){
		vertexSequences[vertexIndex].MakeRC(vertexRCSequences[vertexIndex]);
	}
	
	VectorIndex outEdgeIndex;
	int scaffoldIndex = 0;
	ofstream scaffoldOut;

	if (printSeparate==false) {
		// scaffold dir name is really a file name here.
		CrucialOpen(scaffoldDirName, scaffoldOut, std::ios::out);
	}
	for (vertexIndex = 0; vertexIndex < rg.vertices.size(); vertexIndex++ ){
		rg.vertices[vertexIndex].traversed = false;
	}

	//
	// Set up flow for calling multiplicity.
	//
	/*
		Test all this out later.
	AssignMinimumFlowToEdges(rg, 2);
	AssignVertexFlowBalance(rg);
	BalanceKirchhoffFlow(rg);

	UInt edgeIndex;
	for (edgeIndex = 0; edgeIndex < rg.edges.size(); edgeIndex++) {
		if (rg.edges[edgeIndex].flow > 1) {
			cout << edgeIndex << " " << rg.edges[edgeIndex].flow << endl;
		}
	}
	*/

	int numPrintedVertices = 0;
	for (vertexIndex = 0; vertexIndex < rg.vertices.size(); vertexIndex++ ){
		//
		// Look to see if this vertex is a branching vertex.
		//
		if ((rg.vertices[vertexIndex].inEdges.size() != 1 or
				 rg.vertices[vertexIndex].outEdges.size() != 1) and
				rg.vertices[vertexIndex].traversed == false) {

			//
			// This is a branching vertex.  Print all paths from this vertex, but not the vertex
			// itself if it appears repetitive. 
			//
			VectorIndex outEdgeIndex;
			bool printedThisVertex = false;
			for (outEdgeIndex = 0; outEdgeIndex < rg.vertices[vertexIndex].outEdges.size(); outEdgeIndex++ ){
				//
				// This is a branching vertex.
				// 

				VectorIndex pathIndex;
				stringstream scaffoldFileNameStrm;
				cout << " printing scaffold: " << scaffoldIndex << endl;
				if (printSeparate) {
					scaffoldFileNameStrm << scaffoldDirName << "/" << scaffoldIndex << ".fasta";
					string scaffoldFileName = scaffoldFileNameStrm.str();
					CrucialOpen(scaffoldFileName, scaffoldOut, std::ios::out);
				}			
				++scaffoldIndex;

				//
				// Store the nonbranching path in a list so that it may be quickly processed.
				//
				bool pathIsPrinted = false;
				vector<VectorIndex> path;
				if (rg.vertices[vertexIndex].InDegree() == 0 and rg.vertices[vertexIndex].OutDegree() == 1) {
					path.push_back(vertexIndex);
				}
				VectorIndex pathVertex = rg.edges[rg.vertices[vertexIndex].outEdges[outEdgeIndex]].dest;				
				while(rg.vertices[pathVertex].inEdges.size() == 1 and
							rg.vertices[pathVertex].outEdges.size() == 1) {
					if (rg.vertices[pathVertex].traversed == true) {
						pathIsPrinted = true;
						break;
					}
					path.push_back(pathVertex);
					// Mark the forward and reverse complement as traversed.
					pathVertex = rg.edges[rg.vertices[pathVertex].outEdges[0]].dest;
					//
				}
				//
				// Look to see if this is the end of a simple path, if so, add it to the scaffold.
				//
				pathVertex = rg.edges[rg.vertices[vertexIndex].outEdges[outEdgeIndex]].dest;				
				if (rg.vertices[pathVertex].OutDegree() == 0 and rg.vertices[pathVertex].InDegree() == 1) {
					path.push_back(pathVertex);
				}
				//
				// Determine the sequences in the scaffold and the total scaffold length.
				//
				if (pathIsPrinted == false) {
					VectorIndex p;
					DNALength scaffoldLength = 0;
					for (p = 0; p < path.size(); p++ ){
						scaffoldLength += vertexSequences[path[p]/2].length;
						rg.vertices[path[p]].traversed = true;
						//					rg.vertices[2*(path[p]/2)+ !(path[p]%2)].traversed = true;
						++numPrintedVertices;
					}
					cout << "path is of size " << path.size() << " length " << scaffoldLength << endl;
					if (!printSeparate) {
						scaffoldOut << ">" << scaffoldIndex << " " << path.size() << " " << scaffoldLength << endl;
					}
					for (p = 0; p < path.size(); p++) {
						if (printSeparate) {
							scaffoldOut << ">" << p << " " << path[p]/2 << " " << vertexSequences[path[p]/2].length << endl;
						}
						if (path[p]%2 == 0) {
							((DNASequence)vertexSequences[path[p]/2]).PrintSeq(scaffoldOut);
						}
						else {
							((DNASequence)vertexRCSequences[path[p]/2]).PrintSeq(scaffoldOut);
						}
						rg.vertices[path[p]].traversed = true;
						rg.vertices[2*(path[p]/2) + !(path[p]%2)].traversed = true;
					}
					if (printSeparate) {
						scaffoldOut.close();
						scaffoldOut.clear();
					}
				}
			}
		}
	}

	ofstream* outPtr;
	ofstream repeatOut;
	if (printRepeatsSeparately) {
		CrucialOpen(repeatFileName, repeatOut, std::ios::out);
		outPtr = &repeatOut;
	}
	else {
		outPtr = &scaffoldOut;
	}

	for (vertexIndex = 0; vertexIndex < rg.vertices.size(); vertexIndex++ ){
		if (rg.vertices[vertexIndex].traversed == false) {
			//
			// Print this vertex sequence only.  It is repetitive, or isolated.
			//
			*outPtr << ">" << scaffoldIndex << endl;
			++scaffoldIndex;
			if (vertexIndex%2 == 0) {
				((DNASequence)vertexSequences[vertexIndex/2]).PrintSeq(*outPtr);
			}
			else {
				((DNASequence)vertexRCSequences[vertexIndex/2]).PrintSeq(*outPtr);
			}
			rg.vertices[vertexIndex].traversed = true;
			rg.vertices[2*(vertexIndex/2)+ !(vertexIndex%2)].traversed = true;
		}
	}	

	cout << "printed: " << numPrintedVertices << " of " << rg.vertices.size() << endl;
}
コード例 #25
0
ファイル: RemoveAdapters.cpp プロジェクト: csuliweilong/blasr
int main(int argc, char* argv[]) {
	string ad1File, ad2File, readsFile, readsOutFile;

	FASTAReader ad1Reader;
	FASTAReader ad2Reader;
	FASTAReader reader;


	CommandLineParser cl;
	float minPctSimilarity = 0.60;
	int indel = 3;
	int minLength = 10;
	cl.RegisterStringOption("ad1", &ad1File, "FASTA file with the first adapter");
	cl.RegisterStringOption("ad2", &ad2File, "FASTA file with the second adapter");
	cl.RegisterStringOption("reads", &readsFile, "FASTA file with SMRTBell reads");
	cl.RegisterStringOption("readsout", &readsOutFile, "output file for split reads");
	cl.RegisterPreviousFlagsAsHidden();
	cl.RegisterFloatOption("pctSim", &minPctSimilarity, "Minimum percent similarity to trigger a match to an adapter.", 
												 CommandLineParser::PositiveFloat);
	cl.RegisterIntOption("indel", &indel, "Penalty for indel (positive)", CommandLineParser::NonNegativeInteger);
	cl.RegisterIntOption("minLength", &minLength, "Minimum length pass to retain.", CommandLineParser::PositiveInteger);
	vector<string> opts;
	cl.ParseCommandLine(argc, argv, opts);

	/*
	 * Open all the required files, quitting if they are unavailable.
	 */

	ad1Reader.Init(ad1File);
	ad2Reader.Init(ad2File);
	reader.Init(readsFile);

	ofstream splitOut;
	CrucialOpen(readsOutFile, splitOut);

	FASTASequence ad1, ad2;
	ad1Reader.GetNext(ad1);
	ad2Reader.GetNext(ad2);

	FASTASequence read;
	vector<int> scoreMat;
	vector<Arrow> pathMat;
	int readIndex = 0;
	while(reader.GetNext(read)) {
		read.ToUpper();
		//
		// Do a fitting sequence alignment to match one of the two 
		// adapters into the read.
		//
		vector<int> passStarts, passLengths, la;
		read.PrintSeq(cout);
		SplitRead(read, 0, read.length, ad1, ad2,
							indel, 
							passStarts, passLengths,la, 0,
							scoreMat, pathMat, minPctSimilarity, minLength);
		int i;
		for (i = 0; i < passStarts.size(); i++) {
			cout << "read: " << readIndex << " pass: "******" " << passStarts[i] << " " << passLengths[i] << " " << la[i] << endl;
		}
		++readIndex;
	}
}
コード例 #26
0
int main(int argc, char* argv[]) {


	FASTAReader reader;
	FASTASequence read;
	int maxLength = 100;
	if (argc < 3) {
		cout << "usage: pairAlignAllContigs inFile maxLength equivalencies [-minIdent i]" << endl; 
		exit(0);
	}
	string readsFileName, equivalenciesFileName;
	readsFileName = argv[1];
	maxLength = atoi(argv[2]);
	equivalenciesFileName = argv[3];
	int argi = 4;
  float minIdentity = 80;
	while (argi < argc) {
		if (strcmp(argv[argi], "-minIdent") == 0) {
			minIdentity = atoi(argv[++argi]);
		}
		++argi;
	}
	vector<FASTASequence> reads, readsRC;;
	reader.Init(readsFileName); 
  reader.ReadAllSequences(reads);
	readsRC.resize(reads.size());
	int r;
	for (r =0; r < reads.size();r++) {
	  reads[r].MakeRC(readsRC[r]);
  }
	ofstream equivOut;
	CrucialOpen(equivalenciesFileName, equivOut);

	Matrix<int> alignScores;
	Matrix<float> alignIdentities;
	alignScores.Resize(reads.size(), reads.size());
	alignIdentities.Resize(reads.size(), reads.size());
	vector<int> scoreMat;
	vector<Arrow> pathMat;
	int i, j;
	int alignScore;
	FASTASequence readi, readj;
	FASTASequence rcReadi, rcReadj;
	
	for (i = 0; i < reads.size(); i++) {
		float maxFrontIdent, maxEndIdent;
		int   maxFrontIdentIndex, maxEndIdentIndex;
		maxFrontIdent = 0; maxEndIdent = 0;
		maxFrontIdentIndex = 0;
		maxEndIdentIndex   = 0;
		int maxFrontIdentLength = 0;
		int maxEndIdentLength  = 0;
		int maxFrontLength     = 0;
		int	maxEndLength       = 0;
		int nmaxFrontLengthIndex = 0;
		int maxEndLengthIndex  = 0;
		float maxFrontLengthIdent = 0;
		float maxEndLengthIdent = 0;
		int maxFrontLengthIndex = 0;
		equivOut << reads[i].GetName();
		for (j = 0; j < reads.size(); j++ ){
			// 
			// Store the two ends of the alignment.
			//
			alignScore = 0;
			int rcAlignScore;
			Alignment alignment;
			Alignment rcAlignment;
			Alignment *optAlignment;
			if (i != j) {
  	  	if (reads[i].length < maxLength and reads[j].length < maxLength) {
  				alignScore = SWAlign(reads[i], reads[j], SMRTDistanceMatrix, 3, scoreMat, pathMat, alignment, Global);
  			}
				if (reads[i].length < maxLength and reads[j].length < maxLength) {
          rcAlignScore = SWAlign(reads[i], readsRC[j], SMRTDistanceMatrix, 3, scoreMat, pathMat, rcAlignment, Global);
        }	
  			ComputeAlignmentStats(alignment, reads[i].seq, reads[j].seq, SMRTDistanceMatrix, 3,3 );
        ComputeAlignmentStats(rcAlignment, reads[i].seq, readsRC[j].seq, SMRTDistanceMatrix, 3,3 );

  			if (alignment.pctSimilarity > minIdentity or rcAlignment.pctSimilarity > minIdentity) {
  				equivOut << " " << reads[j].GetName();
  			}	
      }
	}
		equivOut << endl;	
	}

	return 0;
}
コード例 #27
0
int main(int argc, char* argv[]) {
  
  CommandLineParser clp;
  string readsFileName;
  string alignmentsFileName;
  string outputFileName;
  float minMergeIdentity = 0.70;
  clp.RegisterStringOption("reads", &readsFileName, "Reads used for alignments.");
  clp.RegisterStringOption("alignments", &alignmentsFileName, "SAM formatted alignments.");
  clp.RegisterIntOption("k", &vertexSize, "Minimum match length", CommandLineParser::PositiveInteger);
  clp.RegisterStringOption("outfile", &outputFileName, "Alignment output.");
  clp.RegisterPreviousFlagsAsHidden();
  clp.RegisterFlagOption("v", &verbose, "");
  clp.RegisterFloatOption("minMergeIdentity", 
                          &minMergeIdentity, 
                          "Minimum identity to merge paths.", CommandLineParser::PositiveFloat);
  
  clp.ParseCommandLine(argc, argv);

  if (minMergeIdentity < 0 or minMergeIdentity > 1) {
    cout << "ERROR. minMergeIdentity must be between 0 and 1" << endl;
    exit(1);
  }
  
  vector<FASTASequence> reads;

  FASTAReader fastaReader;
  fastaReader.Initialize(readsFileName);
  fastaReader.ReadAllSequences(reads);

  //
  // It is necessary to go from read title to index in the list of reads. 
  //
  map<string, int> readNameToIndex;
  BuildReadNameToIndexMap(reads, readNameToIndex);

  ReadWordMatchVector readWordMatches;
  InitializeFromReads(reads, readWordMatches);
  
  //
  // Get ready to read in the alignments.
  //
  SAMReader<SAMFullReferenceSequence, SAMReadGroup, SAMPosAlignment> samReader;
  samReader.Initialize(alignmentsFileName);
  AlignmentSet<SAMFullReferenceSequence, SAMReadGroup, SAMPosAlignment> alignmentSet;
  samReader.ReadHeader(alignmentSet);
  
  SAMAlignment samAlignment;
  AlignmentCandidate<> alignment;
  int numAlignedBases = 0;
  int alignmentIndex = 0;
  while ( samReader.GetNextAlignment( samAlignment ) ) {
    vector<AlignmentCandidate<> > alignments;
    SAMAlignmentsToCandidates(samAlignment,
                              reads,
                              readNameToIndex,
                              alignments, false, true);

    int i;
    ++alignmentIndex;
    int a;
    for (a = 0; a < alignments.size();a++) {
      if (alignments[a].qName != alignments[a].tName) {
        MarkMatches(alignments[a], readNameToIndex, vertexSize, readWordMatches);
      }
    }
    if (alignmentIndex % 1000 == 0) {
      cout << alignmentIndex << endl;
    }
  }


  int numMatches = 0;
  int parentIndex = 1;
  int r;
  for (r = 0; r < readWordMatches.size(); r++) {
    readWordMatches[r].CreateParents();
    numMatches += readWordMatches[r].pos.size();
  }

  vector<int> parentIndices;
  parentIndices.resize(2*numMatches + 1);
  fill(parentIndices.begin(), parentIndices.end(), 0);
  //
  // Start indexing off at 1 so that 0 does not need to be treated in
  // a special case.
  //
  int curParentIndex = 1;
  cout << "There are " << numMatches << " matches." << endl;

  samReader.Close();
  samReader.Initialize(alignmentsFileName);
  AlignmentSet<SAMFullReferenceSequence, SAMReadGroup, SAMPosAlignment> alignmentSet2;
  samReader.ReadHeader(alignmentSet2);
  
  numAlignedBases = 0;
  alignmentIndex = 0;
  while ( samReader.GetNextAlignment( samAlignment ) ) {
    vector<AlignmentCandidate<> > alignments;
    SAMAlignmentsToCandidates(samAlignment,
                              reads,
                              readNameToIndex,
                              alignments, false, true);

    int i;
    ++alignmentIndex;
    int a;
    for (a = 0; a < alignments.size();a++) {
      if (alignments[a].qName != alignments[a].tName) {
        JoinVertices(alignments[a], vertexSize, readNameToIndex, readWordMatches, curParentIndex, parentIndices);
      }
    }
    if (alignmentIndex % 1000 == 0) {
      cout << alignmentIndex << endl;
    }
  }
  vector<int> parentCounts;
  parentCounts.resize(parentIndices.size());
  fill(parentCounts.begin(), parentCounts.end(), 0);
  int p;
  PromoteAll(parentIndices);
  int i;
  for (r = 0; r < readWordMatches.size(); r++) {
    for (i = 0; i < readWordMatches[r].parents.size(); i++) {
      readWordMatches[r].parents[i] = parentIndices[readWordMatches[r].parents[i]];
      parentCounts[readWordMatches[r].parents[i]]++;
    }
  }
  /*
  for (i = 0; i < readWordMatches.size(); i++) {
    readWordMatches[i].PrintPos(cout);
    readWordMatches[i].PrintParents(cout);
  }
  */

  map<int,int> hist;
  int numParents = 0;
  for (i = 1; i < parentCounts.size() && parentIndices[i] != 0; i++) {
    if (parentCounts[i] != 0) {
      ++numParents;
    }
    if (hist.find(parentCounts[i]) == hist.end()) {
      hist[parentCounts[i]] = 1;
    }
    else {
      hist[parentCounts[i]]++;
    }
  }

  map<int,int>::iterator histIt;
  cout << " freq count" << endl;
  for(histIt = hist.begin(); histIt != hist.end(); ++histIt) {
    cout << (*histIt).second << " " << (*histIt).first << endl;
  }

  MatchVertexList vertices;
  vertices.resize(numParents);
  cout << "there are " << numParents << " parents. " << endl;
  
}
コード例 #28
0
ファイル: SAWriter.cpp プロジェクト: bnbowman/blasr
int main(int argc, char* argv[]) {

	if (argc < 2) {
		PrintUsage();
		exit(1);
	}
	int argi = 1;
	string saFile = argv[argi++];
	vector<string> inFiles;
	
	int doBLT = 1;
	int bltPrefixLength = 8;
	int parsingOptions = 0;
	SAType saBuildType = larsson;
	int read4BitCompressed  = 0;
	int diffCoverSize = 0;
	while (argi < argc) {
		if (strlen(argv[argi]) > 0 and
				argv[argi][0] == '-'){ 
			parsingOptions = 1;
		}
		if (!parsingOptions) {
			inFiles.push_back(argv[argi]);
		}
		else {
			if (strcmp(argv[argi], "-blt") == 0) {
				doBLT = 1;
        if (argi < argc - 1) {
          bltPrefixLength = atoi(argv[++argi]);
          if (bltPrefixLength == 0) {
            cout << argv[argi] << " is not a valid lookup table length." << endl;
            exit(1);
          }
        }
        else {
          cout << "Please specify a lookup table length." << endl;
          exit(1);
        }
			}
			else if (strcmp(argv[argi], "-mamy") == 0) {
				saBuildType = manmy;
			}
			else if (strcmp(argv[argi], "-larsson") == 0) {
				saBuildType = larsson;
			}
			else if (strcmp(argv[argi], "-mcilroy") == 0) {
				saBuildType = mcilroy;
			}
			else if (strcmp(argv[argi], "-slow") == 0) {
				saBuildType = slow;
			}
			else if (strcmp(argv[argi], "-kark") == 0) {
				saBuildType = kark;
			}
			else if (strcmp(argv[argi], "-mafe") == 0) {
				saBuildType = mafe;
			}
			else if (strcmp(argv[argi], "-welter") == 0) {
				saBuildType = welter;
			}
			else if (strcmp(argv[argi], "-welterweight") == 0) {
        if (argi < argc-1) {
          diffCoverSize = atoi(argv[++argi]);
        }
        else {
          cout << "Please specify a difference cover size.  Valid values are 7,32,64,111, and 2281.  Larger values use less memory but may be slower." << endl;
          exit(1);
        }
        if ( ! (diffCoverSize == 7 or 
                diffCoverSize == 32 or
                diffCoverSize == 64 or 
                diffCoverSize == 111 or
                diffCoverSize == 2281) ) {
          cout << "The difference cover size must be one of 7,32,64,111, or 2281." << endl;
          cout << "Larger numbers use less space but are more slow." << endl;
          exit(1);
        }
			}
			else if (strcmp(argv[argi], "-4bit") == 0) {
				read4BitCompressed = 1;
			}
			else {
				PrintUsage();
				cout << "ERROR, bad option: " << argv[argi] << endl;
				exit(1);
			}
		}
		++argi;
	}
  
  if (inFiles.size() == 0) {
    //
    // Special use case: the input file is a fasta file.  Write to that file + .sa
    //
    inFiles.push_back(saFile);
    saFile = saFile + ".sa";
  }
  
	VectorIndex inFileIndex;
	FASTASequence seq;
	CompressedSequence<FASTASequence> compSeq;

	if (read4BitCompressed == 0) {
		for (inFileIndex = 0; inFileIndex < inFiles.size(); ++inFileIndex) {
			FASTAReader reader;
			reader.Init(inFiles[inFileIndex]);
			reader.SetSpacePadding(111);
			if (saBuildType == kark) {
				//
				// The Karkkainen sa building method requires a little extra
				// space at the end of the dna sequence so that counting may
				// be done mod 3 without adding extra logic for boundaries.
				//
			}
  
			if (inFileIndex == 0) {
				reader.ReadAllSequencesIntoOne(seq);
				reader.Close();
			}
			else {
				while(reader.ConcatenateNext(seq)) {
					cout << "added " << seq.title << endl;
				}
			}
		}
		seq.ToThreeBit();
		//seq.ToUpper();
	}
	else {
		assert(inFiles.size() == 1);
		cout << "reading compressed sequence." << endl;
		compSeq.Read(inFiles[0]);
		seq.seq = compSeq.seq;
		seq.length = compSeq.length;
		compSeq.RemoveCompressionCounts();
		cout << "done." << endl;
	}

  //
  // For now, do not allow creation of suffix arrays on sequences > 4G.
  //
  if (seq.length >= UINT_MAX) {
    cout << "ERROR, references greater than " << UINT_MAX << " bases are not supported." << endl;
    cout << "Consider breaking the reference into multiple files, running alignment. " << endl;
    cout << "against each file, and merging the result." << endl;
    exit(1);
  }
	vector<int> alphabet;
	
	SuffixArray<Nucleotide, vector<int> >  sa;
	//	sa.InitTwoBitDNAAlphabet(alphabet);
	//	sa.InitAsciiCharDNAAlphabet(alphabet);
  sa.InitThreeBitDNAAlphabet(alphabet);

	if (saBuildType == manmy) {
		sa.MMBuildSuffixArray(seq.seq, seq.length, alphabet);
	}
	else if (saBuildType == mcilroy) {
		sa.index = new SAIndex[seq.length+1];
		DNALength i;
		for (i = 0; i < seq.length; i++) { sa.index[i] = seq.seq[i] + 1;}
		sa.index[seq.length] = 0;
		ssort(sa.index, NULL);
		for (i = 1; i < seq.length+1; i++ ){ sa.index[i-1] = sa.index[i];};
		sa.length = seq.length;
	}
	else if (saBuildType == larsson) {
		sa.LarssonBuildSuffixArray(seq.seq, seq.length, alphabet);
	}
	else if (saBuildType == kark) {
		sa.index = new SAIndex[seq.length];
		seq.ToThreeBit();
		DNALength p;
		for (p = 0; p < seq.length; p++ ){ seq.seq[p]++; }
		KarkkainenBuildSuffixArray<Nucleotide>(seq.seq, sa.index, seq.length, 5);
		sa.length = seq.length;
	}
	else if (saBuildType == mafe) {
		//		sa.MaFeBuildSuffixArray(seq.seq, seq.length);
		
	}
	else if (saBuildType == welter) {
		if (diffCoverSize == 0) {
			sa.LightweightBuildSuffixArray(seq.seq, seq.length);
		}
		else {
			sa.LightweightBuildSuffixArray(seq.seq, seq.length, diffCoverSize);
		}
	}
	if (doBLT) {
		sa.BuildLookupTable(seq.seq, seq.length, bltPrefixLength);
	}
	sa.Write(saFile);

	return 0;

}
コード例 #29
0
ファイル: SamFilter.cpp プロジェクト: Mondale-tw/blasr
int main(int argc, char* argv[]) {
#ifdef USE_GOOGLE_PROFILER
    char *profileFileName = getenv("CPUPROFILE");
    if (profileFileName != NULL) {
      ProfilerStart(profileFileName);
    }
    else {
      ProfilerStart("google_profile.txt");
    }
#endif

    // Register inputs and outputs.
    string samFileName, refFileName, outFileName;

    CommandLineParser clp;
    clp.RegisterStringOption("file.sam", &samFileName,
                             "Input SAM file.");
    clp.RegisterStringOption("reference.fasta", &refFileName,
                             "Reference used to generate reads.");
    clp.RegisterStringOption("out.sam", &outFileName,
                             "Output SAM file.");
    clp.RegisterPreviousFlagsAsHidden();

    // Register filter criteria options.
    int minAlnLength = 50;
    float minPctSimilarity = 70, minPctAccuracy = 70;
    string hitPolicyStr = "randombest";
    bool useScoreCutoff = false;
    int  scoreCutoff = INF_INT;
    int  scoreSignInt = -1;
    RegisterFilterOptions(clp, minAlnLength, minPctSimilarity, 
                          minPctAccuracy, hitPolicyStr, useScoreCutoff,
                          scoreSignInt, scoreCutoff);

    int seed = 1; 
    clp.RegisterIntOption("seed", &seed,
            "(1)  Seed for random number generator.\n"
            "If seed is 0, then use current time as seed.",
            CommandLineParser::Integer);

    string holeNumberStr;
    Ranges holeNumberRanges;
    clp.RegisterStringOption("holeNumbers", &holeNumberStr,
            "A string of comma-delimited hole number ranges to output hits, "
            "such as '1,2,10-12'. "
            "This requires hit titles to be in SMRT read title format.");

    bool parseSmrtTitle = false;
    clp.RegisterFlagOption("smrtTitle", &parseSmrtTitle,
            "Use this option when filtering alignments generated by "
            "programs other than blasr, e.g. bwa-sw or gmap. "
            "  Parse read coordinates from the SMRT read title. " 
            "The title is in the format /name/hole/coordinates, where"
            " coordinates are in the format \\d+_\\d+, and represent "
            "the interval of the read that was aligned.");
    /* This experimental option can be useful for metagenomics, in which case
     * there are hundreds of sequences in the target, of which many titles are
     * long and may contain white spaces (e.g., ' ', '\t'). 
     * In order to save disc space and avoid the (possibly) none unique mapping
     * between full and short reference names, one may call blasr with 
     * -titleTable option to represent all target sequences in the output
     * by their indices in the title table.*/

    string titleTableName = "";
    clp.RegisterStringOption("titleTable", &titleTableName,
            "Use this experimental option when filtering alignments generated by "
            "blasr with -titleTable titleTableName, in which case "
            "reference titles in SAM are represented by their "
            "indices (e.g., 0, 1, 2, ...) in the title table.");

    string adapterGffFileName = "";
    clp.RegisterStringOption("filterAdapterOnly", &adapterGffFileName,
            "Use this option to remove reads which can only map to adapters " 
            "specified in the GFF file.");

    bool verbose = false;
    clp.RegisterFlagOption("v", &verbose, "Be verbose.");

    clp.SetExamples(
            "Because SAM has optional tags that have different meanings"
            " in different programs, careful usage is required in order "
            "to have proper output.  The \"xs\" tag in bwa-sw is used to "
            "show the suboptimal score, but in PacBio SAM (blasr) it is "
            "defined as the start in the query sequence of the alignment.\n"
            "When \"-smrtTitle\" is specified, the xs tag is ignored, but "
            "when it is not specified, the coordinates given by the xs and "
            "xe tags are used to define the interval of a read that is "
            "aligned.  The CIGAR string is relative to this interval.");

    clp.ParseCommandLine(argc, argv);

    // Set random number seed. 
    if (seed == 0) {
        srand(time(NULL));
    } else {
        srand(seed);
    }
    
    scoreSign = (scoreSignInt == -1)?ScoreSign::NEGATIVE:ScoreSign::POSITIVE;
    Score s(static_cast<float>(scoreCutoff), scoreSign);
    FilterCriteria filterCriteria(minAlnLength, minPctSimilarity, 
                                  minPctAccuracy, true, s);
    filterCriteria.Verbose(verbose);
    HitPolicy hitPolicy(hitPolicyStr, scoreSign);
                                  
    string errMsg;
    if (not filterCriteria.MakeSane(errMsg)) {
        cout << errMsg << endl;
        exit(1);
    }

    // Parse hole number ranges. 
    if (holeNumberStr.size() != 0) {
        if (not holeNumberRanges.setRanges(holeNumberStr)) {
            cout << "Could not parse hole number ranges: "
                 << holeNumberStr << "." << endl;
            exit(1);
        } 
    }

    // Open output file.
    ostream * outFilePtr = &cout;
	ofstream outFileStrm;
	if (outFileName != "") {
		CrucialOpen(outFileName, outFileStrm, std::ios::out);
		outFilePtr = &outFileStrm;
	}
    
    GFFFile adapterGffFile;
    if (adapterGffFileName != "")
        adapterGffFile.ReadAll(adapterGffFileName);
    
    SAMReader<SAMFullReferenceSequence, SAMReadGroup, SAMAlignment> samReader;
    FASTAReader fastaReader;

    //
    // Initialize samReader and fastaReader.
    //
    samReader.Initialize(samFileName);
    fastaReader.Initialize(refFileName);

    //
    // Configure the file log.
    //
    string command;
    CommandLineParser::CommandLineToString(argc, argv, command);
    string log = "Filter sam hits.";
    string program = "samFilter";
    string versionString = VERSION;
    AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString);

    //
    // Read necessary input.
    //
    vector<FASTASequence> references;
    fastaReader.ReadAllSequences(references);

    // If the SAM file is generated by blasr with -titleTable,
    // then references in the SAM are represented by 
    // their corresponding indices in the title table.
    // In that case, we need to convert reference titles in fasta file
    // to their corresponding indices in the title table, such that
    // references in both SAM and fasta files are represented
    // by title table indices and therefore can match.
    if (titleTableName != "") {
        ConvertTitlesToTitleTableIndices(references, titleTableName);
    }
 
    AlignmentSet<SAMFullReferenceSequence, SAMReadGroup, SAMAlignment> alignmentSet;
    vector<string> allHeaders = samReader.ReadHeader(alignmentSet); 

    // Process SAM Header.
    string commandLineString;
    clp.CommandLineToString(argc, argv, commandLineString);
    allHeaders.push_back("@PG\tID:SAMFILTER\tVN:" + versionString + \
                         "\tCL:" + program + " " + commandLineString);
    for (int i = 0; i < allHeaders.size(); i++) {
        outFileStrm << allHeaders[i] << endl;
    }

    //
    // The order of references in vector<FASTASequence> references and
    // AlignmentSet<, , >alignmentSet.references can be different.
    // Rearrange alignmentSet.references such that they are ordered in
    // exactly the same way as vector<FASTASequence> references.
    //
    alignmentSet.RearrangeReferences(references);

    // Map reference name obtained from SAM file to indices
    map<string, int> refNameToIndex;
    for (int i = 0; i < references.size(); i++) {
        string refName = alignmentSet.references[i].GetSequenceName();
        refNameToIndex[refName] = i;
    }

    //
    // Store the alignments.
    //
    SAMAlignment samAlignment;
    int alignIndex = 0; 

    //
    // For 150K, each chip produces about 300M sequences 
    // (not including quality values and etc.).
    // Let's assume that the sam file and reference data can 
    // fit in the memory. 
    // Need to scale for larger sequal data in the future.
    //
    vector<SAMAlignment> allSAMAlignments;
    while (samReader.GetNextAlignment(samAlignment)) {
        if (samAlignment.rName == "*") {
            continue;
        }

        if (parseSmrtTitle and holeNumberStr.size() != 0) {
            string movieName;
            int thisHoleNumber;
            if (not ParsePBIReadName(samAlignment.qName, 
                                     movieName, 
                                     thisHoleNumber)) {
                cout << "ERROR, could not parse SMRT title: "
                     << samAlignment.qName << "." << endl;
                exit(1);
            }
            if (not holeNumberRanges.contains(UInt(thisHoleNumber))) {
                if (verbose) 
                    cout << thisHoleNumber << " is not in range." << endl; 
                continue;
            }
        }

        if (samAlignment.cigar.find('P') != string::npos) {
            cout << "WARNING. Could not process SAM record with 'P' in "
                 << "its cigar string." << endl;
            continue;
        }

        vector<AlignmentCandidate<> > convertedAlignments;
        SAMAlignmentsToCandidates(samAlignment, 
                references, refNameToIndex,
                convertedAlignments, parseSmrtTitle, false);
        
        if (convertedAlignments.size() > 1) {
            cout << "WARNING. Ignore multiple segments." << endl;
            continue;
        }

        for (int i = 0; i < 1; i++) {
            AlignmentCandidate<> & alignment = convertedAlignments[i];

            //score func does not matter
            DistanceMatrixScoreFunction<DNASequence, DNASequence> distFunc; 
            ComputeAlignmentStats(alignment, alignment.qAlignedSeq.seq, 
                                  alignment.tAlignedSeq.seq, distFunc);
                                  
            // Check whether this alignment can only map to adapters in 
            // the adapter GFF file.
            if (adapterGffFileName != "" and 
                CheckAdapterOnly(adapterGffFile, alignment, refNameToIndex)) {
                if (verbose)
                    cout << alignment.qName << " filter adapter only."
                         << endl;
                continue;
            }

            // Assign score to samAlignment.
            samAlignment.score = samAlignment.as;

            if (not filterCriteria.Satisfy(static_cast<AlignmentCandidate<> *>(&alignment))) {
                continue;
            }
            allSAMAlignments.push_back( samAlignment ); 

            alignment.FreeSubsequences();
        }
        ++alignIndex;
    }

    // Sort all SAM alignments by qName, score and target position.
    sort(allSAMAlignments.begin(), allSAMAlignments.end(), 
         byQNameScoreTStart);

    unsigned int groupBegin = 0;
    unsigned int groupEnd = -1;
    vector<SAMAlignment> filteredSAMAlignments;
    while(groupBegin < allSAMAlignments.size()) {
        // Get the next group of SAM alignments which have the same qName
        // from allSAMAlignments[groupBegin ... groupEnd)
        GetNextSAMAlignmentGroup(allSAMAlignments, groupBegin, groupEnd);
        vector<unsigned int> hitIndices = ApplyHitPolicy(
                hitPolicy, allSAMAlignments, groupBegin, groupEnd);
        for(unsigned int i = 0; i < hitIndices.size(); i++) {
            filteredSAMAlignments.push_back(allSAMAlignments[hitIndices[i]]);
        }
        groupBegin = groupEnd;
    }

    // Sort all SAM alignments by reference name and query name
    sort(filteredSAMAlignments.begin(), filteredSAMAlignments.end(), 
         byRNameQName);

    for(unsigned int i = 0; i < filteredSAMAlignments.size(); i++) {
        filteredSAMAlignments[i].PrintSAMAlignment(outFileStrm);
    }

	if (outFileName != "") {
		outFileStrm.close();
	}
#ifdef USE_GOOGLE_PROFILER
  ProfilerStop();
#endif
    return 0;
}
コード例 #30
0
ファイル: SamToM4.cpp プロジェクト: Coryza/blasr
int main(int argc, char* argv[]) {
    string program = "samtom4";
    string versionString = VERSION;
    AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString);

    string samFileName, refFileName, outFileName;
    bool printHeader = false;
    bool parseSmrtTitle = false;
    bool useShortRefName = false;

    CommandLineParser clp;
    clp.SetProgramName(program);
    clp.SetVersion(versionString);
    clp.SetProgramSummary("Converts a SAM file generated by blasr to M4 format.");
    clp.RegisterStringOption("in.sam",        &samFileName,
                             "Input SAM file, which is produced by blasr.");
    clp.RegisterStringOption("reference.fasta", &refFileName,
                             "Reference used to generate file.sam.");
    clp.RegisterStringOption("out.m4",          &outFileName,
                             "Output in blasr M4 format.");
    clp.RegisterPreviousFlagsAsHidden();
    clp.RegisterFlagOption("header",            &printHeader,
                           "Print M4 header.");
    clp.RegisterFlagOption("useShortRefName",   &useShortRefName, 
                           "Use abbreviated reference names obtained "
                           "from file.sam instead of using full names "
                           "from reference.fasta.");
    //clp.SetExamples(program + " file.sam reference.fasta out.m4");

    clp.ParseCommandLine(argc, argv);

    ostream * outFilePtr = &cout;
	ofstream outFileStrm;
	if (outFileName != "") {
		CrucialOpen(outFileName, outFileStrm, std::ios::out);
		outFilePtr = &outFileStrm;
	}

    SAMReader<SAMFullReferenceSequence, SAMReadGroup, SAMAlignment> samReader;
    FASTAReader fastaReader;

    //
    // Initialize samReader and fastaReader.
    //
    samReader.Initialize(samFileName);
    fastaReader.Initialize(refFileName);

    //
    // Configure the file log.
    //
    string command;
    CommandLineParser::CommandLineToString(argc, argv, command);

    //
    // Read necessary input.
    //
    vector<FASTASequence> references;
    fastaReader.ReadAllSequences(references);

    AlignmentSet<SAMFullReferenceSequence, SAMReadGroup, SAMAlignment> alignmentSet;
    samReader.ReadHeader(alignmentSet); 

    //
    // The order of references in vector<FASTASequence> references and
    // AlignmentSet<, , >alignmentSet.references can be different.
    // Rearrange alignmentSet.references such that it is ordered in
    // exactly the same way as vector<FASTASequence> references.
    //
    alignmentSet.RearrangeReferences(references);

    //
    // Map short names for references obtained from file.sam to 
    // full names obtained from reference.fasta
    //
    map<string, string> shortRefNameToFull;
    map<string, string>::iterator it;
    assert(references.size() == alignmentSet.references.size());
    if (!useShortRefName) {
        for (size_t i = 0; i < references.size(); i++) {
            string shortRefName = alignmentSet.references[i].GetSequenceName();
            string fullRefName(references[i].title); 
            if (shortRefNameToFull.find(shortRefName) != shortRefNameToFull.end()) {
                cout << "ERROR, Found more than one reference " << shortRefName << "in sam header" << endl;
                exit(1);
            } 
            shortRefNameToFull[shortRefName] = fullRefName;
            alignmentSet.references[i].sequenceName = fullRefName;
        }
    }

    // Map reference name obtained from SAM file to indices
    map<string, int> refNameToIndex;
    for (size_t i = 0; i < references.size(); i++) {
        string refName = alignmentSet.references[i].GetSequenceName();
        refNameToIndex[refName] = i;
    }

    //
    // Store the alignments.
    //
    SAMAlignment samAlignment;
    size_t alignIndex = 0; 

    //
    // For 150K, each chip produces about 300M sequences 
    // (not including quality values and etc.).
    // Let's assume that the sam file and reference data can 
    // fit in the memory. 
    // Need to scale for larger sequal data in the future.
    //
    if (printHeader)
        IntervalOutput::PrintHeader(*outFilePtr);

    // The socre matrix does not matter because we will use the 
    // aligner's score from SAM file anyway.
    DistanceMatrixScoreFunction<DNASequence, DNASequence> distScoreFn;

    while (samReader.GetNextAlignment(samAlignment)) {
        if (samAlignment.rName == "*") {
            continue;
        }

        if (!useShortRefName) {
            //convert shortRefName to fullRefName
            it = shortRefNameToFull.find(samAlignment.rName);
            if (it == shortRefNameToFull.end()) {
                cout << "ERROR, Could not find " << samAlignment.rName << " in the reference repository." << endl;
                exit(1);
            }
            samAlignment.rName = (*it).second;
        }

        // The padding character 'P' is not supported
        if (samAlignment.cigar.find('P') != string::npos) {
            cout << "WARNING. Could not process sam record with 'P' in its cigar string."
                 << endl;
            continue;
        }

        vector<AlignmentCandidate<> > convertedAlignments;

        //
        // Keep reference as forward.
        // So if IsReverseComplement(sam.flag)==true, then qStrand is reverse
        // and tStrand is forward.
        //
        bool keepRefAsForward = false;

        SAMAlignmentsToCandidates(samAlignment, references, refNameToIndex,
                                  convertedAlignments, parseSmrtTitle, 
                                  keepRefAsForward);

        if (convertedAlignments.size() > 1) {
            cout << "WARNING. Ignore an alignment which has multiple segments." << endl;
            continue;
        }

        //all alignments are unique single-ended alignments.
        for (int i = 0; i < 1; i++) {
            AlignmentCandidate<> & alignment = convertedAlignments[i];

            ComputeAlignmentStats(alignment, alignment.qAlignedSeq.seq, 
                                  alignment.tAlignedSeq.seq, distScoreFn);

            // Use aligner's score from SAM file anyway.
            alignment.score = samAlignment.as;
            alignment.mapQV = samAlignment.mapQV;

            // Since SAM only has the aligned sequence, many info of the 
            // original query (e.g. the full length) is missing. 
            // Overwrite alignment.qLength (which is length of the query
            // in the SAM alignment) with xq (which is the length of the 
            // original query sequence saved by blasr) right before printing 
            // the output so that one can reconstruct a blasr m4 record from 
            // a blasr sam alignment.
            if (samAlignment.xq!=0)
                alignment.qLength = samAlignment.xq;

            IntervalOutput::PrintFromSAM(alignment, *outFilePtr);

            alignment.FreeSubsequences();
        }
        ++alignIndex;
    }

	if (outFileName != "") {
		outFileStrm.close();
	}
    return 0;
}