Example #1
0
int main(int argc, char* argv[])
{
    std::string seqInName, seqOutName, dotOutName;
    if (argc < 4) {
        std::cout << "usage: exciseRepeats inName repMaskOutFile outName" << std::endl;
        std::exit(EXIT_FAILURE);
    }

    seqInName = argv[1];
    dotOutName = argv[2];
    seqOutName = argv[3];
    FASTAReader reader;
    reader.Initialize(seqInName);
    FASTASequence origSeq;
    reader.GetNext(origSeq);

    std::ifstream dotOutFile;
    CrucialOpen(dotOutName, dotOutFile);
    std::ofstream seqOutFile;
    std::ofstream seqOut;
    CrucialOpen(seqOutName, seqOut, std::ios::out);
    std::string dotOutLine;
    getline(dotOutFile, dotOutLine);
    getline(dotOutFile, dotOutLine);
    getline(dotOutFile, dotOutLine);
    while (getline(dotOutFile, dotOutLine)) {
        std::stringstream lineStrm(dotOutLine);
        int swScore;
        float pctDiv, pctDel, pctIns;
        std::string query;
        DNALength qPosBegin, qPosEnd;
        std::string left;
        char strand;
        std::string matchingRepeat;
        std::string repClass;
        std::string repPos, repEnd, repLeft;
        int id;
        lineStrm >> swScore >> pctDiv >> pctDel >> pctIns >> query >> qPosBegin >> qPosEnd >>
            left >> strand >> matchingRepeat >> repClass >> repPos >> repEnd >> repLeft >> id;
        for (DNALength seqPos = qPosBegin; seqPos < qPosEnd; seqPos++) {
            origSeq.seq[seqPos] = 'X';
        }
    }

    DNALength seqPos, unexPos;
    unexPos = 0;
    for (seqPos = 0; seqPos < origSeq.length; seqPos++) {
        if (origSeq.seq[seqPos] != 'X') {
            origSeq.seq[unexPos] = origSeq.seq[seqPos];
            unexPos++;
        }
    }
    origSeq.length = unexPos;

    origSeq.PrintSeq(seqOut);
    return 0;
}
Example #2
0
int main(int argc, char* argv[]) {
	string genomeFileName, subseqFileName;
	if (argc != 3) {
		cout << "usage: extractRepeats genome repeat" << endl;
		exit(0);
	}

	genomeFileName = argv[1];
	subseqFileName = argv[2];

	FASTASequence genome, sub;
	FASTAReader reader;
	reader.Init(genomeFileName);
	reader.GetNext(genome);
	reader.Init(subseqFileName);
	reader.GetNext(sub);

	genome.ToUpper();
	sub.ToUpper();	
	DNALength genomePos;
	FASTASequence genomeSub;
	int kband = (int) (0.15) * sub.length;
	vector<int> scoreMat;
	vector<Arrow> pathMat;
	int readIndex = 0;
	cout << "starting extraction" << endl;
	for (genomePos = 0; genomePos < genome.length - sub.length + 1; genomePos++) {
		genomeSub.seq = &genome.seq[genomePos];
		genomeSub.length = sub.length;
		int alignScore;
		Alignment alignment;
		alignScore = SWAlign(genomeSub, sub,
												 EditDistanceMatrix, 1, //1,kband,
												 scoreMat, pathMat,
												 alignment, QueryFit);
												 
		if (alignScore < 0.25 * sub.length) {
			stringstream titlestrm;
			titlestrm << readIndex << "|" 
								 << genomePos << "|"
								<< genomePos + sub.length << " " << alignScore/ (1.0*sub.length);
			FASTASequence subcopy;
			subcopy.CopyTitle(titlestrm.str());
			subcopy.seq = &genome.seq[genomePos];
			subcopy.length = sub.length;
			subcopy.PrintSeq(std::cout);
			genomePos += sub.length;
		}
	}
}
Example #3
0
int main(int argc, char* argv[]) {
		if (argc < 4) {
			cout << "usage: splitContigs in.fa contiglength out" << endl;
			exit(1);
		}
		string inFileName, outFileName;
		inFileName = argv[1];
		int contigLength = atoi(argv[2]);		
		outFileName = argv[3];

		ofstream seqOut;
		CrucialOpen(outFileName, seqOut, std::ios::out);
		FASTAReader reader;
		reader.Init(inFileName);
		FASTASequence seq;
		DNALength curOffset;
		
		while(reader.GetNext(seq)) {
			FASTASequence subseq;
			int i;
			curOffset = 0;
			for (i =0 ; i < seq.length / contigLength + 1; i++ ) {
				subseq.seq = &seq.seq[curOffset];
				subseq.title = seq.title;
				if (curOffset + contigLength > seq.length) {
					subseq.length = seq.length - curOffset;
				}
				else {
					subseq.length = contigLength;
				}
				subseq.PrintSeq(seqOut);
				curOffset += contigLength;
			}
		}
		return 0;
}
Example #4
0
int main(int argc, char* argv[]) {
  string barcodeFileName, insertFileName, outputFileName;
  if (argc != 4) {
    cout << "usage: makeBarcodeDatabase insert.fasta barcodes.fasta output.fasta" << endl;
    exit(1);
  }
  insertFileName = argv[1];
  barcodeFileName = argv[2];
  outputFileName  = argv[3];

  FASTAReader barcodeReader, insertReader;
  barcodeReader.Initialize(barcodeFileName);
  insertReader.Initialize(insertFileName);
  
  ofstream barcodedOut;
  CrucialOpen(outputFileName, barcodedOut, std::ios::out);

  vector<FASTASequence> forwardBarcodes, reverseBarcodes;
  FASTASequence barcodeSequence, reverseBarcodeSequence;
  while(barcodeReader.GetNext(barcodeSequence)) {
    forwardBarcodes.push_back(barcodeSequence);
    barcodeSequence.MakeRC(reverseBarcodeSequence);
    reverseBarcodes.push_back(reverseBarcodeSequence);
  }
  
  FASTASequence insert;
  insertReader.GetNext(insert);
  
  int i;
  for (i = 0; i < forwardBarcodes.size(); i++) {
    FASTASequence barcodedInsert;
    barcodedInsert.Resize(forwardBarcodes[i].length * 2 + insert.length);
    stringstream titleStrm;
    titleStrm << insert.title << "|ff|" << forwardBarcodes[i].title;
    barcodedInsert.CopyTitle(titleStrm.str());
    memcpy(&barcodedInsert.seq[0], &forwardBarcodes[i].seq[0], forwardBarcodes[i].length);
    memcpy(&barcodedInsert.seq[forwardBarcodes[i].length], insert.seq, insert.length);
    memcpy(&barcodedInsert.seq[forwardBarcodes[i].length + insert.length], forwardBarcodes[i].seq, forwardBarcodes[i].length);
    barcodedInsert.PrintSeq(barcodedOut);

    titleStrm.str("");
    titleStrm << insert.title << "|fr|" << forwardBarcodes[i].title;
    barcodedInsert.CopyTitle(titleStrm.str());
    memcpy(&barcodedInsert.seq[0], &forwardBarcodes[i].seq[0], forwardBarcodes[i].length);
    memcpy(&barcodedInsert.seq[forwardBarcodes[i].length], insert.seq, insert.length);
    memcpy(&barcodedInsert.seq[forwardBarcodes[i].length + insert.length], reverseBarcodes[i].seq, reverseBarcodes[i].length);
    barcodedInsert.PrintSeq(barcodedOut);


    titleStrm.str("");
    titleStrm << insert.title << "|rf|" << forwardBarcodes[i].title;
    barcodedInsert.CopyTitle(titleStrm.str());
    memcpy(&barcodedInsert.seq[0], &reverseBarcodes[i].seq[0], reverseBarcodes[i].length);
    memcpy(&barcodedInsert.seq[reverseBarcodes[i].length], insert.seq, insert.length);
    memcpy(&barcodedInsert.seq[reverseBarcodes[i].length + insert.length], forwardBarcodes[i].seq, forwardBarcodes[i].length);
    barcodedInsert.PrintSeq(barcodedOut);


    titleStrm.str("");
    titleStrm << insert.title << "|rr|" << forwardBarcodes[i].title;
    barcodedInsert.CopyTitle(titleStrm.str());
    memcpy(&barcodedInsert.seq[0], &reverseBarcodes[i].seq[0], reverseBarcodes[i].length);
    memcpy(&barcodedInsert.seq[reverseBarcodes[i].length], insert.seq, insert.length);
    memcpy(&barcodedInsert.seq[reverseBarcodes[i].length + insert.length], reverseBarcodes[i].seq, reverseBarcodes[i].length);
    barcodedInsert.PrintSeq(barcodedOut);
  }
}
Example #5
0
/**
 * Split reads into passes at adapter matches and report the passes.
 *
 * Command-line options (parsed by CommandLineParser):
 *   ad1       - FASTA file with the first adapter
 *   ad2       - FASTA file with the second adapter
 *   reads     - FASTA file with the reads to split
 *   readsout  - output file for split reads (opened here; nothing in this
 *               function writes to it)
 *   pctSim    - minimum percent similarity to trigger an adapter match
 *               (default 0.60)
 *   indel     - indel penalty (default 3)
 *   minLength - minimum pass length to retain (default 10)
 *
 * For every read the read itself is printed to stdout, followed by one line
 * per pass: "read: <readIndex> pass: <passIndex> <start> <length> <la>".
 */
int main(int argc, char* argv[]) {
	string ad1File, ad2File, readsFile, readsOutFile;

	FASTAReader ad1Reader;
	FASTAReader ad2Reader;
	FASTAReader reader;


	CommandLineParser cl;
	float minPctSimilarity = 0.60;
	int indel = 3;
	int minLength = 10;
	cl.RegisterStringOption("ad1", &ad1File, "FASTA file with the first adapter");
	cl.RegisterStringOption("ad2", &ad2File, "FASTA file with the second adapter");
	cl.RegisterStringOption("reads", &readsFile, "FASTA file with SMRTBell reads");
	cl.RegisterStringOption("readsout", &readsOutFile, "output file for split reads");
	cl.RegisterPreviousFlagsAsHidden();
	cl.RegisterFloatOption("pctSim", &minPctSimilarity, "Minimum percent similarity to trigger a match to an adapter.", 
												 CommandLineParser::PositiveFloat);
	cl.RegisterIntOption("indel", &indel, "Penalty for indel (positive)", CommandLineParser::NonNegativeInteger);
	cl.RegisterIntOption("minLength", &minLength, "Minimum length pass to retain.", CommandLineParser::PositiveInteger);
	vector<string> opts;
	cl.ParseCommandLine(argc, argv, opts);

	/*
	 * Open all the required files, quitting if they are unavailable.
	 */

	ad1Reader.Init(ad1File);
	ad2Reader.Init(ad2File);
	reader.Init(readsFile);

	ofstream splitOut;
	CrucialOpen(readsOutFile, splitOut);

	// Only the first record of each adapter file is used.
	FASTASequence ad1, ad2;
	ad1Reader.GetNext(ad1);
	ad2Reader.GetNext(ad2);

	FASTASequence read;
	// Alignment work buffers, shared across reads.
	vector<int> scoreMat;
	vector<Arrow> pathMat;
	int readIndex = 0;
	while(reader.GetNext(read)) {
		read.ToUpper();
		//
		// Do a fitting sequence alignment to match one of the two 
		// adapters into the read.
		//
		vector<int> passStarts, passLengths, la;
		read.PrintSeq(cout);
		SplitRead(read, 0, read.length, ad1, ad2,
							indel, 
							passStarts, passLengths,la, 0,
							scoreMat, pathMat, minPctSimilarity, minLength);
		int i;
		for (i = 0; i < passStarts.size(); i++) {
			// NOTE(review): the pass-index field was garbled in the source
			// ('" pass: "******" "'); streaming the loop index is the
			// presumed original -- confirm against upstream.
			cout << "read: " << readIndex << " pass: " << i << " " << passStarts[i] << " " << passLengths[i] << " " << la[i] << endl;
		}
		++readIndex;
	}
}
Example #6
0
/**
 * Apply random insertions, deletions and substitutions to every sequence of
 * a reference FASTA file and write the mutated sequences.
 *
 * Command-line options (parsed by CommandLineParser):
 *   refGenome - reference genome (required)
 *   mutGenome - output file for the mutated genome (required)
 *   gff       - optional file that receives one line per modification
 *   i, d, m   - insertion, deletion and mutation rates (default 0 each);
 *               their sum must not exceed 1
 *   lower     - passed to ToLower() for substituted and inserted bases
 *
 * Each position independently becomes an insertion site, a deletion, a
 * substitution, or stays unchanged, according to one random draw. Each
 * output title is the input title with the applied rates appended.
 */
int main(int argc, char* argv[]) {
	CommandLineParser clp;

	string refGenomeName;
	string mutGenomeName;
	string gffFileName;
	float insRate = 0;
	float delRate = 0;
	float mutRate = 0;
	bool  lower = false;
	gffFileName = "";
	clp.RegisterStringOption("refGenome", &refGenomeName, "Reference genome.", true);
	clp.RegisterStringOption("mutGenome", &mutGenomeName, "Mutated genome.", true);
	clp.RegisterPreviousFlagsAsHidden();
	clp.RegisterStringOption("gff", &gffFileName, "GFF file describing the modifications made to the genome.");
	clp.RegisterFloatOption("i", &insRate, "Insertion rate: (0-1].", 
													CommandLineParser::NonNegativeFloat, false);
	clp.RegisterFloatOption("d", &delRate, "Deletion rate: (0-1]", 
													CommandLineParser::NonNegativeFloat, false);
	clp.RegisterFloatOption("m", &mutRate, "Mutation rate, even across all nucleotides: (0-1]", 
													CommandLineParser::NonNegativeFloat, false);
	clp.RegisterFlagOption("lower", &lower, "Make mutations in lower case", false);
	vector<string> leftovers;
	clp.ParseCommandLine(argc, argv, leftovers);

	FASTAReader reader;
	FASTASequence refGenome;

	reader.Init(refGenomeName);
	ofstream mutGenomeOut;
	CrucialOpen(mutGenomeName, mutGenomeOut, std::ios::out);
	ofstream gffOut;
	if (gffFileName != "") {
		CrucialOpen(gffFileName, gffOut, std::ios::out);
	}

	// Per-position marks: insIndices/delIndices hold 0 or 1, subIndices holds
	// the original base of a substituted position (0 when unchanged).
	vector<int> insIndices, delIndices, subIndices;
	int readIndex = 0;
	InitializeRandomGeneratorWithTime();
	while (reader.GetNext(refGenome)) {
		insIndices.resize(refGenome.length);
		delIndices.resize(refGenome.length);
		subIndices.resize(refGenome.length);
		std::fill(insIndices.begin(), insIndices.end(), false);
		std::fill(delIndices.begin(), delIndices.end(), false);
		std::fill(subIndices.begin(), subIndices.end(), 0);

		// Cumulative thresholds: a single random draw is compared against
		// them in order to select the change type.
		enum ChangeType { Ins, Del, Mut, None};
		float changeProb[4];
		changeProb[Ins] = insRate;
		changeProb[Del] = changeProb[Ins] + delRate;
		changeProb[Mut] = changeProb[Del] + mutRate;
		changeProb[None] = 1;

		if (changeProb[Mut] > 1) {
			cout << "ERROR! The sum of the error probabilities must be less than 1" << endl;
			exit(1);
		}
		DNALength pos;
		float randomNumber;
		int numIns = 0;
		int numDel = 0;
		int numMut = 0;
		for (pos =0 ; pos < refGenome.length; pos++) { 
			randomNumber = Random();
			if (randomNumber < changeProb[Ins]) {
				insIndices[pos] = true;
				numIns++;
			}
			else if (randomNumber < changeProb[Del]) {
				delIndices[pos] = true;
				numDel++;
			}
			else if (randomNumber < changeProb[Mut]){ 
				// Redraw until the new base differs from the current one.
				Nucleotide newNuc = TwoBitToAscii[RandomInt(4)];
				int maxIts = 100000;
				int it = 0;
				while (newNuc == refGenome.seq[pos]) {
					if (it == maxIts) {
						cout << "ERROR, something is wrong with the random number generation, it took too many tries to generate a new nucleotide" << endl;
						exit(1);
					}
					newNuc = TwoBitToAscii[RandomInt(4)];
					// The counter must advance, otherwise the guard above
					// can never fire.
					++it;
				}
				subIndices[pos] = refGenome[pos];
				refGenome.seq[pos] = ToLower(newNuc,lower);
				++numMut;
			}
		}
		//		cout << readIndex << " m " << numMut << " i " << numIns << " d " << numDel << endl;
		if (readIndex % 100000 == 0 && readIndex > 0) {
			cout << readIndex << endl;
		}
		// 
		// Now add the insertions and deletions.
		//
		FASTASequence newSequence;
		DNALength   newPos;
		// Compare the net loss against the length directly: an expression
		// that mixes in an unsigned length can never test below zero.
		if (numDel > numIns && static_cast<DNALength>(numDel - numIns) > refGenome.length) {
			cout << "ERROR, the genome has been deleted to nothing." << endl;
			exit(1);
		}
		ResizeSequence(newSequence, refGenome.length + (numIns - numDel));
		newPos = 0;
		for (pos = 0; pos < refGenome.length; pos++) {
			assert(newPos < newSequence.length or delIndices[pos] == true);
			if (subIndices[pos] != 0 and gffFileName != "") {
				gffOut << refGenome.GetName() << "	.	SNV	" << newPos << " " << newPos <<" 0.00	.	.	reference=" << (char)subIndices[pos] << ";confidence=10;Name=" << newPos << (char)subIndices[pos] << ">" << refGenome.seq[pos] <<";coverage=10;variantseq=" << refGenome.seq[pos] << endl;
			}

			if (insIndices[pos] == true) {
				// Emit a random base ahead of the original base.
				newSequence.seq[newPos] = ToLower(TwoBitToAscii[RandomInt(4)], lower);
				newPos++;
				newSequence.seq[newPos] = refGenome.seq[pos];

				assert(newSequence.seq[newPos] != '1');
				assert(newSequence.seq[newPos] != 1);
				// NOTE(review): an inserted base is labelled "deletion" here
				// and a removed base "insertion" below; presumably the GFF is
				// written from the mutated genome's point of view -- confirm.
				if (gffFileName != "") {
					gffOut << refGenome.GetName() << "	.	deletion	" << newPos << " " << newPos << " 0.00	.	.	reference=" << newSequence.seq[newPos] << ";length=1;confidence=10;coverage=0;Name="<< newPos << "del" << newSequence.seq[newPos] << endl;
				}
				newPos++;
			}
			else if (delIndices[pos] == true) {
				// The base is dropped from the output; only the GFF line is
				// written. The dropped base is refGenome.seq[pos]: newPos is
				// an index into newSequence and may lie outside refGenome.
				if (gffFileName != "") {
					gffOut << refGenome.GetName() << "	.	insertion	" << newPos << " " << newPos << " 0.00	.	.	confidence=10;Name=" << newPos << "_ins" << refGenome.seq[pos] << ";reference=.;length=1;coverage=0;variantseq=" << refGenome.seq[pos] << endl;
//ref000001	.	deletion	20223	20223	0.00	.	.	reference=T;length=1;confidence=0;coverage=0;Name=20222delT
				}
			}
			else {
				newSequence.seq[newPos] = refGenome.seq[pos];
				newPos++;
			}
		}
		stringstream titlestrm;
		titlestrm << " mutated ins " << insRate << " del " << delRate << " mut " << mutRate;
		newSequence.CopyTitle(refGenome.title);
		newSequence.AppendToTitle(titlestrm.str());
		newSequence.PrintSeq(mutGenomeOut);
		newSequence.Free();
		readIndex++;
	}
}
int main(int argc, char* argv[]) {
  string gencodeGffFileName, genomeFileName, genesOutFileName;
  string geneType = "protein_coding";
  bool randomSplicing = false;
  int numRandomSplicing = 1;
  float pSkip = 0.5;
  if (argc < 4) {
    cout << "Usage: extractGenes gencodeGTFFile genomeFile genesOutFileName [-geneType type (protein_coding)] [-randomSplicing] [-numRandomSplicing n] [-pSkip prob (0-1, default:0.5)]" << endl;
    exit(1);
  }

  gencodeGffFileName = argv[1];
  genomeFileName     = argv[2];
  genesOutFileName   = argv[3];

  int argi = 4;
  string coordinatesFileName;

  while (argi < argc) {
    if (strcmp(argv[argi], "-geneType") == 0) {
      geneType = argv[++argi];
    }
    else if (strcmp(argv[argi], "-randomSplicing") == 0) {
      randomSplicing = true;
    }
    else if (strcmp(argv[argi], "-numRandomSplicing") == 0) {
      numRandomSplicing = atoi(argv[++argi]);
    }
    else if (strcmp(argv[argi], "-pSkip") == 0) {
      pSkip = atof(argv[++argi]);
    }
    else {
      cout << "ERROR, bad option  " << argv[argi] << endl;
      exit(1);
    }
    ++argi;
  }

  coordinatesFileName = genesOutFileName;
  coordinatesFileName.append(".pos");
  FASTAReader reader;
  reader.Initialize(genomeFileName);

  ofstream outFile, coordsFile;
  CrucialOpen(genesOutFileName, outFile, std::ios::out);

  string coordsFileName = genesOutFileName + ".coords";
  CrucialOpen(coordsFileName, coordsFile, std::ios::out);

  vector<FASTASequence> referenceSequences;
  reader.ReadAllSequences(referenceSequences);
  int i;
  map<string, int> titleToIndex;
  for (i = 0; i < referenceSequences.size(); i++) {
    titleToIndex[referenceSequences[i].title] = i;
  }

  GencodeGFFFile gencodeFile;
  gencodeFile.ReadAll(gencodeGffFileName);
  
  vector<GencodeGFFGene> genes;
  IndexGencodeGenes(gencodeFile, genes, geneType);

  for (i = 0; i < genes.size(); i++) {
    genes[i].OrderExonsByStart();
  }

  int e;
  for (i = 0; i < genes.size(); i++) {
    FASTASequence geneSequence;
    geneSequence.CopyTitle(genes[i].geneName);
    if (titleToIndex.find(genes[i].chromosome) == titleToIndex.end()) {
      continue;
    }
    int chrIndex = titleToIndex[genes[i].chromosome];
    string sequence = "";
    //
    // Do nothing with 0 length exons.
    //
    if (genes[i].exons.size() == 0) {
      continue;
    }
    vector<FASTASequence> geneSequences;
    vector<GeneCoordinates> geneCoordinates;
    genes[i].GenerateGeneSequences(referenceSequences[chrIndex], geneSequences, geneCoordinates, randomSplicing);
    int gi;
    for (gi = 0; gi < geneSequences.size(); gi++) {
      if (genes[i].GetStrand() == '+') {
        geneSequences[gi].PrintSeq(outFile);
      }
      else {
        FASTASequence rc;
        geneSequences[gi].MakeRC(rc);
        rc.PrintSeq(outFile);
        rc.Free();
      }
      coordsFile << geneSequences[gi].title << " " << geneCoordinates[gi].chromosome << " " << geneCoordinates[gi].exonCoordinates.size() << " " << geneCoordinates[gi].strand;
      int i;
      for (i = 0; i < geneCoordinates[gi].exonCoordinates.size(); i++) {
        coordsFile << " " 
                   << geneCoordinates[gi].exonCoordinates[i].start << " "  
                   << geneCoordinates[gi].exonCoordinates[i].end << " ";
      }
      coordsFile << endl;
      geneSequences[gi].Free();
    }
    // 
    // No need to free the seq, since it is controlled by the string.
    //
  }
  coordsFile.close();
  
}