Beispiel #1
0
int main(int argc, char* argv[])
{
    std::string seqInName, seqOutName, dotOutName;
    if (argc < 4) {
        std::cout << "usage: exciseRepeats inName repMaskOutFile outName" << std::endl;
        std::exit(EXIT_FAILURE);
    }

    seqInName = argv[1];
    dotOutName = argv[2];
    seqOutName = argv[3];
    FASTAReader reader;
    reader.Initialize(seqInName);
    FASTASequence origSeq;
    reader.GetNext(origSeq);

    std::ifstream dotOutFile;
    CrucialOpen(dotOutName, dotOutFile);
    std::ofstream seqOutFile;
    std::ofstream seqOut;
    CrucialOpen(seqOutName, seqOut, std::ios::out);
    std::string dotOutLine;
    getline(dotOutFile, dotOutLine);
    getline(dotOutFile, dotOutLine);
    getline(dotOutFile, dotOutLine);
    while (getline(dotOutFile, dotOutLine)) {
        std::stringstream lineStrm(dotOutLine);
        int swScore;
        float pctDiv, pctDel, pctIns;
        std::string query;
        DNALength qPosBegin, qPosEnd;
        std::string left;
        char strand;
        std::string matchingRepeat;
        std::string repClass;
        std::string repPos, repEnd, repLeft;
        int id;
        lineStrm >> swScore >> pctDiv >> pctDel >> pctIns >> query >> qPosBegin >> qPosEnd >>
            left >> strand >> matchingRepeat >> repClass >> repPos >> repEnd >> repLeft >> id;
        for (DNALength seqPos = qPosBegin; seqPos < qPosEnd; seqPos++) {
            origSeq.seq[seqPos] = 'X';
        }
    }

    DNALength seqPos, unexPos;
    unexPos = 0;
    for (seqPos = 0; seqPos < origSeq.length; seqPos++) {
        if (origSeq.seq[seqPos] != 'X') {
            origSeq.seq[unexPos] = origSeq.seq[seqPos];
            unexPos++;
        }
    }
    origSeq.length = unexPos;

    origSeq.PrintSeq(seqOut);
    return 0;
}
Beispiel #2
0
int FASTAReader::ConcatenateNext(FASTASequence &cur) {
    FASTASequence next;
    int retVal;	
    if ((retVal = GetNext(next))) {
        next.CleanupASCII();
        cur.Concatenate((Nucleotide*) "N");
        cur.Concatenate(next);	
    }
    next.Free();
    return retVal;
}
Beispiel #3
0
int main(int argc, char* argv[]) {
	string genomeFileName, subseqFileName;
	if (argc != 3) {
		cout << "usage: extractRepeats genome repeat" << endl;
		exit(0);
	}

	genomeFileName = argv[1];
	subseqFileName = argv[2];

	FASTASequence genome, sub;
	FASTAReader reader;
	reader.Init(genomeFileName);
	reader.GetNext(genome);
	reader.Init(subseqFileName);
	reader.GetNext(sub);

	genome.ToUpper();
	sub.ToUpper();	
	DNALength genomePos;
	FASTASequence genomeSub;
	int kband = (int) (0.15) * sub.length;
	vector<int> scoreMat;
	vector<Arrow> pathMat;
	int readIndex = 0;
	cout << "starting extraction" << endl;
	for (genomePos = 0; genomePos < genome.length - sub.length + 1; genomePos++) {
		genomeSub.seq = &genome.seq[genomePos];
		genomeSub.length = sub.length;
		int alignScore;
		Alignment alignment;
		alignScore = SWAlign(genomeSub, sub,
												 EditDistanceMatrix, 1, //1,kband,
												 scoreMat, pathMat,
												 alignment, QueryFit);
												 
		if (alignScore < 0.25 * sub.length) {
			stringstream titlestrm;
			titlestrm << readIndex << "|" 
								 << genomePos << "|"
								<< genomePos + sub.length << " " << alignScore/ (1.0*sub.length);
			FASTASequence subcopy;
			subcopy.CopyTitle(titlestrm.str());
			subcopy.seq = &genome.seq[genomePos];
			subcopy.length = sub.length;
			subcopy.PrintSeq(std::cout);
			genomePos += sub.length;
		}
	}
}
Beispiel #4
0
void FASTAReader::ReadTitle(GenomeLength &p, FASTASequence & seq) {
    char * seqTitle = NULL;
    int seqTitleLen; 
    ReadTitle(p, seqTitle, seqTitleLen);
    seq.CopyTitle(seqTitle, seqTitleLen);
    if (seqTitle) {delete [] seqTitle;}
}
// Create a reverse complement FASTASequence of *this and assign to rhs.
void FASTASequence::MakeRC(FASTASequence &rhs, DNALength rhsPos, DNALength rhsLength) {
    rhs.Free();
    DNASequence::MakeRC((DNASequence&) rhs, rhsPos, rhsLength);
    if (title != NULL) {
        (static_cast<FASTASequence*>(&rhs))->CopyTitle(title);
    }
}
Beispiel #6
0
int main(int argc, char* argv[]) {
		if (argc < 4) {
			cout << "usage: splitContigs in.fa contiglength out" << endl;
			exit(1);
		}
		string inFileName, outFileName;
		inFileName = argv[1];
		int contigLength = atoi(argv[2]);		
		outFileName = argv[3];

		ofstream seqOut;
		CrucialOpen(outFileName, seqOut, std::ios::out);
		FASTAReader reader;
		reader.Init(inFileName);
		FASTASequence seq;
		DNALength curOffset;
		
		while(reader.GetNext(seq)) {
			FASTASequence subseq;
			int i;
			curOffset = 0;
			for (i =0 ; i < seq.length / contigLength + 1; i++ ) {
				subseq.seq = &seq.seq[curOffset];
				subseq.title = seq.title;
				if (curOffset + contigLength > seq.length) {
					subseq.length = seq.length - curOffset;
				}
				else {
					subseq.length = contigLength;
				}
				subseq.PrintSeq(seqOut);
				curOffset += contigLength;
			}
		}
		return 0;
}
Beispiel #7
0
int FASTAReader::GetNext(FASTASequence &seq) {
    if (curPos == fileSize) {
        return 0;
    }

    seq.Free(); //Free seq before read

    // 
    // Extract the title of the current record.
    //
    GenomeLength p = curPos;

    AdvanceToTitleStart(p);

    // 
    // Make sure there is a '>'
    //
    CheckValidTitleStart(p);

    ReadTitle(p, seq); 

    //
    // Read in the next sequence.
    //

    // Count the length of the sequence.

    GenomeLength seqLength = 0;
    curPos = p;
    char c;
    while (p < fileSize and
            (c = filePtr[p]) != endOfReadDelim) {
        if (c != ' ' and
                c != '\t' and
                c != '\n' and
                c != '\r') {
            seqLength++;
        }
        p++;
    }
    if (seqLength > UINT_MAX) {
        cout << "ERROR! Reading sequences stored in more than 4Gbytes of space is not supported." << endl;
        exit(1);
    }

    seq.length = 0;
    if (seqLength > 0) {
        seq.length = seqLength;
        seq.seq = ProtectedNew <Nucleotide>(seqLength+padding+1);
        p = curPos;
        seq.deleteOnExit = true;
        GenomeLength s = 0;
        while (p < fileSize and
                (c = filePtr[p]) != endOfReadDelim) {
            if (c != ' ' and
                    c != '\t' and
                    c != '\n' and
                    c != '\r') {
                seq.seq[s] = convMat[static_cast<unsigned char>(filePtr[p])];
                s++;
            }
            p++;
        }
        seq.seq[seqLength] = 0;
    }
    curPos = p;

    if (computeMD5) {
        MakeMD5((const char*) &seq.seq, seq.length, curReadMD5);
    }
    return 1;
}
Beispiel #8
0
GenomeLength FASTAReader::ReadAllSequencesIntoOne(FASTASequence &seq, SequenceIndexDatabase<FASTASequence> *seqDBPtr) {
    seq.Free();
    GenomeLength p = curPos;
    AdvanceToTitleStart(p);
    CheckValidTitleStart(p);
    ReadTitle(p, seq); 
     
    if (seq.title == NULL) {
        cout << "ERROR, sequence must have a nonempty title." << endl;
        exit(1);
    }
    if (seqDBPtr != NULL) {
        seqDBPtr->growableName.push_back(seq.title);
    }
    GenomeLength seqLength;
    seqLength = fileSize - p;
    GenomeLength memorySize = seqLength+padding+1;

    if (memorySize > UINT_MAX) {
        cout << "ERROR! Reading fasta files greater than 4Gbytes is not supported." << endl;
        exit(1);
    }
    seq.Resize(memorySize);
    GenomeLength i;
    i = 0L;
    for (; p < fileSize; p++, i++ ) {
        seq.seq[i] = filePtr[p];
    }
    i = p = 0;
    while (p < seqLength) {
        //
        // If this is the beginning of another read, add an 'N' 
        // to delineate spaces between reads.
        //

        while (p < seqLength and
                (seq.seq[p] == ' ' or
                 seq.seq[p] == '\n' or 
                 seq.seq[p] == '\t' or
                 seq.seq[p] == '\r')) {
            p++;
        }
        if (p < seqLength and seq.seq[p] == '>') {
            seq.seq[i] = 'N';

            GenomeLength titleStartPos = p+1;
            i++;
            while (p < seqLength and seq.seq[p] != '\n') p++;
            if (seqDBPtr != NULL and p < seqLength) {
                string title;
                GenomeLength tp;
                for (tp = titleStartPos; tp < p; tp++) {
                    title.push_back(seq.seq[tp]);
                }

                seqDBPtr->growableName.push_back(title);
                seqDBPtr->growableSeqStartPos.push_back(i);
                int nSeq = seqDBPtr->growableSeqStartPos.size();
                if (nSeq > 1 and computeMD5) {
                    string md5Str;
                    MakeMD5((const char*) &seq.seq[seqDBPtr->growableSeqStartPos[nSeq-2]],
                            seqDBPtr->growableSeqStartPos[nSeq-1] - seqDBPtr->growableSeqStartPos[nSeq-2] - 1,
                            md5Str);
                    seqDBPtr->md5.push_back(md5Str);
                }
            }
        }
        else if (p < seqLength)  {
            // Otherwise, p may be at whitespace
            // advance past that as well.
            seq.seq[i] = convMat[seq.seq[p]];
            i++;
            p++;
        }
    }
    if (i > UINT_MAX) {
        cout << "ERROR! Sequences greater than 4Gbase are not supported." << endl;
        exit(1);
    }
    //
    // Append an 'N' at the end of the last sequence for consistency
    // between different orderings of reference input. 
    //
    seq.seq[i] = 'N';
    i++;
    seq.length = i;
    // fill padding.
    for (; i < memorySize; i++ ){
        seq.seq[i] = 0;
    }
    seq.deleteOnExit = true;
    if (seqDBPtr != NULL) {
        seqDBPtr->growableSeqStartPos.push_back(seq.length);
        int nSeq = seqDBPtr->growableSeqStartPos.size();
        if (nSeq > 1 and computeMD5) {
            string md5Str;
            MakeMD5((const char*) &seq.seq[seqDBPtr->growableSeqStartPos[nSeq-2]],
                    seqDBPtr->growableSeqStartPos[nSeq-1] - seqDBPtr->growableSeqStartPos[nSeq-2] - 1,
                    md5Str);
            seqDBPtr->md5.push_back(md5Str);
        }
        seqDBPtr->Finalize();
    }
    return seq.length;
}
Beispiel #9
0
int main(int argc, char* argv[]) {
	string ad1File, ad2File, readsFile, readsOutFile;

	FASTAReader ad1Reader;
	FASTAReader ad2Reader;
	FASTAReader reader;


	CommandLineParser cl;
	float minPctSimilarity = 0.60;
	int indel = 3;
	int minLength = 10;
	cl.RegisterStringOption("ad1", &ad1File, "FASTA file with the first adapter");
	cl.RegisterStringOption("ad2", &ad2File, "FASTA file with the second adapter");
	cl.RegisterStringOption("reads", &readsFile, "FASTA file with SMRTBell reads");
	cl.RegisterStringOption("readsout", &readsOutFile, "output file for split reads");
	cl.RegisterPreviousFlagsAsHidden();
	cl.RegisterFloatOption("pctSim", &minPctSimilarity, "Minimum percent similarity to trigger a match to an adapter.", 
												 CommandLineParser::PositiveFloat);
	cl.RegisterIntOption("indel", &indel, "Penalty for indel (positive)", CommandLineParser::NonNegativeInteger);
	cl.RegisterIntOption("minLength", &minLength, "Minimum length pass to retain.", CommandLineParser::PositiveInteger);
	vector<string> opts;
	cl.ParseCommandLine(argc, argv, opts);

	/*
	 * Open all the required files, quitting if they are unavailable.
	 */

	ad1Reader.Init(ad1File);
	ad2Reader.Init(ad2File);
	reader.Init(readsFile);

	ofstream splitOut;
	CrucialOpen(readsOutFile, splitOut);

	FASTASequence ad1, ad2;
	ad1Reader.GetNext(ad1);
	ad2Reader.GetNext(ad2);

	FASTASequence read;
	vector<int> scoreMat;
	vector<Arrow> pathMat;
	int readIndex = 0;
	while(reader.GetNext(read)) {
		read.ToUpper();
		//
		// Do a fitting sequence alignment to match one of the two 
		// adapters into the read.
		//
		vector<int> passStarts, passLengths, la;
		read.PrintSeq(cout);
		SplitRead(read, 0, read.length, ad1, ad2,
							indel, 
							passStarts, passLengths,la, 0,
							scoreMat, pathMat, minPctSimilarity, minLength);
		int i;
		for (i = 0; i < passStarts.size(); i++) {
			cout << "read: " << readIndex << " pass: "******" " << passStarts[i] << " " << passLengths[i] << " " << la[i] << endl;
		}
		++readIndex;
	}
}
Beispiel #10
0
int main(int argc, char* argv[]) {
	CommandLineParser clp;

	string refGenomeName;
	string mutGenomeName;
  string gffFileName;
	float insRate = 0;
	float delRate = 0;
	float mutRate = 0;
  bool  lower = false;
  gffFileName = "";
	clp.RegisterStringOption("refGenome", &refGenomeName, "Reference genome.", true);
	clp.RegisterStringOption("mutGenome", &mutGenomeName, "Mutated genome.", true);
	clp.RegisterPreviousFlagsAsHidden();
  clp.RegisterStringOption("gff", &gffFileName, "GFF file describing the modifications made to the genome.");
	clp.RegisterFloatOption("i", &insRate, "Insertion rate: (0-1].", 
													CommandLineParser::NonNegativeFloat, false);
	clp.RegisterFloatOption("d", &delRate, "Deletion rate: (0-1]", 
													CommandLineParser::NonNegativeFloat, false);
	clp.RegisterFloatOption("m", &mutRate, "Mutation rate, even across all nucleotides: (0-1]", 
													CommandLineParser::NonNegativeFloat, false);
  clp.RegisterFlagOption("lower", &lower, "Make mutations in lower case", false);
	vector<string> leftovers;
	clp.ParseCommandLine(argc, argv, leftovers);
  
	FASTAReader reader;
	FASTASequence refGenome;

	reader.Init(refGenomeName);
	ofstream mutGenomeOut;
	CrucialOpen(mutGenomeName, mutGenomeOut, std::ios::out);
  ofstream gffOut;
  if (gffFileName != "") {
    CrucialOpen(gffFileName, gffOut, std::ios::out);
  }

	vector<int> insIndices, delIndices, subIndices;
	int readIndex = 0;
	InitializeRandomGeneratorWithTime();
	while (reader.GetNext(refGenome)) {
		insIndices.resize(refGenome.length);
		delIndices.resize(refGenome.length);
    subIndices.resize(refGenome.length);
		std::fill(insIndices.begin(), insIndices.end(), false);
		std::fill(delIndices.begin(), delIndices.end(), false);
    std::fill(subIndices.begin(), subIndices.end(), 0);

		enum ChangeType { Ins, Del, Mut, None};
		float changeProb[4];
		changeProb[Ins] = insRate;
		changeProb[Del] = changeProb[Ins] + delRate;
		changeProb[Mut] = changeProb[Del] + mutRate;
		changeProb[None] = 1;

		if (changeProb[Mut] > 1) {
			cout << "ERROR! The sum of the error probabilities must be less than 1" << endl;
			exit(1);
		}
		DNALength pos;
		float randomNumber;
		int numIns = 0;
		int numDel = 0;
		int numMut = 0;
		for (pos =0 ; pos < refGenome.length; pos++) { 
			randomNumber = Random();
			if (randomNumber < changeProb[Ins]) {
				insIndices[pos] = true;
				numIns++;
			}
			else if (randomNumber < changeProb[Del]) {
				delIndices[pos] = true;
				numDel++;
			}
			else if (randomNumber < changeProb[Mut]){ 
				Nucleotide newNuc = TwoBitToAscii[RandomInt(4)];
				int maxIts = 100000;
				int it = 0;
				while (newNuc == refGenome.seq[pos]) {
					newNuc = TwoBitToAscii[RandomInt(4)];
					if (it == maxIts) {
						cout << "ERROR, something is wrong with the random number generation, it took too many tries to generate a new nucleotide" << endl;
						exit(1);
					}
				}
        subIndices[pos] = refGenome[pos];
				refGenome.seq[pos] = ToLower(newNuc,lower);
				++numMut;
			}
		}
		//		cout << readIndex << " m " << numMut << " i " << numIns << " d " << numDel << endl;
		if (readIndex % 100000 == 0 && readIndex > 0) {
			cout << readIndex << endl;
		}
		// 
		// Now add the insertions and deletions.
		//
		FASTASequence newSequence;
		DNALength   newPos;
		if (numIns - numDel + refGenome.length < 0) {
			cout << "ERROR, the genome has been deleted to nothing." << endl;
			exit(1);
		}
		ResizeSequence(newSequence, refGenome.length + (numIns - numDel));
		newPos = 0;
		pos = 0;
		for (pos = 0; pos < refGenome.length; pos++) {
			assert(newPos < newSequence.length or delIndices[pos] == true);
      if (subIndices[pos] != 0 and gffFileName != "") {
        gffOut << refGenome.GetName() << "	.	SNV	" << newPos << " " << newPos <<" 0.00	.	.	reference=" << (char)subIndices[pos] << ";confidence=10;Name=" << newPos << (char)subIndices[pos] << ">" << refGenome.seq[pos] <<";coverage=10;variantseq=" << refGenome.seq[pos] << endl;
      }
        
			if (insIndices[pos] == true) {
				newSequence.seq[newPos] = ToLower(TwoBitToAscii[RandomInt(4)], lower);
				newPos++;
				newSequence.seq[newPos] = refGenome.seq[pos];
        
				assert(newSequence.seq[newPos] != '1');
				assert(newSequence.seq[newPos] != 1);
        if (gffFileName != "") {
          gffOut << refGenome.GetName() << "	.	deletion	" << newPos << " " << newPos << " 0.00	.	.	reference=" << newSequence.seq[newPos] << ";length=1;confidence=10;coverage=0;Name="<< newPos << "del" << newSequence.seq[newPos] << endl;
        }
				newPos++;
			}
			else if (delIndices[pos] == true) {
				// no-op, skip
        if (gffFileName != "") {
          gffOut << refGenome.GetName() << "	.	insertion	" << newPos << " " << newPos << " 0.00	.	.	confidence=10;Name=" << newPos << "_ins" << refGenome.seq[pos] << ";reference=.;length=1;coverage=0;variantseq=" << refGenome.seq[newPos] << endl;
//ref000001	.	deletion	20223	20223	0.00	.	.	reference=T;length=1;confidence=0;coverage=0;Name=20222delT
        }
			}
			else {
				newSequence.seq[newPos] = refGenome.seq[pos];
				newPos++;
			}
		}
		stringstream titlestrm;
		titlestrm << " mutated ins " << insRate << " del " << delRate << " mut " << mutRate;
		newSequence.CopyTitle(refGenome.title);
		newSequence.AppendToTitle(titlestrm.str());
		newSequence.PrintSeq(mutGenomeOut);
    newSequence.Free();
		readIndex++;
	}
}
Beispiel #11
0
int main(int argc, char* argv[]) {
	string genomeFileName;
	string suffixArrayFileName;
	if (argc < 4) {
		cout << "Usage: printWordCount genome suffixArray k [k2 k3 k4...]" << endl;
		exit(1);
	}
	genomeFileName = argv[1];
	suffixArrayFileName = argv[2];
	int argi = 3;
	vector<DNALength> k;
	while (argi < argc) {
		k.push_back(atoi(argv[argi]));
		argi++;
	}

	// Get the ref sequence.
	FASTAReader reader;
	reader.Init(genomeFileName);
	FASTASequence seq;
  //	reader.GetNext(seq);
  reader.ReadAllSequencesIntoOne(seq);
	seq.ToUpper();
	// Get the suffix array.
	DNASuffixArray sarray;
	sarray.Read(suffixArrayFileName);
	
	int ki;
  char *word;
  cout << "wordlen word nword" << endl;
	for (ki = 0; ki < k.size(); ki++) {
    word = new char[k[ki]+1];
    word[k[ki]] = '\0';
		DNALength i;
		DNALength numUnique = 0;
		for (i = 0; i < seq.length - k[ki] - 1; ) {
			DNALength j = i + 1;
      bool seqAtN = false;
      int si;
      for(si = 0; si < k[ki]; si++) {
        if (seq.seq[sarray.index[i] + si] == 'N') {
          seqAtN = true;
          break;
        }
      }
      if (seqAtN) {
        i++;
        continue;
      }
			while (j < seq.length - k[ki] and 
						 seq.length - sarray.index[i] >= k[ki] and
						 seq.length - sarray.index[j] >= k[ki] and 
						 strncmp((const char*) &seq.seq[sarray.index[i]], (const char*) &seq.seq[sarray.index[j]], k[ki]) == 0) {
				j++;
			}
      if (seq.length - sarray.index[i] >= k[ki]) {
        for(si = 0; si < k[ki]; si++) {
          word[si] = seq.seq[sarray.index[i]+si];
        }
        cout << k[ki] << " " << word << " " << j - i + 1 << endl;
        if (j == i + 1) { 
          ++numUnique;
        }
      }
			i = j;
		}
	}
}
int main(int argc, char* argv[]) {
  string gencodeGffFileName, genomeFileName, genesOutFileName;
  string geneType = "protein_coding";
  bool randomSplicing = false;
  int numRandomSplicing = 1;
  float pSkip = 0.5;
  if (argc < 4) {
    cout << "Usage: extractGenes gencodeGTFFile genomeFile genesOutFileName [-geneType type (protein_coding)] [-randomSplicing] [-numRandomSplicing n] [-pSkip prob (0-1, default:0.5)]" << endl;
    exit(1);
  }

  gencodeGffFileName = argv[1];
  genomeFileName     = argv[2];
  genesOutFileName   = argv[3];

  int argi = 4;
  string coordinatesFileName;

  while (argi < argc) {
    if (strcmp(argv[argi], "-geneType") == 0) {
      geneType = argv[++argi];
    }
    else if (strcmp(argv[argi], "-randomSplicing") == 0) {
      randomSplicing = true;
    }
    else if (strcmp(argv[argi], "-numRandomSplicing") == 0) {
      numRandomSplicing = atoi(argv[++argi]);
    }
    else if (strcmp(argv[argi], "-pSkip") == 0) {
      pSkip = atof(argv[++argi]);
    }
    else {
      cout << "ERROR, bad option  " << argv[argi] << endl;
      exit(1);
    }
    ++argi;
  }

  coordinatesFileName = genesOutFileName;
  coordinatesFileName.append(".pos");
  FASTAReader reader;
  reader.Initialize(genomeFileName);

  ofstream outFile, coordsFile;
  CrucialOpen(genesOutFileName, outFile, std::ios::out);

  string coordsFileName = genesOutFileName + ".coords";
  CrucialOpen(coordsFileName, coordsFile, std::ios::out);

  vector<FASTASequence> referenceSequences;
  reader.ReadAllSequences(referenceSequences);
  int i;
  map<string, int> titleToIndex;
  for (i = 0; i < referenceSequences.size(); i++) {
    titleToIndex[referenceSequences[i].title] = i;
  }

  GencodeGFFFile gencodeFile;
  gencodeFile.ReadAll(gencodeGffFileName);
  
  vector<GencodeGFFGene> genes;
  IndexGencodeGenes(gencodeFile, genes, geneType);

  for (i = 0; i < genes.size(); i++) {
    genes[i].OrderExonsByStart();
  }

  int e;
  for (i = 0; i < genes.size(); i++) {
    FASTASequence geneSequence;
    geneSequence.CopyTitle(genes[i].geneName);
    if (titleToIndex.find(genes[i].chromosome) == titleToIndex.end()) {
      continue;
    }
    int chrIndex = titleToIndex[genes[i].chromosome];
    string sequence = "";
    //
    // Do nothing with 0 length exons.
    //
    if (genes[i].exons.size() == 0) {
      continue;
    }
    vector<FASTASequence> geneSequences;
    vector<GeneCoordinates> geneCoordinates;
    genes[i].GenerateGeneSequences(referenceSequences[chrIndex], geneSequences, geneCoordinates, randomSplicing);
    int gi;
    for (gi = 0; gi < geneSequences.size(); gi++) {
      if (genes[i].GetStrand() == '+') {
        geneSequences[gi].PrintSeq(outFile);
      }
      else {
        FASTASequence rc;
        geneSequences[gi].MakeRC(rc);
        rc.PrintSeq(outFile);
        rc.Free();
      }
      coordsFile << geneSequences[gi].title << " " << geneCoordinates[gi].chromosome << " " << geneCoordinates[gi].exonCoordinates.size() << " " << geneCoordinates[gi].strand;
      int i;
      for (i = 0; i < geneCoordinates[gi].exonCoordinates.size(); i++) {
        coordsFile << " " 
                   << geneCoordinates[gi].exonCoordinates[i].start << " "  
                   << geneCoordinates[gi].exonCoordinates[i].end << " ";
      }
      coordsFile << endl;
      geneSequences[gi].Free();
    }
    // 
    // No need to free the seq, since it is controlled by the string.
    //
  }
  coordsFile.close();
  
}
Beispiel #13
0
int main(int argc, char* argv[]) {
    string refGenomeFileName = "";
    string lengthModelFileName = "";
    string outputModelFileName = "";
    DNALength numBasesPerFile = 0;
    string sourceReadsFileName = "";
    string titleTableFileName = "";
    int numBasH5Files = 1;
    string basH5BaseFileName = "simulated";
    string movieName = "m101211_092754_00114_cSIM_s1_p0";
    bool   doRandGenInit = true;
    bool   usePosMap     = false;
    bool   printPercentRepeat = false;
    string posMapFileName = "";
    vector<string> movieNames;
    bool useLengthModel = false;
    bool useFixedLength = false;
    ofstream posMapFile;
    int scaledLength = 0;
    int fixedLength = 0;
    int nBasFiles = 1;
    bool useLengthsModel = true;
    bool printHelp = false;


    //  Look to see if the refAsReads flag is specified anywhere before
    //  parsing the command line.

    CommandLineParser clp;
    string commandLine;
    string helpString;
    SetHelp(helpString);
    vector<string> fns;

    clp.RegisterStringOption("genome", &refGenomeFileName, "");
    clp.RegisterIntOption("numBasesPerFile", (int*)&numBasesPerFile, "",
                          CommandLineParser::PositiveInteger);
    clp.RegisterStringOption("sourceReads", &sourceReadsFileName, "");
    clp.RegisterStringOption("lengthModel", &lengthModelFileName, "");
    clp.RegisterIntOption("fixedLength", &fixedLength, "",
                          CommandLineParser::PositiveInteger);
    clp.RegisterFlagOption("lengthModel", &useLengthModel, "");
    clp.RegisterStringOption("movieName", &movieName, "");
    clp.RegisterStringOption("titleTable", &titleTableFileName, "");
    clp.RegisterStringOption("baseFileName", &basH5BaseFileName, "");
    clp.RegisterIntOption("nFiles", &nBasFiles, "",
                          CommandLineParser::PositiveInteger);
    clp.RegisterIntOption("meanLength", &scaledLength, "",
                          CommandLineParser::PositiveInteger);
    clp.RegisterStringOption("posMap", &posMapFileName, "");
    clp.RegisterFlagOption("printPercentRepeat", &printPercentRepeat, "");
    clp.RegisterFlagOption("h", &printHelp, "");

    clp.SetHelp(helpString);
    clp.ParseCommandLine(argc, argv, fns);
    clp.CommandLineToString(argc, argv, commandLine);

    clp.SetProgramName("alchemy");

    outputModelFileName = fns[0];
    if (argc <= 1 or printHelp or outputModelFileName == "") {
        cout << helpString << endl;
        exit(0);
    }

    if (usePosMap) {
        CrucialOpen(posMapFileName, posMapFile, std::ios::out);
    }

    if (sourceReadsFileName == "" and fixedLength == 0) {
        useLengthModel = true;
    }

    if (useLengthModel and fixedLength != 0) {
        cout << "ERROR! You must either use a length model or a fixed length." << endl;
        exit(1);
    }

    if (sourceReadsFileName == "" and numBasesPerFile == 0) {
        cout << "ERROR! You must specify either a set of read to use as " << endl
             << "original reads for simulation or the total number of bases " << endl
             << "to simulate in each bas.h5 file." << endl;
        exit(1);
    }

    if (sourceReadsFileName == "" and refGenomeFileName == "") {
        cout << "ERROR! You must specify a genome to sample reads from or a set of read "<<endl
             << "to use as original reads for simulation." << endl;
        exit(1);
    }

    if (fixedLength != 0 and refGenomeFileName == "") {
        cout << "ERROR! You must specify a genome file if using a fixed length." << endl;
        exit(1);
    }

    if ((fixedLength != 0 or scaledLength != 0) and sourceReadsFileName != "") {
        cout << "ERROR! You cannot specify a fixed length nor mean length with a source " << endl
             << "reads file.  The read lengths are taken from the source reads or the length model." << endl;
        exit(1);
    }

    LengthHistogram   lengthHistogram;
    OutputSampleListSet   outputModel(0);
    TitleTable titleTable;

    if (doRandGenInit) {
        InitializeRandomGeneratorWithTime();
    }

    //
    // Read models.
    //
    if (titleTableFileName != "") {
        titleTable.Read(titleTableFileName);
    }


    outputModel.Read(outputModelFileName);

    if (useLengthModel) {
        lengthHistogram.BuildFromAlignmentLengths(outputModel.lengths);
    }


    vector<int> alignmentLengths;
    int meanAlignmentLength;


    if (scaledLength != 0 and useLengthModel) {
        //
        // Scale the histogram so that the average length is 'scaledLength'.
        //

        // 1. Integrate histogram
        long totalLength = 0;
        long totalSamples = 0;
        int hi;
        for (hi = 0; hi < lengthHistogram.lengthHistogram.cdf.size()-1; hi++) {
            int ni;
            ni = lengthHistogram.lengthHistogram.cdf[hi+1] - lengthHistogram.lengthHistogram.cdf[hi];
            totalLength += ni * lengthHistogram.lengthHistogram.data[hi];
        }
        totalSamples = lengthHistogram.lengthHistogram.cdf[lengthHistogram.lengthHistogram.cdf.size()-1];

        float meanSampleLength = totalLength / (1.0*totalSamples);
        float fractionIncrease = scaledLength / meanSampleLength;

        for (hi = 0; hi < lengthHistogram.lengthHistogram.cdf.size(); hi++) {
            lengthHistogram.lengthHistogram.data[hi] *= fractionIncrease;
        }
    }

    FASTAReader inReader, seqReader;
    vector<FASTASequence> reference;
    DNALength refLength = 0;
    int i;
    if (refGenomeFileName != "") {
        inReader.Init(refGenomeFileName);
        inReader.ReadAllSequences(reference);

        for (i = 0; i < reference.size(); i++) {
            refLength += reference[i].length;
        }
    }

    if (sourceReadsFileName !=  "") {
        seqReader.Init(sourceReadsFileName);
    }

    ofstream readsFile;

    //
    // Create and simulate bas.h5 files.
    //
    int baseFileIndex;
    bool readsRemain = true;
    for (baseFileIndex = 0; ((sourceReadsFileName == "" and baseFileIndex < nBasFiles)  // case 1 is reads are generated by file
                             or (sourceReadsFileName != "" and readsRemain)); // case 2 is reads are generated by an input file.
            baseFileIndex++) {
        //
        // Prep the base file for writing.
        //
        stringstream fileNameStrm, movieNameStrm;
        //string movieName = "m000000_000000_00000_cSIMULATED_s";
        movieNameStrm << movieName << baseFileIndex << "_p0";
        string fullMovieName = movieNameStrm.str();
        fileNameStrm  << fullMovieName <<  ".bas.h5";


        HDFBasWriter basWriter;
        HDFRegionTableWriter regionWriter;
        //
        // This is mainly used to create the atributes.
        //
        RegionTable regionTable;
        regionTable.CreateDefaultAttributes();

        basWriter.SetPlatform(Springfield);
        //
        // Use a fixed set of fields for now.
        //

        // These are all pulled from the outputModel.
        basWriter.IncludeField("Basecall");
        basWriter.IncludeField("QualityValue");
        basWriter.IncludeField("SubstitutionQV");
        basWriter.IncludeField("SubstitutionTag");
        basWriter.IncludeField("InsertionQV");
        basWriter.IncludeField("DeletionQV");
        basWriter.IncludeField("DeletionTag");
        basWriter.IncludeField("WidthInFrames");
        basWriter.IncludeField("PreBaseFrames");
        basWriter.IncludeField("PulseIndex");

        vector<unsigned char> qualityValue, substitutionQV, substitutionTag, insertionQV, deletionQV, deletionTag;
        vector<HalfWord> widthInFrames, preBaseFrames, pulseIndex;

        // Just go from 0 .. hole Number
        basWriter.IncludeField("HoleNumber");
        // Fixed to 0.
        basWriter.IncludeField("HoleXY");
        if (usePosMap == false) {
            basWriter.IncludeField("SimulatedSequenceIndex");
            basWriter.IncludeField("SimulatedCoordinate");
        }
        basWriter.SetChangeListID("1.3.0.50.104380");


        DNALength numSimulatedBases  = 0;
        FASTASequence sampleSeq;
        //sampleSeq.length = readLength;
        int maxRetry = 10000000;
        int retryNumber = 0;
        int numReads = 0;
        int readLength = 0;

        while (numBasesPerFile == 0 or numSimulatedBases < numBasesPerFile) {
            DNALength seqIndex, seqPos;
            if (useLengthModel or fixedLength) {
                if (useLengthModel) {
                    lengthHistogram.GetRandomLength(readLength);
                }
                else {
                    readLength = fixedLength;
                }
            }
            if (refGenomeFileName != "") {
                FindRandomPos(reference, seqIndex, seqPos, readLength + (outputModel.keyLength - 1));
                sampleSeq.seq    = &reference[seqIndex].seq[seqPos];
                sampleSeq.length = readLength + (outputModel.keyLength - 1);
                assert(reference[seqIndex].length >= sampleSeq.length);
            }
            else if (sourceReadsFileName != "") {
                if (seqReader.GetNext(sampleSeq) == false) {
                    readsRemain = false;
                    break;
                }
                if (sampleSeq.length < outputModel.keyLength) {
                    continue;
                }
                //
                // Now attempt to parse the position from the fasta title.
                //

                if (useLengthModel) {
                    int tryNumber = 0;
                    readLength = 0;
                    int maxNTries = 1000;
                    int tryBuffer[5] = {-1,-1,-1,-1,-1};
                    while (tryNumber < maxNTries and readLength < outputModel.keyLength) {
                        lengthHistogram.GetRandomLength(readLength);
                        readLength = sampleSeq.length = min(sampleSeq.length, (unsigned int) readLength);
                        tryBuffer[tryNumber%5] = readLength;
                        tryNumber++;
                    }
                    if (tryNumber >= maxNTries) {
                        cout << "ERROR. Could not generate a read length greater than the " << outputModel.keyLength << " requried " <<endl
                             << "minimum number of bases using the length model specified in the alchemy." <<endl
                             << "model.  Something is either wrong with the model or the context length is too large." <<endl;
                        cout << "The last few tries were: " << tryBuffer[0] << " " << tryBuffer[1] << " " << tryBuffer[2] << " " << tryBuffer[3] << " " << tryBuffer[4] << endl;
                        exit(1);
                    }
                }

                readLength = sampleSeq.length;
                vector<string> tokens;
                Tokenize(sampleSeq.title, "|", tokens);
                if (tokens.size() == 4) {
                    seqPos = atoi(tokens[2].c_str());
                    if (titleTableFileName == "") {
                        seqIndex = 0;
                    }
                    else {
                        int index;
                        titleTable.Lookup(tokens[1], index);
                        seqIndex = index;
                    }
                }
                else {
                    seqPos   = 0;
                }
            }

            //
            // If this is the first read printed to the base file, initialize it.
            //
            if (numSimulatedBases == 0) {
                basWriter.Initialize(fileNameStrm.str(), movieNameStrm.str(), Springfield);
                regionWriter.Initialize(basWriter.pulseDataGroup);
            }

            numSimulatedBases += readLength;

            int p;
            // create the sample sequence
            int contextLength = outputModel.keyLength;
            int contextMiddle = contextLength / 2;
            string outputString;

            int nDel = 0;
            int nIns = 0;

            //
            // Simulate to beyond the sample length.
            //
            qualityValue.clear();
            substitutionQV.clear();
            substitutionTag.clear();
            insertionQV.clear();
            deletionQV.clear();
            deletionTag.clear();
            pulseIndex.clear();
            widthInFrames.clear();
            preBaseFrames.clear();
            assert(sampleSeq.length > contextMiddle + 1);
            for (p = contextMiddle;
                    p < sampleSeq.length - contextMiddle - 1; p++) {
                string refContext;
                refContext.assign((const char*) &sampleSeq.seq[p-contextMiddle], contextLength);

                string outputContext;
                int    contextWasFound;
                OutputSample sample;
                int i;
                for (i = 0; i < refContext.size(); i++) {
                    refContext[i] = toupper(refContext[i]);
                }
                outputModel.SampleRandomSample(refContext, sample);

                if (sample.type == OutputSample::Deletion ) {
                    //
                    // There was a deletion.  Advance in reference, then output
                    // the base after the deletion.
                    //
                    p++;
                    ++nDel;
                }

                int cp;
                //
                // Add the sampled context, possibly multiple characters because of an insertion.
                //
                for (i = 0; i < sample.nucleotides.size(); i++) {
                    outputString.push_back(sample.nucleotides[i]);
                    qualityValue.push_back(sample.qualities[i].qv[0]);
                    deletionQV.push_back(sample.qualities[i].qv[1]);
                    insertionQV.push_back(sample.qualities[i].qv[2]);
                    substitutionQV.push_back(sample.qualities[i].qv[3]);
                    deletionTag.push_back(sample.qualities[i].tags[0]);
                    substitutionTag.push_back(sample.qualities[i].tags[1]);
                    pulseIndex.push_back(sample.qualities[i].frameValues[0]);
                    preBaseFrames.push_back(sample.qualities[i].frameValues[1]);
                    widthInFrames.push_back(sample.qualities[i].frameValues[2]);
                }
                nIns += sample.qualities.size() - 1;
            }
            if (outputString.find('N') != outputString.npos or
                    outputString.find('n') != outputString.npos) {
                cout << "WARNING!  The sampled string " << endl << outputString << endl
                     << "should not contain N's, but it seems to.  This is being ignored "<<endl
                     << "for now so that simulation may continue, but this shouldn't happen"<<endl
                     << "and is really a bug." << endl;
                numSimulatedBases -= readLength;
                continue;
            }
            //
            // Ok, done creating the read, now time to create some quality values!!!!!
            //
            SMRTSequence read;
            read.length = outputString.size();
            read.Allocate(read.length);
            memcpy(read.seq, outputString.c_str(), read.length * sizeof(unsigned char));
            assert(qualityValue.size() == read.length * sizeof(unsigned char));
            memcpy(read.qual.data, &qualityValue[0], read.length * sizeof(unsigned char));
            memcpy(read.deletionQV.data, &deletionQV[0], read.length * sizeof(unsigned char));
            memcpy(read.insertionQV.data, &insertionQV[0], read.length * sizeof(unsigned char));
            memcpy(read.substitutionQV.data, &substitutionQV[0], read.length * sizeof(unsigned char));
            memcpy(read.deletionTag, &deletionTag[0], read.length * sizeof(unsigned char));
            memcpy(read.substitutionTag, &substitutionTag[0], read.length * sizeof(unsigned char));
            memcpy(read.pulseIndex, &pulseIndex[0], read.length * sizeof(int));
            memcpy(read.preBaseFrames, &preBaseFrames[0], read.length * sizeof(HalfWord));
            memcpy(read.widthInFrames, &widthInFrames[0], read.length * sizeof(HalfWord));

            //
            // The pulse index for now is just fake data.
            //
            int i;
            for (i = 0; i < read.length; i++) {
                read.pulseIndex[i] = 1;
            }
            read.xy[0] = seqIndex;
            read.xy[1] = seqPos;
            read.zmwData.holeNumber = numReads;

            basWriter.Write(read);
            // Record where this was simulated from.
            if (usePosMap == false) {
                basWriter.WriteSimulatedCoordinate(seqPos);
                basWriter.WriteSimulatedSequenceIndex(seqIndex);
            }
            else {
                posMapFile << fullMovieName << "/" << numReads << "/0_" << read.length << " " << seqIndex << " "<< seqPos;
                if (printPercentRepeat) {
                    DNALength nRepeat = sampleSeq.GetRepeatContent();
                    posMapFile << " " << nRepeat*1.0/sampleSeq.length;
                }
                posMapFile << endl;
            }
            RegionAnnotation region;
            region.row[0] = read.zmwData.holeNumber;
            region.row[1] = 1;
            region.row[2] = 0;
            region.row[3] = read.length;
            region.row[4] = 1000; // Should be enough.
            regionWriter.Write(region);
            region.row[1] = 2; // Rewrite for hq region encompassing everything.
            regionWriter.Write(region);
            if (sourceReadsFileName != "") {
                sampleSeq.Free();
            }
            read.Free();
            ++numReads;
        }
        regionWriter.Finalize(regionTable.columnNames,
                              regionTable.regionTypes,
                              regionTable.regionDescriptions,
                              regionTable.regionSources);
        basWriter.Close();
        numReads = 0;
        //
        // The bas writer should automatically flush on closing.
        //
    }
    if (usePosMap) {
        posMapFile.close();
    }

    for (i = 0; i < reference.size(); i++) {
        reference[i].Free();
    }
}
Beispiel #14
0
void FASTASequence::MakeRC(FASTASequence &rhs, DNALength rhsPos, DNALength rhsLength) {
    DNASequence::MakeRC((DNASequence&) rhs, rhsPos, rhsLength);
    if (title != NULL) {
        rhs.CopyTitle(title);
    }
}
Beispiel #15
0
int main(int argc, char* argv[1]) {
	if (argc < 3) {
		cout << "Usage: findUnique genome.fasta query.fasta effective_k [options]" << endl;
		cout << "  genome.fasta.sa must exist." << endl;
		cout << "  Finds sequences at least effective_k in length that are unique." << endl;
		cout << "  -max m       Allow up to m matches" << endl;
		cout << "  -minLength l Ensure the length of the match is at least this." << endl;
		cout << "  -prefix p n  Allow up to n matches across a prefix of length p" << endl;
		cout << "  -suffix s n  Allow up to n matches across a suffix of length s" << endl;
		cout << "               Prefix and suffix options override max." << endl;
		cout << "  -out file    Print queries to this output file (query.fasta.queries)" << endl;
		exit(0);
	}

	DNASuffixArray sarray;
	
	string genomeFileName = argv[1];
	string suffixArrayFileName = genomeFileName + ".sa";
	
	FASTAReader reader;
	FASTASequence genome;

	int maxN = 0;

	int prefix = 0;
	int suffix = 0;
	int prefixN = 0;
	int suffixN = 0;
	int argi = 4;
	string outputFileName = "";
	int minLength = 0;
	while (argi < argc) {
		if (strcmp(argv[argi], "-max") == 0) {
			++argi;
			maxN = atoi(argv[argi]);
		}
		else if (strcmp(argv[argi], "-prefix") == 0) {
			++argi;
			prefix = atoi(argv[argi]);
			++argi;
			prefixN = atoi(argv[argi]);
		}
		else if (strcmp(argv[argi], "-suffix") == 0) {
			++argi;
			suffix = atoi(argv[argi]);
			++argi;
			suffixN = atoi(argv[argi]);
		}
		else if (strcmp(argv[argi], "-out") == 0) {
			++argi;
			outputFileName = argv[argi];
		}
		else if (strcmp(argv[argi], "-minLength") == 0) {
			++argi;
			minLength = atoi(argv[argi]);
		}
		++argi;
	}

	reader.Initialize(genomeFileName);
	reader.ReadAllSequencesIntoOne(genome);
	sarray.Read(suffixArrayFileName);

	FASTAReader queryReader;
	FASTASequence querySequence;
	string queryFileName = argv[2];
	int maxLength = atoi(argv[3]);
	string summaryTableFileName = queryFileName + ".summary";
	if (outputFileName == "") {
		outputFileName = queryFileName + ".queries";
	}
		
	
	ofstream summaryTable(summaryTableFileName.c_str());
	ofstream outputFile(outputFileName.c_str());

	queryReader.Initialize(queryFileName);

	while (queryReader.GetNext(querySequence)) {
		int i;
		cerr << "searching " << querySequence.title << endl;
		if (querySequence.length < maxLength) {
			continue;
		}

		int nMatches = 0;
		querySequence.ToUpper();
		int localMax;
		for (i = 0; i < querySequence.length - maxLength + 1; i++) {
			if ((i + 1) % 100000 == 0) {
				cerr << "processed: " << i + 1 << endl;
			}

			int lcpLength;
			vector<SAIndex> lcpLeftBounds, lcpRightBounds;
			vector<SAIndex> rclcpLeftBounds, rclcpRightBounds;
			localMax = maxN;
			if (i < prefix) {
				localMax = prefixN;
			}
			if (i >= querySequence.length - suffix) {
				localMax = suffixN;
			}
			if (querySequence.length - i <= maxLength) {
				continue;
			}
			if (querySequence.seq[i] == 'N') {
				continue;
			}
			lcpLength = sarray.StoreLCPBounds(genome.seq, genome.length, // The string which the suffix array is built on.
																				&querySequence.seq[i], querySequence.length-i,
																				true,
																				maxLength,
																				lcpLeftBounds, lcpRightBounds,
																				false);
			if (lcpLength < minLength) {
				continue;
			}
			if (lcpLength < maxLength or 
					lcpRightBounds.size() == 0 or 
					(lcpRightBounds.size() > 0 and 
					 lcpLeftBounds.size() > 0 and  
					 lcpRightBounds[lcpRightBounds.size() - 1] - lcpLeftBounds[lcpLeftBounds.size()-1] <= localMax)) {

				FASTASequence rc;
				DNASequence subseq;
				subseq.ReferenceSubstring(querySequence, i, maxLength);
				subseq.MakeRC(rc);
				int rclcpLength;
				int numForwardMatches;
				if (lcpLength == 0) {
					numForwardMatches = 0;
				}
				else {
					numForwardMatches = lcpRightBounds[lcpRightBounds.size() - 1] - lcpLeftBounds[lcpLeftBounds.size()-1];
				}
				rclcpLength = sarray.StoreLCPBounds(genome.seq, genome.length, // The string which the suffix array is built on.
																						rc.seq, maxLength,
																						true,
																						rclcpLength,
																						rclcpLeftBounds, rclcpRightBounds,
																						false);

				string rcstr((const char*)rc.seq, rc.length);

				if (rclcpLength < maxLength or 
						rclcpRightBounds.size() == 0 or
						(numForwardMatches + 
						 rclcpRightBounds[rclcpRightBounds.size() - 1] -
						 rclcpLeftBounds[rclcpLeftBounds.size()-1] <= localMax)) 
					{
						char* substr = new char[maxLength+1];
						substr[maxLength] = '\0';
						memcpy(substr, &querySequence.seq[i], maxLength);

						//						string substr = string((const char*) querySequence.seq, i, maxLength);
						
						outputFile << querySequence.title << "\t" << substr << "\t" << i << endl;

						++nMatches;
						delete[] substr;
						//					}
					}
				rc.Free();
			}

		}
		summaryTable << querySequence.title << "\t" << nMatches << endl;
		querySequence.Free();
	}
	outputFile.close();
	genome.Free();
}
int main(int argc, char* argv[]) {
  string barcodeFileName, insertFileName, outputFileName;
  if (argc != 4) {
    cout << "usage: makeBarcodeDatabase insert.fasta barcodes.fasta output.fasta" << endl;
    exit(1);
  }
  insertFileName = argv[1];
  barcodeFileName = argv[2];
  outputFileName  = argv[3];

  FASTAReader barcodeReader, insertReader;
  barcodeReader.Initialize(barcodeFileName);
  insertReader.Initialize(insertFileName);
  
  ofstream barcodedOut;
  CrucialOpen(outputFileName, barcodedOut, std::ios::out);

  vector<FASTASequence> forwardBarcodes, reverseBarcodes;
  FASTASequence barcodeSequence, reverseBarcodeSequence;
  while(barcodeReader.GetNext(barcodeSequence)) {
    forwardBarcodes.push_back(barcodeSequence);
    barcodeSequence.MakeRC(reverseBarcodeSequence);
    reverseBarcodes.push_back(reverseBarcodeSequence);
  }
  
  FASTASequence insert;
  insertReader.GetNext(insert);
  
  int i;
  for (i = 0; i < forwardBarcodes.size(); i++) {
    FASTASequence barcodedInsert;
    barcodedInsert.Resize(forwardBarcodes[i].length * 2 + insert.length);
    stringstream titleStrm;
    titleStrm << insert.title << "|ff|" << forwardBarcodes[i].title;
    barcodedInsert.CopyTitle(titleStrm.str());
    memcpy(&barcodedInsert.seq[0], &forwardBarcodes[i].seq[0], forwardBarcodes[i].length);
    memcpy(&barcodedInsert.seq[forwardBarcodes[i].length], insert.seq, insert.length);
    memcpy(&barcodedInsert.seq[forwardBarcodes[i].length + insert.length], forwardBarcodes[i].seq, forwardBarcodes[i].length);
    barcodedInsert.PrintSeq(barcodedOut);

    titleStrm.str("");
    titleStrm << insert.title << "|fr|" << forwardBarcodes[i].title;
    barcodedInsert.CopyTitle(titleStrm.str());
    memcpy(&barcodedInsert.seq[0], &forwardBarcodes[i].seq[0], forwardBarcodes[i].length);
    memcpy(&barcodedInsert.seq[forwardBarcodes[i].length], insert.seq, insert.length);
    memcpy(&barcodedInsert.seq[forwardBarcodes[i].length + insert.length], reverseBarcodes[i].seq, reverseBarcodes[i].length);
    barcodedInsert.PrintSeq(barcodedOut);


    titleStrm.str("");
    titleStrm << insert.title << "|rf|" << forwardBarcodes[i].title;
    barcodedInsert.CopyTitle(titleStrm.str());
    memcpy(&barcodedInsert.seq[0], &reverseBarcodes[i].seq[0], reverseBarcodes[i].length);
    memcpy(&barcodedInsert.seq[reverseBarcodes[i].length], insert.seq, insert.length);
    memcpy(&barcodedInsert.seq[reverseBarcodes[i].length + insert.length], forwardBarcodes[i].seq, forwardBarcodes[i].length);
    barcodedInsert.PrintSeq(barcodedOut);


    titleStrm.str("");
    titleStrm << insert.title << "|rr|" << forwardBarcodes[i].title;
    barcodedInsert.CopyTitle(titleStrm.str());
    memcpy(&barcodedInsert.seq[0], &reverseBarcodes[i].seq[0], reverseBarcodes[i].length);
    memcpy(&barcodedInsert.seq[reverseBarcodes[i].length], insert.seq, insert.length);
    memcpy(&barcodedInsert.seq[reverseBarcodes[i].length + insert.length], reverseBarcodes[i].seq, reverseBarcodes[i].length);
    barcodedInsert.PrintSeq(barcodedOut);
  }
}
Beispiel #17
0
int main(int argc, char* argv[]) {

	if (argc < 2) {
		PrintUsage();
		exit(1);
	}
	int argi = 1;
	string saFile = argv[argi++];
	vector<string> inFiles;
	
	int doBLT = 1;
	int bltPrefixLength = 8;
	int parsingOptions = 0;
	SAType saBuildType = larsson;
	int read4BitCompressed  = 0;
	int diffCoverSize = 0;
	while (argi < argc) {
		if (strlen(argv[argi]) > 0 and
				argv[argi][0] == '-'){ 
			parsingOptions = 1;
		}
		if (!parsingOptions) {
			inFiles.push_back(argv[argi]);
		}
		else {
			if (strcmp(argv[argi], "-blt") == 0) {
				doBLT = 1;
        if (argi < argc - 1) {
          bltPrefixLength = atoi(argv[++argi]);
          if (bltPrefixLength == 0) {
            cout << argv[argi] << " is not a valid lookup table length." << endl;
            exit(1);
          }
        }
        else {
          cout << "Please specify a lookup table length." << endl;
          exit(1);
        }
			}
			else if (strcmp(argv[argi], "-mamy") == 0) {
				saBuildType = manmy;
			}
			else if (strcmp(argv[argi], "-larsson") == 0) {
				saBuildType = larsson;
			}
			else if (strcmp(argv[argi], "-mcilroy") == 0) {
				saBuildType = mcilroy;
			}
			else if (strcmp(argv[argi], "-slow") == 0) {
				saBuildType = slow;
			}
			else if (strcmp(argv[argi], "-kark") == 0) {
				saBuildType = kark;
			}
			else if (strcmp(argv[argi], "-mafe") == 0) {
				saBuildType = mafe;
			}
			else if (strcmp(argv[argi], "-welter") == 0) {
				saBuildType = welter;
			}
			else if (strcmp(argv[argi], "-welterweight") == 0) {
        if (argi < argc-1) {
          diffCoverSize = atoi(argv[++argi]);
        }
        else {
          cout << "Please specify a difference cover size.  Valid values are 7,32,64,111, and 2281.  Larger values use less memory but may be slower." << endl;
          exit(1);
        }
        if ( ! (diffCoverSize == 7 or 
                diffCoverSize == 32 or
                diffCoverSize == 64 or 
                diffCoverSize == 111 or
                diffCoverSize == 2281) ) {
          cout << "The difference cover size must be one of 7,32,64,111, or 2281." << endl;
          cout << "Larger numbers use less space but are more slow." << endl;
          exit(1);
        }
			}
			else if (strcmp(argv[argi], "-4bit") == 0) {
				read4BitCompressed = 1;
			}
			else {
				PrintUsage();
				cout << "ERROR, bad option: " << argv[argi] << endl;
				exit(1);
			}
		}
		++argi;
	}
  
  if (inFiles.size() == 0) {
    //
    // Special use case: the input file is a fasta file.  Write to that file + .sa
    //
    inFiles.push_back(saFile);
    saFile = saFile + ".sa";
  }
  
	VectorIndex inFileIndex;
	FASTASequence seq;
	CompressedSequence<FASTASequence> compSeq;

	if (read4BitCompressed == 0) {
		for (inFileIndex = 0; inFileIndex < inFiles.size(); ++inFileIndex) {
			FASTAReader reader;
			reader.Init(inFiles[inFileIndex]);
			reader.SetSpacePadding(111);
			if (saBuildType == kark) {
				//
				// The Karkkainen sa building method requires a little extra
				// space at the end of the dna sequence so that counting may
				// be done mod 3 without adding extra logic for boundaries.
				//
			}
  
			if (inFileIndex == 0) {
				reader.ReadAllSequencesIntoOne(seq);
				reader.Close();
			}
			else {
				while(reader.ConcatenateNext(seq)) {
					cout << "added " << seq.title << endl;
				}
			}
		}
		seq.ToThreeBit();
		//seq.ToUpper();
	}
	else {
		assert(inFiles.size() == 1);
		cout << "reading compressed sequence." << endl;
		compSeq.Read(inFiles[0]);
		seq.seq = compSeq.seq;
		seq.length = compSeq.length;
		compSeq.RemoveCompressionCounts();
		cout << "done." << endl;
	}

  //
  // For now, do not allow creation of suffix arrays on sequences > 4G.
  //
  if (seq.length >= UINT_MAX) {
    cout << "ERROR, references greater than " << UINT_MAX << " bases are not supported." << endl;
    cout << "Consider breaking the reference into multiple files, running alignment. " << endl;
    cout << "against each file, and merging the result." << endl;
    exit(1);
  }
	vector<int> alphabet;
	
	SuffixArray<Nucleotide, vector<int> >  sa;
	//	sa.InitTwoBitDNAAlphabet(alphabet);
	//	sa.InitAsciiCharDNAAlphabet(alphabet);
  sa.InitThreeBitDNAAlphabet(alphabet);

	if (saBuildType == manmy) {
		sa.MMBuildSuffixArray(seq.seq, seq.length, alphabet);
	}
	else if (saBuildType == mcilroy) {
		sa.index = new SAIndex[seq.length+1];
		DNALength i;
		for (i = 0; i < seq.length; i++) { sa.index[i] = seq.seq[i] + 1;}
		sa.index[seq.length] = 0;
		ssort(sa.index, NULL);
		for (i = 1; i < seq.length+1; i++ ){ sa.index[i-1] = sa.index[i];};
		sa.length = seq.length;
	}
	else if (saBuildType == larsson) {
		sa.LarssonBuildSuffixArray(seq.seq, seq.length, alphabet);
	}
	else if (saBuildType == kark) {
		sa.index = new SAIndex[seq.length];
		seq.ToThreeBit();
		DNALength p;
		for (p = 0; p < seq.length; p++ ){ seq.seq[p]++; }
		KarkkainenBuildSuffixArray<Nucleotide>(seq.seq, sa.index, seq.length, 5);
		sa.length = seq.length;
	}
	else if (saBuildType == mafe) {
		//		sa.MaFeBuildSuffixArray(seq.seq, seq.length);
		
	}
	else if (saBuildType == welter) {
		if (diffCoverSize == 0) {
			sa.LightweightBuildSuffixArray(seq.seq, seq.length);
		}
		else {
			sa.LightweightBuildSuffixArray(seq.seq, seq.length, diffCoverSize);
		}
	}
	if (doBLT) {
		sa.BuildLookupTable(seq.seq, seq.length, bltPrefixLength);
	}
	sa.Write(saFile);

	return 0;

}