Ejemplo n.º 1
0
// Debug helper: prints, for every haplotype in turn, a multiple alignment that
// is rooted at that haplotype and contains all haplotypes and all reads
// overlapped against it. Output goes to stdout.
//
// dReads     - reads to overlap against each root; reads in the second half of
//              the vector (by index) are labelled "MATE read-<index>"
// haplotypes - candidate haplotype sequences; must not be empty
void DindelUtil::doMultipleReadHaplotypeAlignment(const std::vector<DindelRead> & dReads,
                                                  const StringVector & haplotypes)
{
    assert(haplotypes.size()>0);

    // Every haplotype takes a turn as the root of the alignment
    for (size_t h = 0; h < haplotypes.size(); ++h)
    {
        std::cout << "ALIGNING EVERYTHING AGAINST HAPLOTYPE " << h << "\n";
        MultipleAlignment ma;

        // Bind by reference: the root is only read, so no copy is needed
        const std::string& rootSequence = haplotypes[h];
        ma.addBaseSequence("root", rootSequence, "");

        // Add all haplotypes; the root itself is included and marked in upper case
        for(size_t j = 0; j < haplotypes.size(); j++)
        {
            std::stringstream ss;
            if (j!=h)
                ss << "haplotype-" << j;
            else
                ss << "HAPLOTYPE-" << j;
            SequenceOverlap o = Overlapper::computeOverlap(rootSequence, haplotypes[j]);
            ma.addOverlap(ss.str(), haplotypes[j], "", o);
        }

        // Add all reads; only reads in the first half of the vector carry their ID in the label
        for(size_t r = 0; r < dReads.size(); ++r)
        {
            std::stringstream ss;
            if (r<dReads.size()/2)
                ss << "read-" << r << "("  << dReads[r].getID() << ")";
            else
                ss << "MATE read-" << r;

            SequenceOverlap o = Overlapper::computeOverlap(rootSequence, dReads[r].getSequence());
            ma.addOverlap(ss.str(), dReads[r].getSequence(), "", o);
        }
        ma.print(100000);
    }
}
bool OverlapHaplotypeBuilder::buildInitialGraph(const StringVector& reads)
{
    PROFILE_FUNC("OverlapHaplotypeBuilder::buildInitialGraph")
    // Compute initial ordering of reads based on the position of the
    // starting kmer sequence. If the starting kmer was corrected out
    // of a read, it is discarded.
    StringVector ordered_reads;
    orderReadsInitial(m_initial_kmer_string, reads, &ordered_reads);

    if(ordered_reads.size() < m_parameters.minDiscoveryCount)
        return false;

#ifdef SHOW_MULTIPLE_ALIGNMENT
    //DEBUG print MA
    MultipleAlignment ma = buildMultipleAlignment(ordered_reads);
    ma.print(200);
#endif
    // Insert initial reads into graph
    for(size_t i = 0; i < ordered_reads.size(); ++i)
        insertVertexIntoGraph("seed-", ordered_reads[i]);
    return true;
}
Ejemplo n.º 3
0
int IntersectByMaster(CCdCore* ccd, double rowFraction) {

    int result = -1;
    unsigned int masterLen = (ccd) ? ccd->GetSequenceStringByRow(0).length() : 0;
    if (masterLen == 0) return result;

    int slaveStart;
    int nAlignedIBM = 0;
    unsigned int i, j, nBlocks;
    unsigned int nRows = ccd->GetNumRows();

    //  If there is already a consistent block model, do nothing.
    MultipleAlignment* ma = new MultipleAlignment(ccd);
    if (ma && ma->isBlockAligned()) {
        delete ma;
        return 0;
    }
    delete ma;


    BlockIntersector blockIntersector(masterLen);
    BlockModel* intersectedBlockModel;
    //BlockModel* simpleIntersectedBlockModel;
    BlockModelPair* bmp;
    vector<BlockModelPair*> blockModelPairs;
    set<int> forcedCTerminiInIntersection;

    list< CRef< CSeq_align > >& cdSeqAligns = ccd->GetSeqAligns();
    list< CRef< CSeq_align > >::iterator cdSeqAlignIt = cdSeqAligns.begin(), cdSeqAlignEnd = cdSeqAligns.end();

    for (i = 0; cdSeqAlignIt != cdSeqAlignEnd; ++cdSeqAlignIt, ++i) {
        bmp = new BlockModelPair(*cdSeqAlignIt);

        //  We assume # of blocks and all block lengths are same on master and slave.
        if (bmp && bmp->isValid()) {

            blockModelPairs.push_back(bmp);
            blockIntersector.addOneAlignment(bmp->getMaster());

            //  Find places the intersection can't merge blocks (i.e., where there are
            //  gaps in the slave across a block boundary, but not in the master).
            BlockModel& slave = bmp->getSlave();
            nBlocks = slave.getBlocks().size();
            for (j = 0; j < nBlocks - 1; ++j) {  //  '-1' as I don't care about end of the C-terminal block
                if (slave.getGapToCTerminal(j) > 0 && bmp->getMaster().getGapToCTerminal(j) == 0) {
                    forcedCTerminiInIntersection.insert(bmp->getMaster().getBlock(j).getEnd());
                }
            }
        }
    }

    //  There was a problem creating one of the BlockModelPair objects from a seq_align,
    //  or one or more seq_align was invalid.
    if (blockModelPairs.size() != cdSeqAligns.size()) {
        return result;
    }

    //simpleIntersectedBlockModel = blockIntersector.getIntersectedAlignment(forcedCTerminiInIntersection);
    intersectedBlockModel = blockIntersector.getIntersectedAlignment(forcedCTerminiInIntersection, rowFraction);
    nAlignedIBM = (intersectedBlockModel) ? intersectedBlockModel->getTotalBlockLength() : 0;
    if (nAlignedIBM == 0) {
        return result;
    }

/*
    string testStr, testStr2;
    string sint = intersectedBlockModel->toString();
    string sintsimple = simpleIntersectedBlockModel->toString();
    delete simpleIntersectedBlockModel;
    cout << "rowFraction = 1:\n" << sintsimple << endl;
    cout << "rowFraction = " << rowFraction << ":\n" << sint << endl;
*/

    //  As we have case where every block model isn't identical,
    //  change each seq-align to reflect the common set of aligned columns.
    nBlocks = intersectedBlockModel->getBlocks().size();
    for (i = 0, cdSeqAlignIt = cdSeqAligns.begin(); i < nRows - 1 ; ++i, ++cdSeqAlignIt) {

        bmp = blockModelPairs[i];  //BlockModelPair seqAlignPair(*cdSeqAlignIt);
        BlockModel* intersectedSeqAlignSlave = new BlockModel(bmp->getSlave().getSeqId(), false);

        bmp->reverse();
        for (j = 0; j < nBlocks; ++j) {
            const Block& jthMasterBlock = intersectedBlockModel->getBlock(j);
            slaveStart = bmp->mapToMaster(jthMasterBlock.getStart());

            //  since we're dealing w/ an intersection, slaveStart should always be valid
            assert(slaveStart != -1);

            Block b(slaveStart, jthMasterBlock.getLen(), jthMasterBlock.getId());
            intersectedSeqAlignSlave->addBlock(b);
        }
        *cdSeqAlignIt = intersectedSeqAlignSlave->toSeqAlign(*intersectedBlockModel);
        //testStr = intersectedSeqAlignSlave->toString();
        //testStr2 = bmp->getMaster().toString();  // original *slave* alignment

        delete bmp;
    }
    blockModelPairs.clear();
    result = nBlocks;

    delete intersectedBlockModel;

    return result;
}
Ejemplo n.º 4
0
//  Extracts the aligned columns of a CD as strings (one character per row, in row order).
//
//  cd            CD to read; if NULL, 'columns' is cleared and nothing else happens.
//  columns       output; always cleared first, and left empty to flag any problem.
//                Keyed by column number, or by the position on the reference row
//                when a valid 'referenceRow' is given.
//  referenceRow  row whose coordinates are used as map keys.  A value >= the number
//                of rows selects plain column numbering (using row 0's seq-align).
void GetAlignmentColumnsForCD(CCdCore* cd, map<unsigned int, string>& columns, unsigned int referenceRow)
{
    bool isOK = true, useRefRow = true;
    int j;
    unsigned int i, col, row, pos, mapIndex, nRows, nCols, nBlocks;
    char** alignedResidues = NULL;
    string colString;
    //  Map column number to position on the selected reference row.
    map<unsigned int, unsigned int> colToPos;
    vector<int> starts, lengths;
    CRef< CSeq_align > seqAlign;

    //  Empty the columns map first, as this is used as a way to flag problems.
    columns.clear();

    if (!cd) return;

    //  Check if the block structure is consistent.
    //  The alignment lives on the stack so it is released on every path,
    //  including when an exception is thrown while it is being examined.
    try {
        MultipleAlignment ma(cd);
        if (! ma.isBlockAligned()) {
            ERR_POST("CD " << cd->GetAccession() << " must have a consistent block structure for column extraction.");
            return;
        }
    } catch (...) {
        //  The block structure could not be verified, so do not go on to
        //  extract columns from an alignment in an unknown state.
        ERR_POST("Could not extract columns for CD " << cd->GetAccession());
        return;
    }

    nCols = cd->GetAlignmentLength();
    nRows = cd->GetNumRows();

    //  Get a reference seq-align for mapping between alignment rows.
    //  If the columns map index will simply be the column count, use the master, row 0.
    if (referenceRow >= nRows) {
        useRefRow = false;
        referenceRow = 0;
    }
    if (! cd->GetSeqAlign(referenceRow, seqAlign)) {
        isOK = false;
    }

    //  Initialize the column # -> reference row position mapping.
    //  If useRefRow is true, use the indicated row's coordinates as the position.
    //  Otherwise, use the column number as the position.
    if (isOK && GetBlockStarts(seqAlign, starts, (referenceRow == 0)) > 0 && GetBlockLengths(seqAlign, lengths) > 0) {
        nBlocks = starts.size();
        if (nBlocks == lengths.size()) {
            for (i = 0, col = 0; i < nBlocks; ++i) {
                pos = (useRefRow) ? starts[i] : col;
                for (j = 0; j < lengths[i]; ++j, ++col, ++pos) {
                    //  Not explicitly checking if 'pos' is aligned since above
                    //  we confirmed the CD has a valid block model.
                    colToPos[col] = pos;
                }
            }
        } else {
            isOK = false;
        }
    } else {
        isOK = false;
    }

    SetAlignedResiduesForCD(cd, alignedResidues, true);

    //  Construct the columns as string objects.
    if (isOK && alignedResidues) {
        for (col = 0; col < nCols; ++col) {
            colString.erase();
            for (row = 0; row < nRows; ++row) {
                colString += alignedResidues[row][col];
            }
            mapIndex = colToPos[col];
            columns[mapIndex] = colString;
        }
    }

    //  Clean up array of characters.
    if (alignedResidues) {
        for (row = 0; row < nRows; ++row) {
            delete [] alignedResidues[row];
        }
        delete [] alignedResidues;
    }

}
Ejemplo n.º 5
0
int main (int argc, char *argv[])
{
	int i;
	PlatformSupport* Plat = new PlatformSupport();
    ColumnComp* CC;
	Alignment* ALIGN;
	Tree* T;
	MultipleAlignment* MA;
	ProteinDomains* PROTS =NULL;
	MultiAlignRec* pssmAlignment;
	char outFileName[STR_LEN];
	strcpy(outFileName, "out");
    bool colChosen=false, alignChosen=false, treeChosen=false, maChosen=false, usingDomains=false, inputProvided=false, scoresProvided=false;
	bool neuralTree=false; bool testing=false;bool testingAcc=false; bool testingTree=false; bool famNames=false; bool treeClusts=false; bool printTreeClusts=false;
	bool ma_off=false;
	bool tree_loocv=false;//true;
	bool silent=false, htmlOutput=false; bool simMatching=false;
	bool weighting_on=false;
	int matchTopX = TOP_MATCH;

	char inputTFs[STR_LEN];
	char matchTFs[STR_LEN];
	char scoreDist[STR_LEN];
	char inputProteins[STR_LEN];

	//Misc option settings
	bool genRandMotifs=false;
	bool genRandScores=false;
	char randMatOut[STR_LEN];
	char scoresOut[STR_LEN];
	//Default alignment settings
	double gapOpen = DFLT_GAP_OPEN;
	double gapExtend = DFLT_GAP_EXTEND;
	bool overlapAlign = DFLT_OVLP_ALIGN;
	bool extendOverlap=false;
	bool FBP_on = false;
	bool preAlign=false;
    bool pairwiseOnly=false;
    bool forwardAlignOnly=false;
	bool ungapped=false;

	for(i=1; i<argc; i++){
		if(strcmp(argv[i], "-silent")==0)
			silent=true;
		if(strcmp(argv[i], "-html")==0)
			htmlOutput=true;
	}

	//Welcome message
	if(!silent && !htmlOutput){printf("\n\tSTAMP\n\tSimilarity, Tree-building, & Alignment of Motifs and Profiles\n\n\tShaun Mahony\n\tDepartment of Computational Biology\n\tUniversity of Pittsburgh\n\tVersion 1.0 (Winter 2006)\n\n");}

	if(argc ==1) //First and Foremost, the help option
	{	DisplayHelp();
	}else{

	for(i=1; i<argc; i++)
	{
		if(strcmp(argv[i], "-h")==0 || strcmp(argv[i], "?")==0) //First and Foremost, the help option
		{	DisplayHelp();
		}
		if(strcmp(argv[i], "-out")==0) //Output file (for trees & similarity matching)
		{	if(argv[i+1]!=NULL)
			{ strcpy(outFileName, argv[i+1]);}
		}
		if(strcmp(argv[i], "-genrand")==0) //Generate random motifs
		{	if(argv[i+1]!=NULL)
			{ strcpy(randMatOut, argv[i+1]);}
			genRandMotifs=true;
		}
		if(strcmp(argv[i], "-genscores")==0) //Generate simulation scores
		{	if(argv[i+1]!=NULL)
			{ strcpy(scoresOut, argv[i+1]);}
			genRandScores=true;
		}
		if((strcmp(argv[i], "-cc")) ==0)  //Choose a column comparison measure
		{
			if((strcmp(argv[i+1], "PCC"))==0 || (strcmp(argv[i+1], "pcc"))==0){
				CC = new PearsonCorrelation(); //Pearson's correllation coefficient
			}else if((strcmp(argv[i+1], "ALLR"))==0 || (strcmp(argv[i+1], "allr"))==0){
				CC = new ALLR(); //ALLR
			}else if((strcmp(argv[i+1], "ALLR_LL"))==0 || (strcmp(argv[i+1], "allr_ll"))==0){
				CC = new ALLR_LL(); //ALLR with lower limit
			}else if((strcmp(argv[i+1], "CS"))==0 || (strcmp(argv[i+1], "cs"))==0){
				CC = new ChiSq(); //Pearson's Chi Square
			}else if((strcmp(argv[i+1], "KL"))==0 || (strcmp(argv[i+1], "kl"))==0){
				CC = new KullbackLieber(); //Kullback-Lieber
			}else if((strcmp(argv[i+1], "SSD"))==0 || (strcmp(argv[i+1], "ssd"))==0){
				CC = new SumSqDiff(); //sum of squared difference
			}else{
				CC = new PearsonCorrelation(); //Default = PCC
			}
			colChosen=true;
		}
		//check for alignment settings
		if((strcmp(argv[i], "-go")) ==0){ //Gap Open
			if(argv[i+1]!=NULL)
			{	gapOpen=strtod(argv[i+1], NULL);}
		}
		if((strcmp(argv[i], "-ge")) ==0){ //Gap Extend
			if(argv[i+1]!=NULL)
			{	gapExtend=strtod(argv[i+1], NULL);}
		}
		if((strcmp(argv[i], "-overlapalign")) ==0){ //Only complete overlapping alignments
			overlapAlign = true; if(!silent && !htmlOutput){printf("Overlapping alignments only\n");}
		}if((strcmp(argv[i], "-nooverlapalign")) ==0){ //All overlapping alignments
			overlapAlign = false;
		}
		if((strcmp(argv[i], "-extendoverlap")) ==0){
			extendOverlap=true; if(!silent && !htmlOutput){printf("Extending the overlapping alignments\n");}
		}
        if((strcmp(argv[i], "-forwardonly")) ==0){ //Consider forward alignments only
            forwardAlignOnly = true;
            if(!silent && !htmlOutput){printf("Considering forward direction alignments only\n");}
        }
		if((strcmp(argv[i], "-printpairwise")) ==0){
			pairwiseOnly=true; if(!silent && !htmlOutput){printf("Printing pairwise scores only\n");}
		}
		if((strcmp(argv[i], "-FBP")) ==0){
			FBP_on=true; if(!silent && !htmlOutput){printf("Using FBP profiles\n");}
		}
		if((strcmp(argv[i], "-useweighting")) ==0){
			weighting_on=true; if(!silent && !htmlOutput){printf("Using weighting in FBP construction\n");}
		}
		if((strcmp(argv[i], "-prealigned")) ==0){
			preAlign=true; if(!silent && !htmlOutput){printf("Profiles are pre-aligned\n");}
		}

		//Input TF dataset name
		if((strcmp(argv[i], "-tf")) ==0)
		{	if(argv[i+1]!=NULL)
			{ strcpy(inputTFs, argv[i+1]);}
			inputProvided=true;
		}
		//Score distribution file   Make an auto function for this!!!!!!!
		if((strcmp(argv[i], "-sd")) ==0)
		{	if(argv[i+1]!=NULL)
			{ strcpy(scoreDist, argv[i+1]);}
			scoresProvided=true;
		}
		//Match input TFs against this dataset
		if((strcmp(argv[i], "-match")) ==0)
		{	if(argv[i+1]!=NULL)
			{ strcpy(matchTFs, argv[i+1]);}
			if(argv[i+2]!=NULL && strcmp(argv[i+2], "fams")==0){
				famNames=true;
			}
			simMatching=true;
		}
		if((strcmp(argv[i], "-match_top")) ==0){ //Report the top X matches
			if(argv[i+1]!=NULL)
			{	matchTopX=strtol(argv[i+1], NULL, 10);}
		}
		//Matching input protein (Pfam) alignment dataset name
		if((strcmp(argv[i], "-prot")) ==0)
		{	if(argv[i+1]!=NULL)
			{ strcpy(inputProteins, argv[i+1]);}
			usingDomains = true;
		}
		//Run some tests
		if((strcmp(argv[i], "-test")) ==0)
		{	testing=true;
		}
		//Run some different tests
		if((strcmp(argv[i], "-testacc")) ==0)
		{	testingAcc=true;
			famNames=true;
		}
		//Run some tests with trees
		if((strcmp(argv[i], "-testtree")) ==0)
		{	testingTree=true;
			famNames=true;
		}//Run Calinski & Harabasz with trees
		if((strcmp(argv[i], "-ch")) ==0)
		{	testingTree=true; treeClusts=true;
		}//Run Calinski & Harabasz with trees and print the resulting clusters
		if((strcmp(argv[i], "-chp")) ==0)
		{	testingTree=true;
			printTreeClusts=true; treeClusts=true;
		}
	}
	//Defaults
	if(!colChosen)
	{	CC = new PearsonCorrelation();}

	//Second Pass
	for(i=1; i<argc; i++)
	{
		if((strcmp(argv[i], "-align")) ==0)  //Choose an alignment method
		{
            if((strcmp(argv[i+1], "NW"))==0 || (strcmp(argv[i+1], "nw"))==0){
                ALIGN = new NeedlemanWunsch(CC, gapOpen, gapExtend, overlapAlign, extendOverlap, forwardAlignOnly);
            }
            if((strcmp(argv[i+1], "SWU"))==0 || (strcmp(argv[i+1], "swu"))==0){
                ALIGN = new SmithWatermanUngappedExtended(CC,forwardAlignOnly); ungapped=true;
            }
            if((strcmp(argv[i+1], "SWA"))==0 || (strcmp(argv[i+1], "swa"))==0){
                ALIGN = new SmithWatermanAffine(CC, gapOpen, gapExtend, overlapAlign, extendOverlap,forwardAlignOnly);
            }
            if((strcmp(argv[i+1], "SW"))==0 || (strcmp(argv[i+1], "sw"))==0){
                ALIGN = new SmithWaterman(CC, gapOpen, gapExtend, overlapAlign, extendOverlap,forwardAlignOnly);
            }
            alignChosen = true;
		}
		//Choose a multiple alignment method
		if((strcmp(argv[i], "-ma")) ==0)
		{
			if((strcmp(argv[i+1], "PPA"))==0 || (strcmp(argv[i+1], "ppa"))==0){
				MA = new ProgressiveProfileAlignment(outFileName, htmlOutput);
				maChosen=true;
			}
			if((strcmp(argv[i+1], "IR"))==0 || (strcmp(argv[i+1], "ir"))==0){
				MA = new IterativeRefinementAlignment(outFileName, htmlOutput);
				maChosen=true;
			}
			if((strcmp(argv[i+1], "NONE"))==0 || (strcmp(argv[i+1], "none"))==0){
				maChosen=true; ma_off=true;
			}
		}
	}
	if(!alignChosen)
	{	ALIGN = new SmithWatermanAffine(CC, gapOpen, gapExtend, overlapAlign, extendOverlap);
	}
	if(!maChosen)
		MA = new ProgressiveProfileAlignment(outFileName, htmlOutput);
	//Third pass
	//Choose a tree-construction method
	for(i=1; i<argc; i++)
	{	if((strcmp(argv[i], "-tree")) ==0)
		{
			if((strcmp(argv[i+1], "UPGMA"))==0 || (strcmp(argv[i+1], "upgma"))==0){
				T = new UPGMA(ALIGN);
			}
			if((strcmp(argv[i+1], "SOTA"))==0 || (strcmp(argv[i+1], "sota"))==0){
				T = new SOTA(ALIGN, MA); neuralTree=true;
			}
			if((strcmp(argv[i+1], "NJ"))==0 || (strcmp(argv[i+1], "nj"))==0){
				T = new Neighbourjoin(ALIGN); printf("Using Neighbour-joining... ensure that the distance metric is additive\n");
			}
			if((strcmp(argv[i+1], "TDHC"))==0 || (strcmp(argv[i+1], "tdhc"))==0){
				T = new TopDownHClust(ALIGN, MA); neuralTree=true;
			}
			treeChosen=true;
		}
	}
	if(!treeChosen)
		T = new UPGMA(ALIGN);
	T->BeQuiet(silent);

////////////////////////////////////////////////////////////////////////////////////
//////// Main Program /////////////////////////////////////////////////////////////

	//Initialise the background
	Plat->ReadBackground();
	if(inputProvided){
		//Read in the matrices
		Plat->ReadTransfacFile(inputTFs, famNames,true, weighting_on);
		if(!silent && !htmlOutput){
			printf("MatCount: %d\n", Plat->GetMatCount());
			if(ungapped)
				printf("Ungapped Alignment\n");
			else
				printf("Gap open = %.3lf, gap extend = %.3lf\n", gapOpen, gapExtend);
		}
	}else{
		printf("No input motifs provided!\n\n");
	}
	if(genRandMotifs){
		//Generate some random matrices
		RandPSSMGen* RPG = new RandPSSMGen(Plat->inputMotifs, Plat->GetMatCount(), 10000, randMatOut);
		RPG->RunGenerator();
	}
	if(genRandScores){
		//Find rand dist
		Plat->GetRandDistrib(scoresOut, ALIGN);
	}else if(!scoresProvided){
		printf("No score distribution file provided!\n\n");
	}
	if(testing){
		PlatformTesting* PT = new PlatformTesting(CC);
		//Print the distribution of column depth
	//	PT->ColumnDepthDist(Plat->inputMotifs, Plat->GetMatCount());
		//Print the similarities of all columns against all others
	//	PT->ColumnScoreDist(Plat->inputMotifs, Plat->GetMatCount(), 0.05);
		double z;
		for(z=0.25; z<0.8; z+=0.05)
			PT->RandColumns(Plat, z);
		for(z=0.8; z<=1.0; z+=0.01)
			PT->RandColumns(Plat, z);
		delete(PT);
	}

	if(scoresProvided || preAlign){

		Plat->ReadScoreDists(scoreDist);
		if(!silent && !htmlOutput){printf("Scores read\n");}
		if(Plat->GetMatCount()>1){
			if(preAlign){
				//No alignments or trees built here
				pssmAlignment = MA->PreAlignedInput(Plat);
			}else{
				//Multiple alignment procedure
				Plat->PreAlign(ALIGN);
				if(pairwiseOnly){
					if(!silent && !htmlOutput){printf("\nPairwise alignment scores:\n");}
					Plat->PrintPairwise();
				}if(!ma_off){
					MA->ImportBasics(Plat, ALIGN);
					if(!silent && !htmlOutput){printf("Alignments Finished\n");}
					if(!testingAcc){
						if(tree_loocv && testingTree){
							T->LOOCVBuildTree(Plat, testingTree);
						}else{
							if(testingTree && !silent && !htmlOutput){printf("Calinski & Harabasz:\n\tNumClust\tC&H_Metric\n");}
							T->BuildTree(Plat, testingTree);
							if(!silent && treeClusts){printf("The Calinski & Harabasz statistic suggests %.0lf clusters in the input motifs\n", T->GetNodesMinCH());}
							if(printTreeClusts){
								T->PrintLevel(outFileName, int(T->GetNodesMinCH()));
							}
						}
						T->PrintTree(outFileName);

						if(!silent && !htmlOutput){printf("Tree Built\n");}

						if(!silent){
							if(!silent && !htmlOutput){printf("Multiple Alignment:\n");}
							pssmAlignment = MA->BuildAlignment(Plat, ALIGN, T);
						}
					}
				}
			}

			//Experiment with the Protein Domains
			if(usingDomains){
				PROTS = new ProteinDomains();
				PROTS->ReadDomains(inputProteins, Plat->inputMotifs, Plat->GetMatCount());
				PROTS->MutualInformation(pssmAlignment, MA->Alignment2Profile(pssmAlignment, "AlignmentMotif"), Plat->inputMotifs, Plat->GetMatCount());
				delete PROTS;
			}
		}
		//Similarity match against the database
		if(simMatching){
			Plat->ReadTransfacFile(matchTFs, famNames, false, false);
			Plat->SimilarityMatching(ALIGN, outFileName, famNames, matchTopX);
		}
	}

	if(testingAcc && scoresProvided && inputProvided && Plat->GetMatCount()>1){
		PlatformTesting* PT = new PlatformTesting(CC);
		PT->PairwisePredictionAccuracy(Plat);
	}

	delete(MA);
	delete(T);
	delete(CC);
	delete(ALIGN);
	}
delete(Plat);
return(0);
}
Ejemplo n.º 6
0
// Estimates per-position error counts from randomly sampled reads and writes
// them to pWriter as an "ErrorsPerBase" object with two arrays, "base_count"
// and "error_count", indexed by position in the read (gap columns excluded).
//
// For each sampled read a multiple alignment is built, and a base is called an
// error when it disagrees with the most frequent symbol in its column, has
// fewer than 4 supporting symbols, and the most frequent symbol has at least 3.
//
// pWriter   - JSON writer receiving the result
// index_set - index set used both for sampling reads and for finding overlaps
void generate_errors_per_base(JSONWriter* pWriter, const BWTIndexSet& index_set)
{

    int n_samples = 100000;
    size_t k = 25;

    double max_error_rate = 0.95;
    size_t min_overlap = 50;
    
    // Shared accumulators; only touched inside the critical section below
    std::vector<size_t> position_count;
    std::vector<size_t> error_count;

    Timer timer("test", true);
#if HAVE_OPENMP
        omp_set_num_threads(opt::numThreads);
        #pragma omp parallel for
#endif
    for(int i = 0; i < n_samples; ++i)
    {
        std::string s = BWTAlgorithms::sampleRandomString(index_set.pBWT);
        // NOTE(review): the result of this call is discarded and the alignment below
        // is built from the same arguments - confirm whether the call is still needed.
        KmerOverlaps::retrieveMatches(s, k, min_overlap, max_error_rate, 2, index_set);
        //KmerOverlaps::approximateMatch(s, min_overlap, max_error_rate, 2, 200, index_set);

        MultipleAlignment ma = 
            KmerOverlaps::buildMultipleAlignment(s, k, min_overlap, max_error_rate, 2, index_set);

        // Skip when there is insufficient depth to classify errors
        size_t ma_rows = ma.getNumRows();
        if(ma_rows <= 1)
            continue;

        // Classify every base of this sample into a thread-private buffer first, so
        // the shared counters are locked once per sample instead of once per column.
        // sample_errors[p] is 1 if the p-th non-gap base of the read is an error.
        size_t ma_cols = ma.getNumColumns();
        std::vector<char> sample_errors;
        sample_errors.reserve(ma_cols);
        for(size_t j = 0; j < ma_cols; ++j)
        {
            char s_symbol = ma.getSymbol(0, j);

            // Skip gaps
            if(s_symbol == '-' || s_symbol == '\0')
                continue;
            
            SymbolCountVector scv = ma.getSymbolCountVector(j);
            int s_symbol_count = 0;
            char max_symbol = 0;
            int max_count = 0;

            for(size_t sym_idx = 0; sym_idx < scv.size(); ++sym_idx)
            {
                if(scv[sym_idx].symbol == s_symbol)
                    s_symbol_count = scv[sym_idx].count;
                if(scv[sym_idx].count > max_count)
                {
                    max_count = scv[sym_idx].count;
                    max_symbol = scv[sym_idx].symbol;
                }
            }

            // Call an error at this position if the consensus symbol differs from the read
            //    and the support for the read symbol is less than 4 and the consensus symbol
            //    is strongly supported.
            bool is_error = s_symbol != max_symbol && s_symbol_count < 4 && max_count >= 3;
            sample_errors.push_back(is_error ? 1 : 0);
        }

        // Merge this sample into the shared counters
#if HAVE_OPENMP
        #pragma omp critical
#endif
        {
            if(sample_errors.size() > position_count.size())
            {
                position_count.resize(sample_errors.size());
                error_count.resize(sample_errors.size());
            }

            for(size_t position = 0; position < sample_errors.size(); ++position)
            {
                position_count[position]++;
                error_count[position] += sample_errors[position];
            }
        }
    }
    
    pWriter->String("ErrorsPerBase");
    pWriter->StartObject();
    
    pWriter->String("base_count");
    pWriter->StartArray();
    for(size_t i = 0; i < position_count.size(); ++i)
        pWriter->Int(position_count[i]);
    pWriter->EndArray();
    
    // error_count always has the same length as position_count
    pWriter->String("error_count");
    pWriter->StartArray();
    for(size_t i = 0; i < position_count.size(); ++i)
        pWriter->Int(error_count[i]);
    pWriter->EndArray();

    pWriter->EndObject();
}