/*
 * Debug helper: prints the first numSubsRates entries of the substitution
 * rate array of one partition through printBothOpen(), as "index => rate"
 * pairs on a single line followed by a blank line.
 *
 * tr            tree whose partitionData[] array is inspected
 * model         index into tr->partitionData
 * numSubsRates  number of entries of substRates[] to print
 *
 * The partition is asserted to hold DNA data.
 */
static void printSubsRates(tree *tr, int model, int numSubsRates)
{
  int
    i;

  /* This must be a comparison: the former "=" overwrote dataType with
     DNA_DATA and turned the assertion into a test of the constant itself. */
  assert(tr->partitionData[model].dataType == DNA_DATA);

  printBothOpen("Subs rates: ");

  for(i = 0; i < numSubsRates; i++)
    printBothOpen("%d => %.3f, ", i, tr->partitionData[model].substRates[i]);

  printBothOpen("\n\n");
}
/*
 * Debug helper: reports the current likelihood of s->tr next to the supplied
 * start likelihood and their difference, then the node numbers stored in the
 * state (p, p->back, nb, nnb, q and r).
 *
 * s        state to report on
 * startLH  must equal s->tr->startLH (asserted)
 */
static void print_state(state *s, double startLH)
{
  double
    currentLH,
    increment;

  assert(startLH == s->tr->startLH);

  currentLH = s->tr->likelihood;
  increment = s->tr->likelihood - startLH;

  printBothOpen("tr LH %f, startLH %f, incr %f\n", currentLH, startLH, increment);

  printBothOpen("pruned %db%d nb %d, nnb %d, reinsert %db%d \n",
                s->p->number,
                s->p->back->number,
                s->nb->number,
                s->nnb->number,
                s->q->number,
                s->r->number);
}
/*
 * Debug helper: prints the node numbers and first branch length (z[0]) on
 * both sides of the branch p <-> p->back.
 *
 * For p, and then for q = p->back, one line is printed containing the node
 * itself and the two nodes reached through next->back and
 * next->next->back.
 *
 * The function returns nothing; it is declared void explicitly because the
 * implicit-int return type it relied on is not valid C since C99.
 */
static void showNNI_move(nodeptr p)
{
  nodeptr
    q = p->back;

  printBothOpen("NNI from p %d %.6f, pnb %d %.6f, pnnb %d %.6f\n",
                p->number,
                p->z[0],
                p->next->back->number,
                p->next->back->z[0],
                p->next->next->back->number,
                p->next->next->back->z[0]);

  printBothOpen("NNI from q %d %.6f, qnb %d %.6f, qnnb %d %.6f\n",
                q->number,
                q->z[0],
                q->next->back->number,
                q->next->back->z[0],
                q->next->next->back->number,
                q->next->next->back->z[0]);
}
/*
 * Debug helper: renders the current tree with an external viewer.
 *
 * The Newick string produced by pllTreeToNewickRecomREC() is written to the
 * file "tmp.nw" in the current directory, an optional title and (only when
 * printBranchLengths is set) the Newick string itself are printed through
 * printBothOpen(), and finally the command "bin/nw_display tmp.nw" is run.
 *
 * tr                  tree to print; tr->tree_string is overwritten
 * printBranchLengths  forwarded to the Newick writer and also controls
 *                     whether the string is echoed
 * title               optional heading, may be NULL
 *
 * A non-zero result of system() is reported instead of being dropped
 * silently, so a missing viewer binary does not go unnoticed.
 */
static void printRecomTree(tree *tr, boolean printBranchLengths, char *title)
{
  FILE
    *nwfile;

  int
    displayStatus;

  nwfile = myfopen("tmp.nw", "w+");
  pllTreeToNewickRecomREC(tr->tree_string, tr, tr->start->back, printBranchLengths);
  fprintf(nwfile, "%s\n", tr->tree_string);
  fclose(nwfile);

  if(title)
    printBothOpen("%s\n", title);

  if(printBranchLengths)
    printBothOpen("%s\n", tr->tree_string);

  printBothOpen("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n");

  displayStatus = system("bin/nw_display tmp.nw");

  if(displayStatus != 0)
    printBothOpen("WARNING: command \"bin/nw_display tmp.nw\" returned status %d\n", displayStatus);
}
/*
 * Restricts the calling thread to the single CPU whose index is tid.
 *
 * A CPU set containing only tid is built and handed to
 * pthread_setaffinity_np() for pthread_self(). If that call fails, an
 * explanatory message is printed and the program stops on assert(0).
 */
void pinToCore(int tid)
{
  cpu_set_t
    affinity;

  int
    rc;

  CPU_ZERO(&affinity);
  CPU_SET(tid, &affinity);

  rc = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &affinity);

  if(rc == 0)
    return;

  printBothOpen("\n\nThere was a problem finding a physical core for thread number %d to run on.\n", tid);
  printBothOpen("Probably this happend because you are trying to run more threads than you have cores available,\n");
  printBothOpen("which is a thing you should never ever do again, good bye .... \n\n");

  assert(0);
}
/*
 * Prints the traversal statistics collected in tr->travCounter.
 *
 * The counters numTraversals, tt, ti and ii are printed, followed by the sum
 * tt + ii + ti. Then every non-zero entry of the travlenFreq[] histogram
 * (indices 0 .. tr->mxtips - 1) is listed, and finally the weighted sum
 * of travlenFreq[len] * (len - 1) over all indices is printed.
 */
void printTraversalInfo(tree *tr)
{
  int
    len = 0,
    stepSum = 0;

  printBothOpen("Traversals : %d \n", tr->travCounter->numTraversals);
  printBothOpen("Traversals tt: %d \n", tr->travCounter->tt);
  printBothOpen("Traversals ti: %d \n", tr->travCounter->ti);
  printBothOpen("Traversals ii: %d \n", tr->travCounter->ii);
  printBothOpen("all: %d \n", tr->travCounter->tt + tr->travCounter->ii + tr->travCounter->ti);

  printBothOpen("Traversals len freq  : \n");

  while(len < tr->mxtips)
    {
      /* every index contributes to the sum, including those not printed */
      stepSum += tr->travCounter->travlenFreq[len] * (len - 1);

      if(tr->travCounter->travlenFreq[len] > 0)
        printBothOpen("len %d : %d\n", len, tr->travCounter->travlenFreq[len]);

      len++;
    }

  printBothOpen("all steps: %d \n", stepSum);
}
/*
 * NNI-optimizes a user supplied tree and writes it out twice: once plain and
 * once annotated with SH-like support values. The function terminates the
 * program with exit(0) when it is done.
 *
 * tr    tree instance to work on
 * adef  analysis settings; adef->restart must be set (asserted)
 * rdta  raw data, forwarded to initModel()
 * cdta  compressed data, forwarded to initModel()
 *
 * Steps:
 *   1. set up the resampling data (permutationSH), three per-site
 *      likelihood vectors and the branch info array
 *   2. initialize the model, obtain the starting tree and optimize (or read)
 *      the model parameters
 *   3. repeat encapsulateNNIs() until the likelihood changes by no more
 *      than 0.01 between two rounds
 *   4. run one further NNI round with the last argument set to TRUE and
 *      write both output files
 *
 * Output file names are built with length-checked snprintf() calls; a name
 * that does not fit into the buffer is reported and aborts the run instead
 * of overflowing the buffer.
 */
void shSupports(tree *tr, analdef *adef, rawdata *rdta, cruncheddata *cdta)
{
  double
    diff,
    *lhVectors[3];

  char
    bestTreeFileName[1024],
    shSupportFileName[1024];

  FILE
    *f;

  int
    nameLength,
    interchanges = 0,
    counter = 0;

  assert(adef->restart);

  tr->resample = permutationSH(tr, 1000, 12345);

  lhVectors[0] = (double *)rax_malloc(sizeof(double) * tr->cdta->endsite);
  lhVectors[1] = (double *)rax_malloc(sizeof(double) * tr->cdta->endsite);
  lhVectors[2] = (double *)rax_malloc(sizeof(double) * tr->cdta->endsite);

  tr->bInf = (branchInfo*)rax_malloc(sizeof(branchInfo) * (tr->mxtips - 3));

  initModel(tr, rdta, cdta, adef);

  getStartingTree(tr, adef);

  if(adef->useBinaryModelFile)
    {
      readBinaryModel(tr);
      evaluateGenericInitrav(tr, tr->start);
      treeEvaluate(tr, 2);
    }
  else
    modOpt(tr, adef, FALSE, 10.0);

  printBothOpen("Time after model optimization: %f\n", gettime() - masterTime);

  printBothOpen("Initial Likelihood %f\n\n", tr->likelihood);

  /* iterate NNI rounds until the likelihood is stable to within 0.01 */
  do
    {
      double
        lh1,
        lh2;

      lh1 = tr->likelihood;

      interchanges = encapsulateNNIs(tr, lhVectors, FALSE);

      evaluateGeneric(tr, tr->start);

      lh2 = tr->likelihood;

      diff = ABS(lh1 - lh2);

      printBothOpen("NNI interchanges %d Likelihood %f\n", interchanges, tr->likelihood);
    }
  while(diff > 0.01);

  printBothOpen("\nFinal Likelihood of NNI-optimized tree: %f\n\n", tr->likelihood);

  setupBranchInfo(tr->start->back, tr, &counter);
  assert(counter == tr->mxtips - 3);

  interchanges = encapsulateNNIs(tr, lhVectors, TRUE);

  /* plain NNI-optimized tree */
  nameLength = snprintf(bestTreeFileName, sizeof(bestTreeFileName), "%sRAxML_fastTree.%s", workdir, run_id);

  if(nameLength < 0 || (size_t)nameLength >= sizeof(bestTreeFileName))
    {
      printBothOpen("Error: the name of the RAxML_fastTree output file is too long\n");
      exit(-1);
    }

  Tree2String(tr->tree_string, tr, tr->start->back, FALSE, TRUE, FALSE, FALSE, FALSE, adef, SUMMARIZE_LH, FALSE, FALSE);

  f = myfopen(bestTreeFileName, "wb");
  fprintf(f, "%s", tr->tree_string);
  fclose(f);

  /* same tree, printed with branch lengths and SH-like supports */
  nameLength = snprintf(shSupportFileName, sizeof(shSupportFileName), "%sRAxML_fastTreeSH_Support.%s", workdir, run_id);

  if(nameLength < 0 || (size_t)nameLength >= sizeof(shSupportFileName))
    {
      printBothOpen("Error: the name of the RAxML_fastTreeSH_Support output file is too long\n");
      exit(-1);
    }

  Tree2String(tr->tree_string, tr, tr->start->back, TRUE, TRUE, FALSE, FALSE, FALSE, adef, SUMMARIZE_LH, FALSE, TRUE);

  f = myfopen(shSupportFileName, "wb");
  fprintf(f, "%s", tr->tree_string);
  fclose(f);

  printBothOpen("RAxML NNI-optimized tree written to file: %s\n", bestTreeFileName);

  printBothOpen("Same tree with SH-like supports written to file: %s\n", shSupportFileName);

  printBothOpen("Total execution time: %f\n", gettime() - masterTime);

  exit(0);
}
/*
 * Fast tree search: computes a single tree by alternating cycles of linear
 * SPR moves and NNI moves, then writes the tree and the binary model file.
 *
 * tr    tree instance to work on
 * adef  analysis settings (useBinaryModelFile and veryFast are read here)
 * rdta  raw data, forwarded to initModel()
 * cdta  compressed data, forwarded to initModel()
 *
 * The output file name is built with a length-checked snprintf() call; a
 * name that does not fit into the buffer is reported and aborts the run
 * instead of overflowing the buffer.
 */
void fastSearch(tree *tr, analdef *adef, rawdata *rdta, cruncheddata *cdta)
{
  double
    likelihood,
    startLikelihood,
    *lhVectors[3];

  char
    bestTreeFileName[1024];

  FILE
    *f;

  int
    nameLength,
    model;

  /* no per-site likelihood vectors are needed by the NNI routine here */
  lhVectors[0] = (double *)NULL;
  lhVectors[1] = (double *)NULL;
  lhVectors[2] = (double *)NULL;

  /* initialize model parameters with standard starting values */

  initModel(tr, rdta, cdta, adef);

  printBothOpen("Time after init : %f\n", gettime() - masterTime);

  /*
     compute starting tree, either by reading in a tree specified via -t
     or by building one
  */

  getStartingTree(tr, adef);

  printBothOpen("Time after init and starting tree: %f\n", gettime() - masterTime);

  /*
     rough model parameter optimization, the log likelihood epsilon should
     actually be determined based on the initial tree score and not be
     hard-coded
  */

  if(adef->useBinaryModelFile)
    {
      readBinaryModel(tr);
      evaluateGenericInitrav(tr, tr->start);
      treeEvaluate(tr, 2);
    }
  else
    modOpt(tr, adef, FALSE, 10.0);

  printBothOpen("Time after init, starting tree, mod opt: %f\n", gettime() - masterTime);

  /*
     print out the number of rate categories used for the CAT model, one
     should use fewer than the default, e.g., -c 16 works quite well
  */

  for(model = 0; model < tr->NumberOfModels; model++)
    printBothOpen("Partion %d number of Cats: %d\n", model, tr->partitionData[model].numberOfCategories);

  /*
     means that we are going to do thorough insertions with real
     Newton-Raphson based br-len opt at the three branches adjacent to
     every insertion point
  */

  Thorough = 1;

  /*
     loop over SPR cycles until the likelihood difference before and after
     the SPR cycle is <= 0.5 log likelihood units. Rather than being
     hard-coded this should also be determined based on the actual
     likelihood of the tree
  */

  do
    {
      startLikelihood = tr->likelihood;

      /* conduct a cycle of linear SPRs */

      likelihood = linearSPRs(tr, 20, adef->veryFast);

      evaluateGeneric(tr, tr->start);

      /* the NNIs also optimize br-lens of resulting topology a bit */

      encapsulateNNIs(tr, lhVectors, FALSE);

      printBothOpen("LH after SPRs %f, after NNI %f\n", likelihood, tr->likelihood);
    }
  while(ABS(tr->likelihood - startLikelihood) > 0.5);

  /*
     print out the resulting tree to the RAxML_fastTree. file.
     note that bootstrapping or doing multiple inferences won't work.
     This thing computes a single tree and that's it
  */

  nameLength = snprintf(bestTreeFileName, sizeof(bestTreeFileName), "%sRAxML_fastTree.%s", workdir, run_id);

  if(nameLength < 0 || (size_t)nameLength >= sizeof(bestTreeFileName))
    {
      printBothOpen("Error: the name of the RAxML_fastTree output file is too long\n");
      exit(-1);
    }

  Tree2String(tr->tree_string, tr, tr->start->back, FALSE, TRUE, FALSE, FALSE, FALSE, adef, SUMMARIZE_LH, FALSE, FALSE);

  f = myfopen(bestTreeFileName, "wb");
  fprintf(f, "%s", tr->tree_string);
  fclose(f);

  printBothOpen("RAxML fast tree written to file: %s\n", bestTreeFileName);

  writeBinaryModel(tr);

  printBothOpen("Total execution time: %f\n", gettime() - masterTime);

  printBothOpen("Good bye ... \n");
}
/*
 * Computes a site-specific placement bias statistic with a sliding window
 * and writes it to the file RAxML_SiteSpecificPlacementBias.<run_id>. The
 * function terminates the program with exit(0) when it is done.
 *
 * tr    comprehensive tree; every tip is pruned and re-connected in turn
 * adef  analysis settings; adef->slidingWindowSize is the window size
 *
 * For every tip: the tip is pruned, traverseBias() is run over the remaining
 * tree and fills pd[] (one entry per sliding window start position), the
 * node distance between the original position and pd[i].p is computed for
 * every window, and for every site the node distances of the windows
 * covering it are averaged and accumulated in distances[]. The tip is then
 * re-connected with its saved branch lengths. Finally the per-site average
 * over all tips is written to the output file, one "site value" pair per
 * line.
 *
 * Fixes relative to the previous version:
 *   - pd[] has (endsite - windowSize) entries, but the per-tip reset loop
 *     wrote endsite entries and ran past the end of the allocation
 *   - the buffers are allocated only after the window size was validated,
 *     so a too-large window no longer produces a negative allocation size
 *   - a window size below 1 is rejected (it led to a division by zero and
 *     to out-of-range indices)
 *   - allocation results are checked, buffers are released before exit and
 *     the file name is built with a length-checked snprintf()
 */
void computePlacementBias(tree *tr, analdef *adef)
{
  int
    windowSize = adef->slidingWindowSize,
    k,
    i,
    tips,
    numWindows,
    nameLength,
    /* number of branches into which we need to insert once we have removed a taxon */
    numTraversalBranches = (2 * (tr->mxtips - 1)) - 3;

  char
    fileName[1024];

  FILE
    *outFile;

  /* data for each sliding window starting position */
  positionData
    *pd;

  double
    *nodeDistances, /* node distances ND for every sliding window position */
    *distances;     /* accumulated avg distances for every site */

  nameLength = snprintf(fileName, sizeof(fileName), "%sRAxML_SiteSpecificPlacementBias.%s", workdir, run_id);

  if(nameLength < 0 || (size_t)nameLength >= sizeof(fileName))
    {
      printBothOpen("Error: the name of the RAxML_SiteSpecificPlacementBias output file is too long\n");
      exit(-1);
    }

  outFile = myfopen(fileName, "w");

  printBothOpen("Likelihood of comprehensive tree %f\n\n", tr->likelihood);

  if(windowSize > tr->cdta->endsite)
    {
      printBothOpen("The size of your sliding window is %d while the number of sites in the alignment is %d\n\n", windowSize, tr->cdta->endsite);
      exit(-1);
    }

  if(windowSize < 1)
    {
      printBothOpen("The size of your sliding window is %d, but it must be at least 1\n\n", windowSize);
      exit(-1);
    }

  if(windowSize >= (int)(0.9 * tr->cdta->endsite))
    printBothOpen("WARNING: your sliding window of size %d is only slightly smaller than you alignment that has %d sites\n\n", windowSize, tr->cdta->endsite);

  printBothOpen("Sliding window size: %d\n\n", windowSize);

  /* the window size is known to be valid here, so the sizes below are sane */
  numWindows = tr->cdta->endsite - windowSize;

  /* allocate at least one entry so that a NULL result always means failure */
  pd            = (positionData *)malloc(sizeof(positionData) * (size_t)((numWindows > 0) ? numWindows : 1));
  nodeDistances = (double*)calloc(tr->cdta->endsite, sizeof(double));
  distances     = (double*)calloc(tr->cdta->endsite, sizeof(double));

  if(pd == NULL || nodeDistances == NULL || distances == NULL)
    {
      printBothOpen("Error: out of memory in computePlacementBias\n");
      exit(-1);
    }

  /* prune and re-insert one tip at a time into all branches of the remaining tree */

  for(tips = 1; tips <= tr->mxtips; tips++)
    {
      nodeptr
        myStart,
        p = tr->nodep[tips]->back, /* this is the node at which we are pruning */
        p1 = p->next->back,
        p2 = p->next->next->back;

      double
        pz[NUM_BRANCHES],
        p1z[NUM_BRANCHES],
        p2z[NUM_BRANCHES];

      int
        branchCounter = 0;

      /* reset array values for this tip; pd has exactly numWindows entries */

      for(i = 0; i < numWindows; i++)
        {
          pd[i].lh = unlikely;
          pd[i].p = (nodeptr)NULL;
        }

      /* store the three branch lengths adjacent to the position at which we prune */

      for(i = 0; i < tr->numBranches; i++)
        {
          p1z[i] = p1->z[i];
          p2z[i] = p2->z[i];
          pz[i] = p->z[i];
        }

      /* prune the taxon, optimizing the branch between p1 and p2 */

      removeNodeBIG(tr, p, tr->numBranches);

      printBothOpen("Pruning taxon Number %d [%s]\n", tips, tr->nameList[tips]);

      /* find any tip to start traversing the tree */

      myStart = findAnyTip(p1, tr->mxtips);

      /* insert taxon, compute likelihood and remove taxon again from all branches */

      traverseBias(p, myStart->back, tr, &branchCounter, pd, windowSize);

      assert(branchCounter == numTraversalBranches);

      /* for every sliding window position calc ND to the true/correct position at p */

      for(i = 0; i < numWindows; i++)
        nodeDistances[i] = getNodeDistance(p1, pd[i].p, tr->mxtips);

      /* now analyze */

      for(i = 0; i < tr->cdta->endsite; i++)
        {
          double
            d = 0.0;

          int
            s = 0;

          /*
             check site position, i.e., do we have windowSize data points
             available or fewer because we are at the start or the end of
             the alignment
          */

          /*
             for each site just accumulate the node distances we have for
             all sliding windows that passed over this site
          */

          if(i < windowSize)
            {
              for(k = 0; k < i + 1; k++, s++)
                d += nodeDistances[k];
            }
          else
            {
              if(i < tr->cdta->endsite - windowSize)
                {
                  for(k = i - windowSize + 1; k <= i; k++, s++)
                    d += nodeDistances[k];
                }
              else
                {
                  for(k = i - windowSize; k < (tr->cdta->endsite - windowSize); k++, s++)
                    d += nodeDistances[k + 1];
                }
            }

          /*
             now just divide the accumulated ND distance by the number of
             distances we have for this position and then add it to the
             accumulated distances over all taxa.

             Note: it has to be "+=" here; an earlier version used for some
             tests erroneously assigned with "=" and thereby kept only the
             value of the last taxon.
          */

          distances[i] += (d / ((double)s));
        }

      /* re-connect taxon to its original position */

      hookup(p->next,       p1,      p1z, tr->numBranches);
      hookup(p->next->next, p2,      p2z, tr->numBranches);
      hookup(p,             p->back, pz,  tr->numBranches);

      /* fix likelihood vectors */

      newviewGeneric(tr, p);
    }

  /* now just compute the average ND over all taxa */

  for(i = 0; i < tr->cdta->endsite; i++)
    {
      double
        avg = distances[i] / ((double)tr->mxtips);

      fprintf(outFile, "%d %f\n", i, avg);
    }

  printBothOpen("\nTime for EPA-based site-specific placement bias calculation: %f\n", gettime() - masterTime);
  printBothOpen("Site-specific placement bias statistics written to file %s\n", fileName);

  fclose(outFile);

  free(distances);
  free(nodeDistances);
  free(pd);

  exit(0);
}
/*
 * Tree search driver: a "fast" phase (Thorough = 0) followed by a "thorough"
 * phase (Thorough = 1), both built around treeOptimizeRapid().
 *
 * tr             tree to optimize
 * adef           analysis settings; initialSet, initial, bestTrav, stepwidth,
 *                max_rearrange and useBinaryModelFile are used here
 * estimateModel  when set, model parameters are read (binary model file) or
 *                optimized with modOpt() at several points; otherwise only
 *                treeEvaluate() is called
 *
 * Two tree lists are kept: bestT (initialized for 1 tree) holds the best
 * tree so far, bt (initialized for 20 trees) is handed to
 * treeOptimizeRapid() and re-evaluated after every cycle. A tree from bt
 * replaces the one in bestT only if its likelihood exceeds the current best
 * and differs from the likelihood before the cycle by more than epsilon.
 *
 * If tr->searchConvergenceCriterion is set, the best trees of consecutive
 * cycles are compared through a hash table and either phase stops as soon
 * as convergenceCriterion() returns a value <= 0.01.
 *
 * With _TERRACES defined, all trees re-evaluated in the thorough phase are
 * additionally stored in a third list, and those whose likelihood is within
 * 0.000001 of the final tree are written to a per-search file.
 */
void computeBIGRAPID (tree *tr, analdef *adef, boolean estimateModel)
{
  unsigned int
    vLength = 0;

  int
    i,
    impr,
    bestTrav,
    rearrangementsMax = 0,
    rearrangementsMin = 0,
    thoroughIterations = 0,
    fastIterations = 0;

  double
    lh,
    previousLh,
    difference,
    epsilon;

  bestlist
    *bestT,
    *bt;

#ifdef _TERRACES
  /* store the 20 best trees found in a dedicated list */

  bestlist
    *terrace;

  /* output file names */

  char
    terraceFileName[1024],
    buf[64];
#endif

  hashtable *h = (hashtable*)NULL;
  unsigned int **bitVectors = (unsigned int**)NULL;

  /* bit vectors and hash table are only needed for the convergence test */
  if(tr->searchConvergenceCriterion)
    {
      bitVectors = initBitVector(tr, &vLength);
      h = initHashTable(tr->mxtips * 4);
    }

  bestT = (bestlist *) rax_malloc(sizeof(bestlist));
  bestT->ninit = 0;
  initBestTree(bestT, 1, tr->mxtips);

  bt = (bestlist *) rax_malloc(sizeof(bestlist));
  bt->ninit = 0;
  initBestTree(bt, 20, tr->mxtips);

#ifdef _TERRACES
  /* initialize the tree list and the output file name for the current tree search/replicate */

  terrace = (bestlist *) rax_malloc(sizeof(bestlist));
  terrace->ninit = 0;
  initBestTree(terrace, 20, tr->mxtips);

  /* file name is workdir + "RAxML_terrace." + run_id + ".BS." + bCount */
  sprintf(buf, "%d", bCount);

  strcpy(terraceFileName, workdir);
  strcat(terraceFileName, "RAxML_terrace.");
  strcat(terraceFileName, run_id);
  strcat(terraceFileName, ".BS.");
  strcat(terraceFileName, buf);

  printf("%s\n", terraceFileName);
#endif

  initInfoList(50);

  difference = 10.0;
  epsilon = 0.01;

  /* fast phase */
  Thorough = 0;

  if(estimateModel)
    {
      if(adef->useBinaryModelFile)
        {
          readBinaryModel(tr);
          evaluateGenericInitrav(tr, tr->start);
          treeEvaluate(tr, 2);
        }
      else
        {
          evaluateGenericInitrav(tr, tr->start);
          modOpt(tr, adef, FALSE, 10.0);
        }
    }
  else
    treeEvaluate(tr, 2);

  printLog(tr, adef, FALSE);

  saveBestTree(bestT, tr);

  /* rearrangement setting: determined automatically unless the user fixed it */
  if(!adef->initialSet)
    bestTrav = adef->bestTrav = determineRearrangementSetting(tr, adef, bestT, bt);
  else
    bestTrav = adef->bestTrav = adef->initial;

  if(estimateModel)
    {
      if(adef->useBinaryModelFile)
        treeEvaluate(tr, 2);
      else
        {
          evaluateGenericInitrav(tr, tr->start);
          modOpt(tr, adef, FALSE, 5.0);
        }
    }
  else
    treeEvaluate(tr, 1);

  saveBestTree(bestT, tr);
  impr = 1;
  if(tr->doCutoff)
    tr->itCount = 0;

  /* fast SPR cycles: repeat as long as the previous cycle improved the tree */
  while(impr)
    {
      recallBestTree(bestT, 1, tr);

      if(tr->searchConvergenceCriterion)
        {
          int bCounter = 0;

          /* the two hash table slots (index 0/1) are used alternately;
             clear the slot that is about to be re-used */
          if(fastIterations > 1)
            cleanupHashTable(h, (fastIterations % 2));

          bitVectorInitravSpecial(bitVectors, tr->nodep[1]->back, tr->mxtips, vLength, h, fastIterations % 2, BIPARTITIONS_RF, (branchInfo *)NULL,
                                  &bCounter, 1, FALSE, FALSE);

          assert(bCounter == tr->mxtips - 3);

          /* a comparison needs the trees of two cycles */
          if(fastIterations > 0)
            {
              double rrf = convergenceCriterion(h, tr->mxtips);

              if(rrf <= 0.01) /* 1% cutoff */
                {
                  printBothOpen("ML fast search converged at fast SPR cycle %d with stopping criterion\n", fastIterations);
                  printBothOpen("Relative Robinson-Foulds (RF) distance between respective best trees after one succseful SPR cycle: %f%s\n", rrf, "%");
                  /* both slots are cleared here because the jump below skips
                     the clean-up that follows the loop */
                  cleanupHashTable(h, 0);
                  cleanupHashTable(h, 1);
                  goto cleanup_fast;
                }
              else
                printBothOpen("ML search convergence criterion fast cycle %d->%d Relative Robinson-Foulds %f\n", fastIterations - 1, fastIterations, rrf);
            }
        }

      fastIterations++;

      treeEvaluate(tr, 1.0);
      saveBestTree(bestT, tr);
      printLog(tr, adef, FALSE);
      printResult(tr, adef, FALSE);
      lh = previousLh = tr->likelihood;

      treeOptimizeRapid(tr, 1, bestTrav, adef, bt);

      impr = 0;

      /* re-evaluate every tree collected in bt and keep the best one */
      for(i = 1; i <= bt->nvalid; i++)
        {
          recallBestTree(bt, i, tr);

          treeEvaluate(tr, 0.25);

          /* absolute likelihood difference to the tree before this cycle */
          difference = ((tr->likelihood > previousLh)?
                        tr->likelihood - previousLh:
                        previousLh - tr->likelihood);
          if(tr->likelihood > lh && difference > epsilon)
            {
              impr = 1;
              lh = tr->likelihood;
              saveBestTree(bestT, tr);
            }
        }
    }

  /* regular exit of the fast phase: empty both hash table slots so that the
     thorough phase starts with a clean table */
  if(tr->searchConvergenceCriterion)
    {
      cleanupHashTable(h, 0);
      cleanupHashTable(h, 1);
    }

 cleanup_fast:

  /* thorough phase */
  Thorough = 1;
  impr = 1;

  recallBestTree(bestT, 1, tr);

  if(estimateModel)
    {
      if(adef->useBinaryModelFile)
        treeEvaluate(tr, 2);
      else
        {
          evaluateGenericInitrav(tr, tr->start);
          modOpt(tr, adef, FALSE, 1.0);
        }
    }
  else
    treeEvaluate(tr, 1.0);

  /* thorough SPR cycles; the loop is left only through "goto cleanup" */
  while(1)
    {
      recallBestTree(bestT, 1, tr);

      if(impr)
        {
          /* last cycle improved the tree: restart with the smallest
             rearrangement range [1, stepwidth] */
          printResult(tr, adef, FALSE);
          rearrangementsMin = 1;
          rearrangementsMax = adef->stepwidth;

          if(tr->searchConvergenceCriterion)
            {
              int bCounter = 0;

              if(thoroughIterations > 1)
                cleanupHashTable(h, (thoroughIterations % 2));

              bitVectorInitravSpecial(bitVectors, tr->nodep[1]->back, tr->mxtips, vLength, h, thoroughIterations % 2, BIPARTITIONS_RF,
                                      (branchInfo *)NULL, &bCounter, 1, FALSE, FALSE);

              assert(bCounter == tr->mxtips - 3);

              if(thoroughIterations > 0)
                {
                  double rrf = convergenceCriterion(h, tr->mxtips);

                  if(rrf <= 0.01) /* 1% cutoff */
                    {
                      printBothOpen("ML search converged at thorough SPR cycle %d with stopping criterion\n", thoroughIterations);
                      printBothOpen("Relative Robinson-Foulds (RF) distance between respective best trees after one succseful SPR cycle: %f%s\n", rrf, "%");
                      goto cleanup;
                    }
                  else
                    printBothOpen("ML search convergence criterion thorough cycle %d->%d Relative Robinson-Foulds %f\n", thoroughIterations - 1, thoroughIterations, rrf);
                }
            }

          thoroughIterations++;
        }
      else
        {
          /* no improvement: shift the rearrangement range by stepwidth and
             stop once its upper end exceeds max_rearrange */
          rearrangementsMax += adef->stepwidth;
          rearrangementsMin += adef->stepwidth;
          if(rearrangementsMax > adef->max_rearrange)
            goto cleanup;
        }

      treeEvaluate(tr, 1.0);

      previousLh = lh = tr->likelihood;
      saveBestTree(bestT, tr);

      printLog(tr, adef, FALSE);
      treeOptimizeRapid(tr, rearrangementsMin, rearrangementsMax, adef, bt);

      impr = 0;

      /* re-evaluate every tree collected in bt and keep the best one */
      for(i = 1; i <= bt->nvalid; i++)
        {
          recallBestTree(bt, i, tr);

          treeEvaluate(tr, 0.25);

#ifdef _TERRACES
          /* save all 20 best trees in the terrace tree list */
          saveBestTree(terrace, tr);
#endif

          /* absolute likelihood difference to the tree before this cycle */
          difference = ((tr->likelihood > previousLh)?
                        tr->likelihood - previousLh:
                        previousLh - tr->likelihood);
          if(tr->likelihood > lh && difference > epsilon)
            {
              impr = 1;
              lh = tr->likelihood;
              saveBestTree(bestT, tr);
            }
        }
    }

 cleanup:

#ifdef _TERRACES
  {
    double
      bestLH = tr->likelihood;

    FILE
      *f = myfopen(terraceFileName, "w");

    /* print out likelihood of best tree found */

    printf("best tree: %f\n", tr->likelihood);

    /* print out likelihoods of 20 best trees found during the tree search */

    for(i = 1; i <= terrace->nvalid; i++)
      {
        recallBestTree(terrace, i, tr);

        /* if the likelihood scores differ by less than some epsilon 0.000001 print the tree to file */

        if(ABS(bestLH - tr->likelihood) < 0.000001)
          {
            printf("%d %f\n", i, tr->likelihood);
            /* NOTE(review): this call passes 14 arguments to Tree2String();
               confirm this matches the prototype used when _TERRACES is
               defined */
            Tree2String(tr->tree_string, tr, tr->start->back, FALSE, TRUE, FALSE, FALSE, FALSE, adef, NO_BRANCHES, FALSE, FALSE, FALSE, FALSE);

            fprintf(f, "%s\n", tr->tree_string);
          }
      }

    fclose(f);

    /* increment tree search counter */
    bCount++;
  }
#endif

  /* release everything that was allocated at the top of the function */
  if(tr->searchConvergenceCriterion)
    {
      freeBitVectors(bitVectors, 2 * tr->mxtips);
      rax_free(bitVectors);
      freeHashTable(h);
      rax_free(h);
    }

  freeBestTree(bestT);
  rax_free(bestT);
  freeBestTree(bt);
  rax_free(bt);

#ifdef _TERRACES
  /* free terrace tree list */

  freeBestTree(terrace);
  rax_free(terrace);
#endif

  freeInfoList();
  printLog(tr, adef, FALSE);
  printResult(tr, adef, FALSE);
}
/*
 * Computes the relative Robinson-Foulds (RF) distance between one large
 * reference tree and each tree of a set of smaller, potentially
 * multifurcating trees, and writes one "index distance" line per analyzed
 * tree to RAxML_RF-Distances.<run_id>.
 *
 * tr    tree structure that receives the reference tree read from tree_file
 * adef  analysis settings; adef->mode must be PLAUSIBILITY_CHECKER (asserted)
 *
 * For every small tree with at least one split, the bipartitions of the
 * reference tree are restricted to the taxa of the small tree (bit mask),
 * re-hashed into a temporary table, and compared against the bipartitions
 * of the small tree. Small trees for which readMultifurcatingTree() returns
 * no splits are skipped and only counted.
 *
 * Fixes relative to the previous version:
 *   - the average RF distance is no longer computed as x / 0 when not a
 *     single small tree was analyzed; 0.0 is reported in that case
 *   - the output file name is built with a length-checked snprintf()
 */
void plausibilityChecker(tree *tr, analdef *adef)
{
  FILE
    *treeFile,
    *rfFile;

  tree
    *smallTree = (tree *)rax_malloc(sizeof(tree));

  char
    rfFileName[1024];

  /* init hash table for big reference tree */

  hashtable
    *h = initHashTable(tr->mxtips * 2 * 2);

  /* init the bit vectors we need for computing and storing bipartitions during the tree traversal */

  unsigned int
    vLength,
    **bitVectors = initBitVector(tr, &vLength);

  int
    nameLength,
    numberOfTreesAnalyzed = 0,
    branchCounter = 0,
    i;

  double
    avgRF = 0.0;

  /* set up an output file name */

  nameLength = snprintf(rfFileName, sizeof(rfFileName), "%sRAxML_RF-Distances.%s", workdir, run_id);

  if(nameLength < 0 || (size_t)nameLength >= sizeof(rfFileName))
    {
      printBothOpen("Error: the name of the RAxML_RF-Distances output file is too long\n");
      exit(-1);
    }

  rfFile = myfopen(rfFileName, "wb");

  assert(adef->mode == PLAUSIBILITY_CHECKER);

  /* open the big reference tree file and parse it */

  treeFile = myfopen(tree_file, "r");

  printBothOpen("Parsing reference tree %s\n", tree_file);

  treeReadLen(treeFile, tr, FALSE, TRUE, TRUE, adef, TRUE, FALSE);

  assert(tr->mxtips == tr->ntips);

  printBothOpen("The reference tree has %d tips\n", tr->ntips);

  fclose(treeFile);

  /* extract all induced bipartitions from the big tree and store them in the hashtable */

  bitVectorInitravSpecial(bitVectors, tr->nodep[1]->back, tr->mxtips, vLength, h, 0, BIPARTITIONS_RF, (branchInfo *)NULL, &branchCounter, 1, FALSE, FALSE);

  assert(branchCounter == tr->mxtips - 3);

  /* now see how many small trees we have */

  treeFile = getNumberOfTrees(tr, bootStrapFile, adef);

  checkTreeNumber(tr->numberOfTrees, bootStrapFile);

  /* allocate a data structure for parsing the potentially multi-furcating tree */

  allocateMultifurcations(tr, smallTree);

  /* loop over all small trees */

  for(i = 0; i < tr->numberOfTrees; i++)
    {
      int
        numberOfSplits = readMultifurcatingTree(treeFile, smallTree, adef, TRUE);

      if(numberOfSplits > 0)
        {
          unsigned int
            entryCount = 0,
            k,
            j,
            *masked = (unsigned int *)rax_calloc(vLength, sizeof(unsigned int)),
            *smallTreeMask = (unsigned int *)rax_calloc(vLength, sizeof(unsigned int));

          hashtable
            *rehash = initHashTable(tr->mxtips * 2 * 2);

          double
            rf,
            maxRF;

          int
            bCounter = 0,
            bips,
            firstTaxon,
            taxa = 0;

          if(numberOfTreesAnalyzed % 100 == 0)
            printBothOpen("Small tree %d has %d tips and %d bipartitions\n", i, smallTree->ntips, numberOfSplits);

          /* compute the maximum RF distance for computing the relative RF distance later-on */

          /*
             note that here we need to pay attention, since the RF distance
             is not normalized by 2 * (n-3) but we need to account for the
             fact that the multifurcating small tree will potentially
             contain fewer bipartitions. Hence the normalization factor is
             obtained as 2 * numberOfSplits, where numberOfSplits is the
             number of bipartitions in the small tree.
          */

          maxRF = (double)(2 * numberOfSplits);

          /*
             now set up a bit mask where only the bits are set to one for
             those taxa that are actually present in the small tree we just
             read
          */

          /* note that I had to apply some small changes to this function to make it work for multi-furcating trees ! */

          setupMask(smallTreeMask, smallTree->start,       smallTree->mxtips);
          setupMask(smallTreeMask, smallTree->start->back, smallTree->mxtips);

          /*
             now get the index of the first taxon of the small tree.
             we will use this to unambiguously store the bipartitions
          */

          firstTaxon = smallTree->start->number;

          /*
             make sure that this bit vector is set up correctly, i.e., that
             it contains as many non-zero bits as there are taxa in this
             small tree
          */

          for(j = 0; j < vLength; j++)
            taxa += BIT_COUNT(smallTreeMask[j]);
          assert(taxa == smallTree->ntips);

          /* now re-hash the big tree by applying the above bit mask */

          /* loop over hash table */

          for(k = 0, entryCount = 0; k < h->tableSize; k++)
            {
              if(h->table[k] != NULL)
                {
                  entry *e = h->table[k];

                  /* we resolve collisions by chaining, hence the loop here */

                  do
                    {
                      unsigned int
                        *bitVector = e->bitVector;

                      hashNumberType
                        position;

                      int
                        count = 0;

                      /* double check that our tree mask contains the first taxon of the small tree */

                      assert(smallTreeMask[(firstTaxon - 1) / MASK_LENGTH] & mask32[(firstTaxon - 1) % MASK_LENGTH]);

                      /*
                         if the first taxon is set then we will re-hash the
                         bit-wise complement of the bit vector. The count
                         variable is used for a small optimization
                      */

                      if(bitVector[(firstTaxon - 1) / MASK_LENGTH] & mask32[(firstTaxon - 1) % MASK_LENGTH])
                        {
                          /* hash complement */

                          for(j = 0; j < vLength; j++)
                            {
                              masked[j] = (~bitVector[j]) & smallTreeMask[j];
                              count += BIT_COUNT(masked[j]);
                            }
                        }
                      else
                        {
                          /* hash this vector */

                          for(j = 0; j < vLength; j++)
                            {
                              masked[j] = bitVector[j] & smallTreeMask[j];
                              count += BIT_COUNT(masked[j]);
                            }
                        }

                      /*
                         note that padding the last bits is not required
                         because they are set to 0 automatically by
                         smallTreeMask
                      */

                      /*
                         make sure that we will re-hash the canonic
                         representation of the bipartition where the bit for
                         firstTaxon is set to 0!
                      */

                      assert(!(masked[(firstTaxon - 1) / MASK_LENGTH] & mask32[(firstTaxon - 1) % MASK_LENGTH]));

                      /*
                         only if the masked bipartition of the large tree is
                         a non-trivial bipartition (two or more bits set to
                         1) will we re-hash it
                      */

                      if(count > 1)
                        {
                          /* compute hash */
                          position = oat_hash((unsigned char *)masked, sizeof(unsigned int) * vLength);
                          position = position % rehash->tableSize;

                          /*
                             re-hash to the new hash table that contains the
                             bips of the large tree, pruned down to the taxa
                             contained in the small tree
                          */
                          insertHashPlausibility(masked, rehash, vLength, position);
                        }

                      entryCount++;

                      e = e->next;
                    }
                  while(e != NULL);
                }
            }

          /* make sure that we tried to re-hash all bipartitions of the original tree */

          assert(entryCount == (unsigned int)(tr->mxtips - 3));

          /*
             now traverse the small tree and count how many bipartitions it
             shares with the corresponding induced tree from the large tree
          */

          /* the following function also had to be modified to account for multi-furcating trees ! */

          bips = bitVectorTraversePlausibility(bitVectors, smallTree->start->back, smallTree->mxtips, vLength, rehash, &bCounter, firstTaxon, smallTree, TRUE);

          /* compute the relative RF */

          rf = (double)(2 * (numberOfSplits - bips)) / maxRF;

          assert(numberOfSplits >= bips);

          assert(rf <= 1.0);

          avgRF += rf;

          if(numberOfTreesAnalyzed % 100 == 0)
            printBothOpen("Relative RF tree %d: %f\n\n", i, rf);

          fprintf(rfFile, "%d %f\n", i, rf);

          /*
             I also modified this assertion, we need to make sure here that
             we checked all non-trivial splits/bipartitions in the
             multi-furcating tree which can be less than n - 3 !
          */

          assert(bCounter == numberOfSplits);

          /* free masks and hash table for this iteration */

          rax_free(smallTreeMask);
          rax_free(masked);
          freeHashTable(rehash);
          rax_free(rehash);
          numberOfTreesAnalyzed++;
        }
    }

  printBothOpen("Number of small trees skipped: %d\n\n", tr->numberOfTrees - numberOfTreesAnalyzed);

  /* every small tree may have been skipped; do not divide by zero then */
  printBothOpen("Average RF distance %f\n\n", (numberOfTreesAnalyzed > 0) ? (avgRF / (double)numberOfTreesAnalyzed) : 0.0);

  printBothOpen("Total execution time: %f secs\n\n", gettime() - masterTime);

  printBothOpen("\nFile containing all %d pair-wise RF distances written to file %s\n\n", numberOfTreesAnalyzed, rfFileName);

  fclose(treeFile);
  fclose(rfFile);

  /* free the data structure used for parsing the potentially multi-furcating tree */

  freeMultifurcations(smallTree);
  rax_free(smallTree);

  freeBitVectors(bitVectors, 2 * tr->mxtips);
  rax_free(bitVectors);

  freeHashTable(h);
  rax_free(h);
}
void doAllInOne(tree *tr, analdef *adef) { int i, n, bestIndex, bootstrapsPerformed; #ifdef _WAYNE_MPI int bootStopTests = 1, j, bootStrapsPerProcess = 0; #endif double loopTime; int *originalRateCategories; int *originalInvariant; #ifdef _WAYNE_MPI int slowSearches, fastEvery; #else int slowSearches, fastEvery = 5; #endif int treeVectorLength = -1; topolRELL_LIST *rl; double bestLH, mlTime, overallTime; long radiusSeed = adef->rapidBoot; FILE *f; char bestTreeFileName[1024]; hashtable *h = (hashtable*)NULL; unsigned int **bitVectors = (unsigned int**)NULL; boolean bootStopIt = FALSE; double pearsonAverage = 0.0; pInfo *catParams = allocParams(tr); pInfo *gammaParams = allocParams(tr); unsigned int vLength; n = adef->multipleRuns; #ifdef _WAYNE_MPI if(n % processes != 0) n = processes * ((n / processes) + 1); #endif if(adef->bootStopping) { h = initHashTable(tr->mxtips * 100); treeVectorLength = adef->multipleRuns; bitVectors = initBitVector(tr, &vLength); } rl = (topolRELL_LIST *)rax_malloc(sizeof(topolRELL_LIST)); initTL(rl, tr, n); originalRateCategories = (int*)rax_malloc(tr->cdta->endsite * sizeof(int)); originalInvariant = (int*)rax_malloc(tr->cdta->endsite * sizeof(int)); initModel(tr, tr->rdta, tr->cdta, adef); if(adef->grouping) printBothOpen("\n\nThe topologies of all Bootstrap and ML trees will adhere to the constraint tree specified in %s\n", tree_file); if(adef->constraint) printBothOpen("\n\nThe topologies of all Bootstrap and ML trees will adhere to the bifurcating backbone constraint tree specified in %s\n", tree_file); #ifdef _WAYNE_MPI long parsimonySeed0 = adef->parsimonySeed; long replicateSeed0 = adef->rapidBoot; n = n / processes; #endif for(i = 0; i < n && !bootStopIt; i++) { #ifdef _WAYNE_MPI j = i + n * processID; tr->treeID = j; #else tr->treeID = i; #endif tr->checkPointCounter = 0; loopTime = gettime(); #ifdef _WAYNE_MPI if(i == 0) { if(parsimonySeed0 != 0) adef->parsimonySeed = parsimonySeed0 + 10000 * processID; adef->rapidBoot = 
replicateSeed0 + 10000 * processID; radiusSeed = adef->rapidBoot; } #endif if(i % 10 == 0) { if(i > 0) reductionCleanup(tr, originalRateCategories, originalInvariant); if(adef->grouping || adef->constraint) { FILE *f = myfopen(tree_file, "rb"); assert(adef->restart); if (! treeReadLenMULT(f, tr, adef)) exit(-1); fclose(f); } else makeParsimonyTree(tr, adef); tr->likelihood = unlikely; if(i == 0) { double t; onlyInitrav(tr, tr->start); treeEvaluate(tr, 1); t = gettime(); modOpt(tr, adef, FALSE, 5.0); #ifdef _WAYNE_MPI printBothOpen("\nTime for BS model parameter optimization on Process %d: %f seconds\n", processID, gettime() - t); #else printBothOpen("\nTime for BS model parameter optimization %f\n", gettime() - t); #endif memcpy(originalRateCategories, tr->cdta->rateCategory, sizeof(int) * tr->cdta->endsite); memcpy(originalInvariant, tr->invariant, sizeof(int) * tr->cdta->endsite); if(adef->bootstrapBranchLengths) { if(tr->rateHetModel == CAT) { copyParams(tr->NumberOfModels, catParams, tr->partitionData, tr); assert(tr->cdta->endsite == tr->originalCrunchedLength); catToGamma(tr, adef); modOpt(tr, adef, TRUE, adef->likelihoodEpsilon); copyParams(tr->NumberOfModels, gammaParams, tr->partitionData, tr); gammaToCat(tr); copyParams(tr->NumberOfModels, tr->partitionData, catParams, tr); } else { assert(tr->cdta->endsite == tr->originalCrunchedLength); } } } } computeNextReplicate(tr, &adef->rapidBoot, originalRateCategories, originalInvariant, TRUE, TRUE); resetBranches(tr); evaluateGenericInitrav(tr, tr->start); treeEvaluate(tr, 1); computeBOOTRAPID(tr, adef, &radiusSeed); #ifdef _WAYNE_MPI saveTL(rl, tr, j); #else saveTL(rl, tr, i); #endif if(adef->bootstrapBranchLengths) { double lh = tr->likelihood; if(tr->rateHetModel == CAT) { copyParams(tr->NumberOfModels, tr->partitionData, gammaParams, tr); catToGamma(tr, adef); resetBranches(tr); onlyInitrav(tr, tr->start); treeEvaluate(tr, 2.0); gammaToCat(tr); copyParams(tr->NumberOfModels, tr->partitionData, catParams, 
tr); tr->likelihood = lh; } else { treeEvaluate(tr, 2.0); tr->likelihood = lh; } } printBootstrapResult(tr, adef, TRUE); loopTime = gettime() - loopTime; writeInfoFile(adef, tr, loopTime); if(adef->bootStopping) #ifdef _WAYNE_MPI { int nn = (i + 1) * processes; if((nn > START_BSTOP_TEST) && (i * processes < FC_SPACING * bootStopTests) && ((i + 1) * processes >= FC_SPACING * bootStopTests) ) { MPI_Barrier(MPI_COMM_WORLD); concatenateBSFiles(processes, bootstrapFileName); MPI_Barrier(MPI_COMM_WORLD); bootStopIt = computeBootStopMPI(tr, bootstrapFileName, adef, &pearsonAverage); bootStopTests++; } } #else bootStopIt = bootStop(tr, h, i, &pearsonAverage, bitVectors, treeVectorLength, vLength, adef); #endif } #ifdef _WAYNE_MPI MPI_Barrier(MPI_COMM_WORLD); bootstrapsPerformed = i * processes; bootStrapsPerProcess = i; concatenateBSFiles(processes, bootstrapFileName); removeBSFiles(processes, bootstrapFileName); MPI_Barrier(MPI_COMM_WORLD); #else bootstrapsPerformed = i; #endif rax_freeParams(tr->NumberOfModels, catParams); rax_free(catParams); rax_freeParams(tr->NumberOfModels, gammaParams); rax_free(gammaParams); if(adef->bootStopping) { freeBitVectors(bitVectors, 2 * tr->mxtips); rax_free(bitVectors); freeHashTable(h); rax_free(h); } { double t; printBothOpenMPI("\n\n"); if(adef->bootStopping) { if(bootStopIt) { switch(tr->bootStopCriterion) { case FREQUENCY_STOP: printBothOpenMPI("Stopped Rapid BS search after %d replicates with FC Bootstopping criterion\n", bootstrapsPerformed); printBothOpenMPI("Pearson Average of %d random splits: %f\n",BOOTSTOP_PERMUTATIONS , pearsonAverage); break; case MR_STOP: printBothOpenMPI("Stopped Rapid BS search after %d replicates with MR-based Bootstopping criterion\n", bootstrapsPerformed); printBothOpenMPI("WRF Average of %d random splits: %f\n", BOOTSTOP_PERMUTATIONS, pearsonAverage); break; case MRE_STOP: printBothOpenMPI("Stopped Rapid BS search after %d replicates with MRE-based Bootstopping criterion\n", bootstrapsPerformed); 
printBothOpenMPI("WRF Average of %d random splits: %f\n", BOOTSTOP_PERMUTATIONS, pearsonAverage); break; case MRE_IGN_STOP: printBothOpenMPI("Stopped Rapid BS search after %d replicates with MRE_IGN-based Bootstopping criterion\n", bootstrapsPerformed); printBothOpenMPI("WRF Average of %d random splits: %f\n", BOOTSTOP_PERMUTATIONS, pearsonAverage); break; default: assert(0); } } else { switch(tr->bootStopCriterion) { case FREQUENCY_STOP: printBothOpenMPI("Rapid BS search did not converge after %d replicates with FC Bootstopping criterion\n", bootstrapsPerformed); printBothOpenMPI("Pearson Average of %d random splits: %f\n",BOOTSTOP_PERMUTATIONS , pearsonAverage); break; case MR_STOP: printBothOpenMPI("Rapid BS search did not converge after %d replicates with MR-based Bootstopping criterion\n", bootstrapsPerformed); printBothOpenMPI("WRF Average of %d random splits: %f\n", BOOTSTOP_PERMUTATIONS, pearsonAverage); break; case MRE_STOP: printBothOpenMPI("Rapid BS search did not converge after %d replicates with MRE-based Bootstopping criterion\n", bootstrapsPerformed); printBothOpenMPI("WRF Average of %d random splits: %f\n", BOOTSTOP_PERMUTATIONS, pearsonAverage); break; case MRE_IGN_STOP: printBothOpenMPI("Rapid BS search did not converge after %d replicates with MR_IGN-based Bootstopping criterion\n", bootstrapsPerformed); printBothOpenMPI("WRF Average of %d random splits: %f\n", BOOTSTOP_PERMUTATIONS, pearsonAverage); break; default: assert(0); } } } t = gettime() - masterTime; printBothOpenMPI("Overall Time for %d Rapid Bootstraps %f seconds\n", bootstrapsPerformed, t); printBothOpenMPI("Average Time per Rapid Bootstrap %f seconds\n", (double)(t/((double)bootstrapsPerformed))); if(!adef->allInOne) { printBothOpenMPI("All %d bootstrapped trees written to: %s\n", bootstrapsPerformed, bootstrapFileName); #ifdef _WAYNE_MPI MPI_Finalize(); #endif exit(0); } } /* ML-search */ mlTime = gettime(); double t = mlTime; printBothOpenMPI("\nStarting ML Search ...\n\n"); 
/***CLEAN UP reduction stuff */ reductionCleanup(tr, originalRateCategories, originalInvariant); /****/ #ifdef _WAYNE_MPI restoreTL(rl, tr, n * processID); #else restoreTL(rl, tr, 0); #endif resetBranches(tr); evaluateGenericInitrav(tr, tr->start); modOpt(tr, adef, TRUE, adef->likelihoodEpsilon); #ifdef _WAYNE_MPI if(bootstrapsPerformed <= 100) fastEvery = 5; else fastEvery = bootstrapsPerformed / 20; for(i = 0; i < bootstrapsPerformed; i++) rl->t[i]->likelihood = unlikely; for(i = 0; i < bootStrapsPerProcess; i++) { j = i + n * processID; if(i % fastEvery == 0) { restoreTL(rl, tr, j); resetBranches(tr); evaluateGenericInitrav(tr, tr->start); treeEvaluate(tr, 1); optimizeRAPID(tr, adef); saveTL(rl, tr, j); } } #else for(i = 0; i < bootstrapsPerformed; i++) { rl->t[i]->likelihood = unlikely; if(i % fastEvery == 0) { restoreTL(rl, tr, i); resetBranches(tr); evaluateGenericInitrav(tr, tr->start); treeEvaluate(tr, 1); optimizeRAPID(tr, adef); saveTL(rl, tr, i); } } #endif printBothOpenMPI("Fast ML optimization finished\n\n"); t = gettime() - t; #ifdef _WAYNE_MPI printBothOpen("Fast ML search on Process %d: Time %f seconds\n\n", processID, t); j = n * processID; qsort(&(rl->t[j]), n, sizeof(topolRELL*), compareTopolRell); restoreTL(rl, tr, j); #else printBothOpen("Fast ML search Time: %f seconds\n\n", t); qsort(&(rl->t[0]), bootstrapsPerformed, sizeof(topolRELL*), compareTopolRell); restoreTL(rl, tr, 0); #endif t = gettime(); resetBranches(tr); evaluateGenericInitrav(tr, tr->start); modOpt(tr, adef, TRUE, adef->likelihoodEpsilon); slowSearches = bootstrapsPerformed / 5; if(bootstrapsPerformed % 5 != 0) slowSearches++; slowSearches = MIN(slowSearches, 10); #ifdef _WAYNE_MPI if(processes > 1) { if(slowSearches % processes == 0) slowSearches = slowSearches / processes; else slowSearches = (slowSearches / processes) + 1; } for(i = 0; i < slowSearches; i++) { j = i + n * processID; restoreTL(rl, tr, j); rl->t[j]->likelihood = unlikely; evaluateGenericInitrav(tr, tr->start); 
treeEvaluate(tr, 1.0); thoroughOptimization(tr, adef, rl, j); } #else for(i = 0; i < slowSearches; i++) { restoreTL(rl, tr, i); rl->t[i]->likelihood = unlikely; evaluateGenericInitrav(tr, tr->start); treeEvaluate(tr, 1.0); thoroughOptimization(tr, adef, rl, i); } #endif /*************************************************************************************************************/ if(tr->rateHetModel == CAT) { catToGamma(tr, adef); modOpt(tr, adef, TRUE, adef->likelihoodEpsilon); } bestIndex = -1; bestLH = unlikely; #ifdef _WAYNE_MPI for(i = 0; i < slowSearches; i++) { j = i + n * processID; restoreTL(rl, tr, j); resetBranches(tr); evaluateGenericInitrav(tr, tr->start); treeEvaluate(tr, 2); printBothOpen("Slow ML Search %d Likelihood: %f\n", j, tr->likelihood); if(tr->likelihood > bestLH) { bestLH = tr->likelihood; bestIndex = j; } } /*printf("processID = %d, bestIndex = %d; bestLH = %f\n", processID, bestIndex, bestLH);*/ #else for(i = 0; i < slowSearches; i++) { restoreTL(rl, tr, i); resetBranches(tr); evaluateGenericInitrav(tr, tr->start); treeEvaluate(tr, 2); printBothOpen("Slow ML Search %d Likelihood: %f\n", i, tr->likelihood); if(tr->likelihood > bestLH) { bestLH = tr->likelihood; bestIndex = i; } } #endif printBothOpenMPI("Slow ML optimization finished\n\n"); t = gettime() - t; #ifdef _WAYNE_MPI printBothOpen("Slow ML search on Process %d: Time %f seconds\n", processID, t); #else printBothOpen("Slow ML search Time: %f seconds\n", t); #endif t = gettime(); restoreTL(rl, tr, bestIndex); resetBranches(tr); evaluateGenericInitrav(tr, tr->start); treeEvaluate(tr, 2); Thorough = 1; tr->doCutoff = FALSE; treeOptimizeThorough(tr, 1, 10); evaluateGenericInitrav(tr, tr->start); modOpt(tr, adef, TRUE, adef->likelihoodEpsilon); t = gettime() - t; #ifdef _WAYNE_MPI printBothOpen("Thorough ML search on Process %d: Time %f seconds\n", processID, t); #else printBothOpen("Thorough ML search Time: %f seconds\n", t); #endif #ifdef _WAYNE_MPI bestLH = tr->likelihood; 
printf("\nprocessID = %d, bestLH = %f\n", processID, bestLH); if(processes > 1) { double *buffer; int bestProcess; buffer = (double *)rax_malloc(sizeof(double) * processes); for(i = 0; i < processes; i++) buffer[i] = unlikely; buffer[processID] = bestLH; for(i = 0; i < processes; i++) MPI_Bcast(&buffer[i], 1, MPI_DOUBLE, i, MPI_COMM_WORLD); bestLH = buffer[0]; bestProcess = 0; for(i = 1; i < processes; i++) if(buffer[i] > bestLH) { bestLH = buffer[i]; bestProcess = i; } rax_free(buffer); if(processID != bestProcess) { MPI_Finalize(); exit(0); } } #endif printBothOpen("\nFinal ML Optimization Likelihood: %f\n", tr->likelihood); printBothOpen("\nModel Information:\n\n"); printModelParams(tr, adef); strcpy(bestTreeFileName, workdir); strcat(bestTreeFileName, "RAxML_bestTree."); strcat(bestTreeFileName, run_id); Tree2String(tr->tree_string, tr, tr->start->back, TRUE, TRUE, FALSE, FALSE, TRUE, adef, SUMMARIZE_LH, FALSE, FALSE, FALSE, FALSE); f = myfopen(bestTreeFileName, "wb"); fprintf(f, "%s", tr->tree_string); fclose(f); if(adef->perGeneBranchLengths) printTreePerGene(tr, adef, bestTreeFileName, "w"); overallTime = gettime() - masterTime; mlTime = gettime() - mlTime; printBothOpen("\nML search took %f secs or %f hours\n", mlTime, mlTime / 3600.0); printBothOpen("\nCombined Bootstrap and ML search took %f secs or %f hours\n", overallTime, overallTime / 3600.0); printBothOpen("\nDrawing Bootstrap Support Values on best-scoring ML tree ...\n\n"); freeTL(rl); rax_free(rl); calcBipartitions(tr, adef, bestTreeFileName, bootstrapFileName); overallTime = gettime() - masterTime; printBothOpen("Program execution info written to %s\n", infoFileName); printBothOpen("All %d bootstrapped trees written to: %s\n\n", bootstrapsPerformed, bootstrapFileName); printBothOpen("Best-scoring ML tree written to: %s\n\n", bestTreeFileName); if(adef->perGeneBranchLengths && tr->NumberOfModels > 1) printBothOpen("Per-Partition branch lengths of best-scoring ML tree written to %s.PARTITION.0 to 
%s.PARTITION.%d\n\n", bestTreeFileName, bestTreeFileName, tr->NumberOfModels - 1); printBothOpen("Best-scoring ML tree with support values written to: %s\n\n", bipartitionsFileName); printBothOpen("Best-scoring ML tree with support values as branch labels written to: %s\n\n", bipartitionsFileNameBranchLabels); printBothOpen("Overall execution time for full ML analysis: %f secs or %f hours or %f days\n\n", overallTime, overallTime/3600.0, overallTime/86400.0); #ifdef _WAYNE_MPI MPI_Finalize(); #endif exit(0); }
//Use the plausibility checker overhead void plausibilityChecker(tree *tr, analdef *adef) { FILE *treeFile, *treeFile2, *rfFile; tree *smallTree = (tree *)rax_malloc(sizeof(tree)); char rfFileName[1024]; int numberOfTreesAnalyzed = 0, i; double avgRF = 0.0, sumEffectivetime = 0.0; /* set up an output file name */ strcpy(rfFileName, workdir); strcat(rfFileName, "RAxML_RF-Distances."); strcat(rfFileName, run_id); rfFile = myfopen(rfFileName, "wb"); assert(adef->mode == PLAUSIBILITY_CHECKER); /* open the big reference tree file and parse it */ treeFile = myfopen(tree_file, "r"); printBothOpen("Parsing reference tree %s\n", tree_file); treeReadLen(treeFile, tr, FALSE, TRUE, TRUE, adef, TRUE, FALSE); assert(tr->mxtips == tr->ntips); /*************************************************************************************/ /* Preprocessing Step */ double preprocesstime = gettime(); /* taxonToLabel[2*tr->mxtips - 2]; Array storing all 2n-2 labels from the preordertraversal: (Taxonnumber - 1) -> (Preorderlabel) */ int *taxonToLabel = (int *)rax_malloc((2*tr->mxtips - 2) * sizeof(int)), /* taxonHasDeg[2*tr->mxtips - 2] Array used to store the degree of every taxon, is needed to extract Bipartitions from multifurcating trees (Taxonnumber - 1) -> (degree of node(Taxonnumber)) */ *taxonHasDeg = (int *)rax_calloc((2*tr->mxtips - 2),sizeof(int)), /* taxonToReduction[2*tr->mxtips - 2]; Array used for reducing bitvector and speeding up extraction: (Taxonnumber - 1) -> Index in smallTreeTaxa (starting from 0) which is also: (Taxonnumber - 1) -> (0..1 (increment count of taxa appearing in small tree)) (Taxonnumber - 1) -> (0..1 (increment count of inner nodes appearing in small tree)) */ *taxonToReduction = (int *)rax_malloc((2*tr->mxtips - 2) * sizeof(int)); int newcount = 0; //counter used for correct traversals /* labelToTaxon[2*tr->mxtips - 2]; is used to translate between Perorderlabel and p->number: (Preorderlabel) -> (Taxonnumber) */ int *labelToTaxon = (int 
*)rax_malloc((2*tr->mxtips - 2) * sizeof(int)); /* Preorder-Traversal of the large tree */ preOrderTraversal(tr->start->back,tr->mxtips, tr->start->number, taxonToLabel, labelToTaxon, &newcount); newcount = 0; //counter set to 0 to be now used for Eulertraversal /* eulerIndexToLabel[4*tr->mxtips - 5]; Array storing all 4n-5 PreOrderlabels created during eulertour: (Eulerindex) -> (Preorderlabel) */ int* eulerIndexToLabel = (int *)rax_malloc((4*tr->mxtips - 5) * sizeof(int)); /* taxonToEulerIndex[tr->mxtips]; Stores all indices of the first appearance of a taxa in the eulerTour: (Taxonnumber - 1) -> (Index of the Eulertour where Taxonnumber first appears) is used for efficient computation of the Lowest Common Ancestor during Reconstruction Step */ int* taxonToEulerIndex = (int *)rax_malloc((tr->mxtips) * sizeof(int)); /* Init taxonToEulerIndex and taxonToReduction */ int ix; for(ix = 0; ix < tr->mxtips; ++ix) taxonToEulerIndex[ix] = -1; for(ix = 0; ix < (2*tr->mxtips - 2); ++ix) taxonToReduction[ix] = -1; /* Eulertraversal of the large tree*/ unrootedEulerTour(tr->start->back,tr->mxtips, eulerIndexToLabel, taxonToLabel, &newcount, taxonToEulerIndex); /* Creating RMQ Datastructure for efficient retrieval of LCAs, using Johannes Fischers Library rewritten in C Following Files: rmq.h,rmqs.c,rmqs.h are included in Makefile.RMQ.gcc */ RMQ_succinct(eulerIndexToLabel,4*tr->mxtips - 5); double preprocessendtime = gettime() - preprocesstime; /* Proprocessing Step End */ /*************************************************************************************/ printBothOpen("The reference tree has %d tips\n", tr->ntips); fclose(treeFile); /***********************************************************************************/ /* RF-OPT Preprocessing Step */ /***********************************************************************************/ /* now see how many small trees we have */ treeFile = getNumberOfTrees(tr, bootStrapFile, adef); treeFile2 = getNumberOfTrees(tr, 
bootStrapFile, adef); checkTreeNumber(tr->numberOfTrees, bootStrapFile); /* allocate a data structure for parsing the potentially mult-furcating tree */ allocateMultifurcations(tr, smallTree); /* Start Additional preprocessing step */ int numberOfBips = 0, numberOfSets = 0; //Stores the number of bips of each tree int *bipsPerTree = (int *)rax_malloc(tr->numberOfTrees * sizeof(int)); //Stores the number of taxa for each tree int *taxaPerTree = (int *)rax_malloc(tr->numberOfTrees * sizeof(int)); //To calculate all bipartitions, I created a new treeFile2 and a new getNumberOfTrees method!! for(i = 0; i < tr->numberOfTrees; i++) { int this_treeBips = readMultifurcatingTree(treeFile2, smallTree, adef, TRUE); numberOfBips = numberOfBips + this_treeBips; numberOfSets = numberOfSets + this_treeBips * this_treeBips; bipsPerTree[i] = this_treeBips; } printf("numberOfBips: %i , numberOfSets: %i \n \n", numberOfBips, numberOfSets); //stores induced bips (OLD?) unsigned int *ind_bips = (unsigned int *)rax_malloc(numberOfBips * sizeof(unsigned int)); //stores smalltree bips (OLD?) 
unsigned int *s_bips = (unsigned int *)rax_malloc(numberOfBips * sizeof(unsigned int)); //stores small bips per tree unsigned int ***sBipsPerTree = (unsigned int ***)rax_malloc(tr->numberOfTrees * sizeof(unsigned int**)); //stores induced bips per tree unsigned int ***indBipsPerTree = (unsigned int ***)rax_malloc(tr->numberOfTrees * sizeof(unsigned int**)); //stores vLength of each tree for processing bitVectors unsigned int *vectorLengthPerTree = (unsigned int *)rax_malloc(tr->numberOfTrees * sizeof(unsigned int*)); //stores the corresponding tree number for each bip int *treenumberOfBip = (int *)rax_malloc(numberOfBips * sizeof(int)); //Stores all dropsets of all trees int **sets = (int **)rax_malloc(numberOfSets * sizeof(int*)); //int **sets = NULL; //For each tree, stores a translation array from taxanumber smalltree->largetree int **smallTreeTaxaList = (int **)rax_malloc(tr->numberOfTrees * sizeof(int*)); //For each tree, store a translation array from taxanumber largetree->smalltree int **taxonToReductionList = (int **)rax_malloc(tr->numberOfTrees * sizeof(int*)); //I use these variables as global variables for all trees to determine the max number of possible sets to generate a static array int currentBips = 0; int currentSmallBips = 0; int currentSets = 0; //int currentTree = 0; already there in number of trees analyzed //Prefill sets with -1s for(int it = 0;it < (numberOfSets);it++){ int fill[1] = {-1}; sets[it] = fill; } /***********************************************************************************/ /* RF-OPT Preprocessing Step End */ /***********************************************************************************/ /* loop over all small trees */ for(i = 0; i < tr->numberOfTrees; i++) { int numberOfSplits = readMultifurcatingTree(treeFile, smallTree, adef, TRUE); if(numberOfSplits > 0) { int firstTaxon; double rec_rf, maxRF; if(numberOfTreesAnalyzed % 100 == 0) printBothOpen("Small tree %d has %d tips and %d bipartitions\n", i, smallTree->ntips, 
numberOfSplits); /* compute the maximum RF distance for computing the relative RF distance later-on */ /* note that here we need to pay attention, since the RF distance is not normalized by 2 * (n-3) but we need to account for the fact that the multifurcating small tree will potentially contain less bipartitions. Hence the normalization factor is obtained as n-3 + numberOfSplits, where n-3 is the number of bipartitions of the pruned down large reference tree for which we know that it is bifurcating/strictly binary */ maxRF = (double)(2 * numberOfSplits); /* now get the index of the first taxon of the small tree. we will use this to unambiguously store the bipartitions */ firstTaxon = smallTree->start->number; //Saves the number of taxa in the tree (for RF-OPT) taxaPerTree[numberOfTreesAnalyzed] = smallTree->ntips; /***********************************************************************************/ /* Reconstruction Step */ double time_start = gettime(); /* Init hashtable to store Bipartitions of the induced subtree T|t_i */ /* using smallTree->ntips instead of smallTree->mxtips yields faster code e.g. 
120 versus 128 seconds for 20,000 small trees on my laptop */ hashtable *s_hash = initHashTable(smallTree->ntips * 4); /* Init hashtable to store Bipartitions of the reference tree t_i*/ hashtable *ind_hash = initHashTable(smallTree->ntips * 4); /* smallTreeTaxa[smallTree->ntips]; Stores all taxa numbers from smallTree into an array called smallTreeTaxa: (Index) -> (Taxonnumber) */ int* smallTreeTaxa = (int *)rax_malloc((smallTree->ntips) * sizeof(int)); /* counter is set to 0 for correctly extracting taxa of the small tree */ newcount = 0; int newcount2 = 0; /* seq2[2*smallTree->ntips - 2]; stores PreorderSequence of the reference smalltree: (Preorderindex) -> (Taxonnumber) */ int* seq2 = (int *)rax_malloc((2*smallTree->ntips - 2) * sizeof(int)); /* used to store the vectorLength of the bitvector */ unsigned int vectorLength; /* extract all taxa of the smalltree and store it into an array, also store all counts of taxa and nontaxa in taxonToReduction */ rec_extractTaxa(smallTreeTaxa, taxonToReduction, smallTree->start, smallTree->mxtips, &newcount, &newcount2); rec_extractTaxa(smallTreeTaxa, taxonToReduction, smallTree->start->back, smallTree->mxtips, &newcount, &newcount2); /* counter is set to 0 to correctly preorder traverse the small tree */ newcount = 0; /* Preordertraversal of the small reference tree and save its sequence into seq2 for later extracting the bipartitions, it also stores information about the degree of every node */ rec_preOrderTraversalMulti(smallTree->start->back,smallTree->mxtips, smallTree->start->number, seq2, taxonHasDeg, &newcount); /* calculate the bitvector length */ if(smallTree->ntips % MASK_LENGTH == 0) vectorLength = smallTree->ntips / MASK_LENGTH; else vectorLength = 1 + (smallTree->ntips / MASK_LENGTH); /***********************************************************************************/ /* RF-OPT Additional Preprocessing storing Bipartitions */ 
/***********************************************************************************/ vectorLengthPerTree[numberOfTreesAnalyzed] = vectorLength; unsigned int **bitVectors = rec_initBitVector(smallTree, vectorLength); unsigned int **sBips; /* store all non trivial bitvectors using an subtree approach for the reference subtree and store it into a hashtable, this method was changed for multifurcation */ sBips = RFOPT_extractBipartitionsMulti(bitVectors, seq2, newcount,tr->mxtips, vectorLength, smallTree->ntips, firstTaxon, s_hash, taxonToReduction, taxonHasDeg, numberOfSplits); sBipsPerTree[numberOfTreesAnalyzed] = sBips; /***********************************************************************************/ /* End RF-OPT Additional Preprocessing storing Bipartitions */ /***********************************************************************************/ /* counter is set to 0 to be used for correctly storing all EulerIndices */ newcount = 0; /* smallTreeTaxonToEulerIndex[smallTree->ntips]; Saves all first Euler indices for all Taxons appearing in small Tree: (Index) -> (Index of the Eulertour where the taxonnumber of the small tree first appears) */ int* smallTreeTaxonToEulerIndex = (int *)rax_malloc((smallTree->ntips) * sizeof(int)); /* seq[(smallTree->ntips*2) - 1] Stores the Preordersequence of the induced small tree */ int* seq = (int *)rax_malloc((2*smallTree->ntips - 1) * sizeof(int)); /* iterate through all small tree taxa */ for(ix = 0; ix < smallTree->ntips; ix++) { int taxanumber = smallTreeTaxa[ix]; /* To create smallTreeTaxonToEulerIndex we filter taxonToEulerIndex for taxa in the small tree*/ smallTreeTaxonToEulerIndex[newcount] = taxonToEulerIndex[taxanumber-1]; /* Saves all Preorderlabel of the smalltree taxa in seq*/ seq[newcount] = taxonToLabel[taxanumber-1]; newcount++; } /* sort the euler indices to correctly calculate LCA */ //quicksort(smallTreeTaxonToEulerIndex,0,newcount - 1); qsort(smallTreeTaxonToEulerIndex, newcount, sizeof(int), 
sortIntegers); //printf("newcount2 %i \n", newcount2); /* Iterate through all small tree taxa */ for(ix = 1; ix < newcount; ix++) { /* query LCAs using RMQ Datastructure */ seq[newcount - 1 + ix] = eulerIndexToLabel[query(smallTreeTaxonToEulerIndex[ix - 1],smallTreeTaxonToEulerIndex[ix])]; /* Used for dynamic programming. We save an index for every inner node: For example the reference tree has 3 inner nodes which we saves them as 0,1,2. Now we calculate for example 5 LCA to construct the induced subtree, which are also inner nodes. Therefore we mark them as 3,4,5,6,7 */ taxonToReduction[labelToTaxon[seq[newcount - 1 + ix]] - 1] = newcount2; newcount2 += 1; } /* sort to construct the Preordersequence of the induced subtree */ //quicksort(seq,0,(2*smallTree->ntips - 2)); qsort(seq, (2 * smallTree->ntips - 2) + 1, sizeof(int), sortIntegers); /* calculates all bipartitions of the reference small tree and count how many bipartition it shares with the induced small tree and stores those bipartitions in a additional hashtable called ind_hash */ int rec_bips = 0; unsigned int **indBips; indBips = RFOPT_findAddBipartitions(bitVectors, seq,(2*smallTree->ntips - 1), labelToTaxon, tr->mxtips, vectorLength, smallTree->ntips, firstTaxon, s_hash, ind_hash, taxonToReduction); indBipsPerTree[numberOfTreesAnalyzed] = indBips; /* calculates all bipartitions of the reference small tree and put them into ind_hash*/ // rec_extractBipartitionsMulti(bitVectors, seq2, (2*smallTree->ntips - 1),tr->mxtips, vectorLength, smallTree->ntips, // firstTaxon, s_hash, taxonToReduction, taxonHasDeg, numberOfSplits); /* Reconstruction Step End */ /***********************************************************************************/ double effectivetime = gettime() - time_start; /* if(numberOfTreesAnalyzed % 100 == 0) printBothOpen("Reconstruction time: %.10f secs\n\n", effectivetime); */ /* compute the relative RF */ /***********************************************************************************/ 
/* RF-OPT Save Translation Vectors */ /***********************************************************************************/ //copy array taxonToReduction because it is originally defined in preprocessing step int * taxonToReductionCopy = (int *)rax_malloc((tr->mxtips)*sizeof(int)); memcpy(taxonToReductionCopy,taxonToReduction,(tr->mxtips)*sizeof(int)); //storing smallTree and taxonToReduction Arrays for further usage smallTreeTaxaList[numberOfTreesAnalyzed] = smallTreeTaxa; taxonToReductionList[numberOfTreesAnalyzed] = taxonToReductionCopy; int this_currentSmallBips = 0; //Variable resets everytime for each tree analyzed /***********************************************************************************/ /* End RF-OPT Save Translation Vectors */ /***********************************************************************************/ rec_rf = (double)(2 * (numberOfSplits - rec_bips)) / maxRF; assert(numberOfSplits >= rec_bips); avgRF += rec_rf; sumEffectivetime += effectivetime; //if(numberOfTreesAnalyzed % 100 == 0) printBothOpen("Relative RF tree %d: %f\n\n", i, rec_rf); fprintf(rfFile, "%d %f\n", i, rec_rf); //rax_free(smallTreeTaxa); //Need it for calculating the SmallTreeTaxaList after all iterations! 
rax_free(seq); rax_free(seq2); rax_free(smallTreeTaxonToEulerIndex); numberOfTreesAnalyzed++; //Counting the number of trees analyzed } }// End of Small Tree Iterations /***********************************************************************************/ /* RF-OPT DropSet Calculation using BitVectors */ /***********************************************************************************/ log_info("===> Create DropSet Datastructure \n"); static Hashmap* map = NULL; //Set a hashmap for dropsets with a dropset comparision and standard hash map = Hashmap_create(compareDropSet, NULL); static Hashmap** mapArray = NULL; //Set an array to store the pointers to bitvector hashtables for each tree mapArray = rax_malloc(tr->numberOfTrees * sizeof(Hashmap*)); printf("===> BitVector Set Calculation \n"); //Calculate dropsets of two given bips lists and extract all sets into array sets and into a hashmap. Each set has following format //dropset = {taxa_1,taxa_2,...,taxa_n,-1}; //Furtheremore calculate Dropset generates two data structures from type bips and dropsets which are pointing to each other in hashtables calculateDropSets(mapArray, map, indBipsPerTree, sBipsPerTree, sets, smallTreeTaxaList, bipsPerTree, taxaPerTree, vectorLengthPerTree, tr->numberOfTrees); /***********************************************************************************/ /* RF-OPT Graph Construction */ /***********************************************************************************/ // printf("\n == Sets == \n"); // for(int fooo = 0; fooo < numberOfSets; fooo++){ // printf("Set %i: ", fooo); // int i = 0; // while(sets[fooo][i] > -1) { // printf("%i ",sets[fooo][i]); // i++; // } // printf("\n"); // } // printf("\n"); /* Filter for unique sets */ log_info("===> Hashmap tests...\n"); Hashmap_traverse(map, traverse_cb); // int key[2] = {0,-1}; // Dropset* drop = Hashmap_get(map,key); // DArray* bips = drop->bipartitions; // for(int i = 0; i < DArray_count(bips); i++) { // Bipartition* bip = 
DArray_get(bips,i); // printBitVector(bip->bitvector[0]); // printf("matching: %i \n", bip->matching); // printf("tree: %i \n", bip->treenumber); // } // Bipartition* bipFromHash = DArray_first(bips); // Bipartition* testBip = Hashmap_get(mapArray[0],bipFromHash->bitvector); // printf("matching before: %i",testBip->matching); // testBip->matching = 999; // for(int i = 0; i < DArray_count(bips); i++) { // Bipartition* bip = DArray_get(bips,i); // printBitVector(bip->bitvector[0]); // printf("matching: %i \n", bip->matching); // printf("tree: %i \n", bip->treenumber); // } printf("===> Filter for unique sets (naive)...\n"); /* unique sets array data structures */ int** uniqSets = (int **) rax_malloc(sizeof(int*) * numberOfSets); int* setsToUniqSets = (int*) rax_malloc(sizeof(int) * numberOfSets); int numberOfUniqueSets = 0; int dropSetCount = 0; //stores the scores for each bips, we are using a bitvector approach (need to scale) //Legacy Code int bvec_scores = 0; numberOfUniqueSets = getUniqueDropSets(sets, uniqSets, setsToUniqSets, numberOfSets); printf("number of unique sets: %i \n", numberOfUniqueSets); /* Detect initial matchings, we calculate them using bitvectors to represent our bipartitions */ printf("===> Detect initial matchings...\n"); int vLengthBip = 0; //determine the bitVector Length of our bitVector if(numberOfBips % MASK_LENGTH == 0) vLengthBip = numberOfBips / MASK_LENGTH; else vLengthBip = numberOfBips / MASK_LENGTH + 1; //Initialize a bvecScore vector with 0s int* bvecScores = (int*)rax_calloc(vLengthBip,sizeof(int)); //Calculate Initial Matchings and save the result in bvecScores detectInitialMatchings(sets, bvecScores, bipsPerTree, numberOfTreesAnalyzed, vLengthBip); //Short summary until now: // - bipsPerTree consists of all bipartitions per tree // - bvecScores is the bitvector setting 1 to all bipartition indices which can score // - taxaPerTree number of taxa per tree // - smallTreeTaxaList list of all smalltree->largetree translation arrays 
/* Generate useful data structures for calculating and updating scores */ printf("===> Create data structures...\n"); //Stores the number of bips per Set and initialize it with 0s int* numberOfBipsPerSet = (int*)rax_calloc(numberOfUniqueSets,sizeof(int)); //Stores all sets which includes this taxa int **setsOfTaxa = (int**)rax_malloc((tr->mxtips + 1) *sizeof(int*)); //Now calculate number of bipartitions affected by each unique set for(int i = 0; i < numberOfSets; i++) { int setindex = setsToUniqSets[i]; numberOfBipsPerSet[setindex]++; } //Now using the knowledge of how many bips there are per set, generate an array for each unique dropset containing all bips int** bipsOfDropSet = (int**)rax_malloc(sizeof(int*)*numberOfUniqueSets); //Allocate the space needed for storing all bips for(int i = 0; i < numberOfUniqueSets; i++) { bipsOfDropSet[i] = (int*)rax_malloc(sizeof(int)*numberOfBipsPerSet[i]); } printf("==> Initialize the Bips Of Taxa \n"); //Stores the number of bips each taxa is included (ABC|DE is stored by A,B,C,D and E) //It can be calculated by iterating through all trees and adding the taxa int **bipsOfTaxa = (int**)rax_malloc((tr->mxtips + 1) * sizeof(int*)); int *numberOfBipsPerTaxa = (int*)rax_calloc((tr->mxtips + 1), sizeof(int)); int *taxaBipsCounter = (int*)rax_calloc((tr->mxtips + 1), sizeof(int)); //Now add up all for (int tree = 0; tree < tr->numberOfTrees; tree++) { int* list = smallTreeTaxaList[tree]; for (int j = 0; j < taxaPerTree[tree]; j++) { int taxa = list[j]; numberOfBipsPerTaxa[taxa] = numberOfBipsPerTaxa[taxa] + bipsPerTree[tree]; } } //Now create dummy arrays inside bipsOfTaxa for(int i = 1; i < tr->mxtips+1; i++) { bipsOfTaxa[i] = (int*)rax_malloc(sizeof(int)*numberOfBipsPerTaxa[i]); } printf("==> Storing all bip indices of a certain dropset into an array \n"); //For checking if all dropsets are iterated dropSetCount = 0; //Arrays of counter to keep in track int* counterOfSet = (int*)rax_malloc(sizeof(int)*numberOfUniqueSets); for(int 
i = 0; i < numberOfUniqueSets; i++) { counterOfSet[i] = 0; } currentBips = 0; //Need to keep in track of the number of bips //First iterate through all trees for(int i = 0; i < numberOfTreesAnalyzed; i++ ) { //get the correct smallTreeTaxa List int* list = smallTreeTaxaList[i]; //For each bipartition in the tree for(int j = 0; j < bipsPerTree[i]; j++) { //Look at all bips it is compared too int dropSetsPerBip = bipsPerTree[i]; for(int k = 0; k < dropSetsPerBip; k++){ int indexOfUniqDropSet = setsToUniqSets[dropSetCount + k]; int* bips_array = bipsOfDropSet[indexOfUniqDropSet]; //add bipartition j into the bips array of its dropset bips_array[counterOfSet[indexOfUniqDropSet]] = currentBips; //increment the internal array index counterOfSet[indexOfUniqDropSet]++; } //Jump to the next correct dropSetCount! dropSetCount = dropSetCount + dropSetsPerBip; //now insert the bip into bipsOfTaxa Array for(int ix = 0; ix < taxaPerTree[i]; ix++) { //get the taxa number int stree_Taxa = list[ix]; //get the bips list of this taxa number int* bipsList = bipsOfTaxa[stree_Taxa]; //now get the position of the biplist and put in our bip index bipsList[taxaBipsCounter[stree_Taxa]] = currentBips; //increment the counter taxaBipsCounter[stree_Taxa]++; } //increment currentBips currentBips++; } } /***********************************************************************************/ /* End RF-OPT Graph Construction */ /***********************************************************************************/ /* Short summary : sets - array of all dropsets uniqSets - array of all unique dropsets bipsPerTree - bips per tree setsToUniqSets - translates the index of sets to the index of its unique dropset index bipsOfDropSets - all bips which disappear when dropset i is pruned scores - has all scores between 0 and 1 for the bips (however 0s can be found out by looking at all dropsets with link to dropset 0 (because we sort and it will always be the lowest)) */ 
/***********************************************************************************/ /* RF-OPT Initial Score Calculation */ /***********************************************************************************/ unsigned int bipsVectorLength; /* calculate the bitvector length for bips bitvector */ if(numberOfBips % MASK_LENGTH == 0) bipsVectorLength = numberOfBips / MASK_LENGTH; else bipsVectorLength = 1 + (numberOfBips / MASK_LENGTH); //Starting from index 1 (because 0 stands for all who already matches) //We need a score array saving the scores for each uniqset int* rf_score = (int*)rax_calloc(numberOfUniqueSets,sizeof(int)); printf("==> Calculating the score for the first iteration \n \n"); //Store all bvecs of all merged and destroyed bipartitions per DropSet int* bvecs_bips = (int*)rax_malloc(sizeof(int)*numberOfUniqueSets); int* bvecs_destroyed = (int*)rax_malloc(sizeof(int)*numberOfUniqueSets); //Iterate through all sets for(int i = 0; i < numberOfUniqueSets; i++) { //Bitvectors of merged and destroyed int bvec_destroyed = 0; int* set = uniqSets[i]; //Get the dropset, first dropset is 0 (if something is matching) //printf(" ==> Analyze Unique DropSet %i \n", i); //We use this data structure to keep track of the to toggled bits int* toggleBits = (int*)rax_calloc(numberOfBips, sizeof(int)); //Now iterate through the set int j = 0; //Stores the affected bips into a bitvector int bvec_bips = 0; while(set[j] != -1) { int taxa = set[j]; //Get the taxa //printf(" Taxa number is %i \n",taxa); //Check if set[j] is itself already a set int test[2] = {taxa,-1}; //0 if it is not a set, index + 1 otherwise int test_index = contains(test, uniqSets, numberOfUniqueSets); if(test_index){ //printf(" It also is in uniqSet %i \n", test_index - 1); bvec_bips = getBipsOfDropSet(bvec_bips, (test_index - 1), numberOfBipsPerSet, bipsOfDropSet); } //Get all bips of this taxa to detect which one will be destroyed int* listOfBips = bipsOfTaxa[taxa]; //Go through all bipartitions 
containing this taxa for(int k = 0; k < numberOfBipsPerTaxa[taxa]; k++){ int bipindex = listOfBips[k]; //Get the index of the Bipartition int bip = ind_bips[bipindex]; //Now analyze this Bipartition //Which tree does this bipartition belongs too? int treenumber = treenumberOfBip[bipindex]; //Get the taxonToSmallTree Array of this tree int* stTaxa = taxonToReductionList[treenumber]; //Translate the global taxon number it into the local index used by our bips int translated_index = stTaxa[taxa - 1]; //We use taxa - 1 because we start counting at taxa 1 = 0 ! //Save the to toggle index into toggleBits vector toggleBits[bipindex] |= 1 << translated_index; //Sort for bits set on one side of the bip and on the other side int leftBits = __builtin_popcount(toggleBits[bipindex] & bip); int rightBits = __builtin_popcount(toggleBits[bipindex]) - leftBits; //Check for the number of bits set in the original bip int leftBip = __builtin_popcount(bip); int rightBip = taxaPerTree[treenumber] - leftBip; //Subtract the total number of bits set on one side of the bip with the bits we have to toggle int leftBip_after = leftBip - leftBits; int rightBip_after = rightBip - rightBits; //Check if bipartition gets trivial/destroyed due to pruning the taxa and set the bit (representing the bip) which is destroyed if((leftBip_after <= 1) | (rightBip_after <=1)) { //Add bips to the bits which represent destroyed bipartitions bvec_destroyed = setBit(bvec_destroyed,bipindex); } } j++; }//End iterate through the set int penality = 0; int score = 0; int bvec_mask = 0; bvec_mask = setOffSet(bvec_mask, numberOfBips); //Bitvector of already matching bips int bvec_tmp = 0; bvec_tmp = ~bvec_scores & bvec_mask; //Penality score are all bitvectors who were matching but is destroyed penality = __builtin_popcount(bvec_destroyed & bvec_tmp); //Now iterate through bipsOfDropSet list and extract all bips which will merge into a bitVector bvec_bips = getBipsOfDropSet(bvec_bips, i, numberOfBipsPerSet, 
bipsOfDropSet); //Calculate the bitvectors which remains bvec_tmp = ~bvec_destroyed & bvec_mask; bvec_tmp = bvec_bips & bvec_tmp; score = __builtin_popcount(bvec_scores & bvec_tmp); rf_score[i] = score - penality; //Save our results for convenience into an array bvecs_bips[i] = bvec_bips; bvecs_destroyed[i] = bvec_destroyed; }//End Score Calculation printf("======> Scores:\n"); for(int i = 0; i < numberOfUniqueSets; i++) { printf("RF Score for %i : %i \n", i, rf_score[i]); //printBitVector(bvecs_bips[i]); //printBitVector(bvecs_destroyed[i]); } int maxDropSet = getMax(rf_score, numberOfUniqueSets); printf("Max Element is %i \n", maxDropSet); /***********************************************************************************/ /* RF-OPT Create Update Data Structures */ /***********************************************************************************/ printf("====> Delete DropSet from all bips and update numbers \n"); //Create a bitVector to store all deleted taxa int bvec_deletedTaxa = 0; //Create a bitVector to store all still existing bips int bvec_existingBips = 0; //Create a bitvector to store deleted dropsets int bvec_deletedDropSets = 0; //Get the dropset int* deleteDropSet = uniqSets[maxDropSet]; //Store it into a BitVector bvec_deletedDropSets = setBit(bvec_deletedDropSets,maxDropSet); //Select all bips destroyed by removing this dropset int bvec_destroyedBips = bvecs_destroyed[maxDropSet]; //Select all bips that are now matching int bvec_matchingBips = bvecs_bips[maxDropSet]; //Filter for existent bipartitions bvec_existingBips = getExistingBips(bvec_existingBips, numberOfBips, bvec_destroyedBips); //Iterate through its taxa int iterSet = 0; while(deleteDropSet[iterSet] != -1) { //Get taxon int taxon = deleteDropSet[iterSet]; //Store the taxon into deletedTaxa BitVector bvec_deletedTaxa = setBit(bvec_deletedTaxa, taxon - 1); //Check if taxon is inside int test[2] = {taxon, -1}; int index = contains(test, uniqSets, numberOfUniqueSets); iterSet++; } 
//printBitVector(bvec_existingBips); //printBitVector(bvec_deletedTaxa); //Update the scores with now matching bips bvec_scores = bvec_scores & (~bvec_matchingBips); //printBitVector(bvec_scores); /* Short summary : bvec_existingBips - bitVector of all still existing bips bvec_deletedTaxa - bitVector to keep track of destroyed taxa */ /***********************************************************************************/ /* TODO RF-OPT Update function */ /***********************************************************************************/ /***********************************************************************************/ /* End RF-OPT Update function */ /***********************************************************************************/ //printf("Ind Bipartitions?: "); // printf("Induced Bipartitions: "); // printBitVector(ind_bips[0]); // printBitVector(ind_bips[1]); // printBitVector(ind_bips[2]); // printBitVector(ind_bips[3]); // printBitVector(ind_bips[4]); // printBitVector(ind_bips[5]); // printBitVector(ind_bips[6]); /***********************************************************************************/ /* Console Logs for debugging */ /***********************************************************************************/ //Printing if printf("==> Unique Sets: "); for(int i = 0; i < numberOfUniqueSets; i++) { int j = 0; int* set = uniqSets[i]; while(set[j] > -1) { printf("%i ",set[j]); j++; } printf("; "); } printf("\n"); printf("\n == Sets == \n"); for(int fooo = 0; fooo < numberOfSets; fooo++){ printf("Set %i: ", fooo); int i = 0; while(sets[fooo][i] > -1) { printf("%i ",sets[fooo][i]); i++; } printf("\n"); } printf("\n"); //#define _PRINT_ #ifdef _PRINT_ for(int i = 0; i < numberOfUniqueSets; i++) { printf("Bips of Set %i: ", i); for(int j = 0; j < numberOfBipsPerSet[i]; j++) { int* bips = bipsOfDropSet[i]; printf("%i ", bips[j]); } printf("\n"); } printf("Induced Bips! 
\n"); // Now checking which dropset would destroy which bipartition for(int i = 0 ; i < numberOfBips; i++) { printf("Bip %i is %i \n",i,ind_bips[i]); } printf("Taxa Names : \n"); for(int i = 0; i < tr->mxtips + 1; i++) { printf("%s ",tr->nameList[i]); } printf("\n"); printf("Small Tree Taxa Names 0 : \n"); for(int i = 0; i < taxaPerTree[0]; i++) { int* list = smallTreeTaxaList[0]; int taxa = list[i]; printf("%s ",tr->nameList[taxa]); } printf("\n"); printf("Small Tree Taxa Names 1 : \n"); for(int i = 0; i < taxaPerTree[1]; i++) { int* list = smallTreeTaxaList[1]; int taxa = list[i]; printf("%s ",tr->nameList[taxa]); } printf("\n"); printf("Small Tree Taxa Names 2 : \n"); for(int i = 0; i < taxaPerTree[2]; i++) { int* list = smallTreeTaxaList[2]; int taxa = list[i]; printf("%s ",tr->nameList[taxa]); } printf("\n"); printf("Number of DropSets extracted%i \n",dropSetCount); printf("Number of Bips extracted %i \n",currentBips); //Testing ... printf("Number of Sets is %i \n",numberOfSets); printf("Number of Unique Sets is %i \n",numberOfUniqueSets); printf("==> Testing bips of unique sets \n"); for(int i = 0; i < numberOfUniqueSets; i++) { printf("Bips of Set %i: ", i); for(int j = 0; j < numberOfBipsPerSet[i]; j++) { int* bips = bipsOfDropSet[i]; printf("%i ", bips[j]); } printf("\n"); } printf("==> Testing bips of taxa \n"); for(int i = 1; i < tr->mxtips + 1; i++) { printf("Bips of Taxa %i: ", i); for(int j = 0; j < numberOfBipsPerTaxa[i]; j++) { int* bips = bipsOfTaxa[i]; printf("%i ", bips[j]); } printf("\n"); } printf("==> Unique Sets: "); for(int i = 0; i < numberOfUniqueSets; i++) { int j = 0; int* set = uniqSets[i]; while(set[j] > -1) { printf("%i ",set[j]); j++; } printf("; "); } printf("\n"); printf("==> setsToUniqSets: "); for(int i = 0; i < numberOfSets; i++) { printf("%i ",setsToUniqSets[i]); } printf("\n"); //=== TREE GRAPH CONSTRUCTION ENDS === printf("Scores: "); printBitVector(bvec_scores); printf("BipsPerTree: "); for(int foo = 0; foo < 
tr->numberOfTrees; foo++) { printf("%i ",bipsPerTree[foo]); } printf("\nInduced Bips: "); for(int foo = 0;foo < numberOfBips; foo++) { printf("%u ",ind_bips[foo]); } printf("\nSmall Tree Bips: "); for(int foo = 0;foo < numberOfBips; foo++) { printf("%u ",s_bips[foo]); } printf("\n == Sets == \n"); for(int fooo = 0; fooo < numberOfSets; fooo++){ printf("Set %i: ", fooo); int i = 0; while(sets[fooo][i] > -1) { printf("%i ",sets[fooo][i]); i++; } printf("\n"); } printf("\n"); #endif printBothOpen("Number of small trees skipped: %d\n\n", tr->numberOfTrees - numberOfTreesAnalyzed); printBothOpen("Average RF distance %f\n\n", avgRF / (double)numberOfTreesAnalyzed); printBothOpen("Large Tree: %i, Number of SmallTrees analyzed: %i \n\n", tr->mxtips, numberOfTreesAnalyzed); printBothOpen("Total execution time: %f secs\n\n", gettime() - masterTime); printBothOpen("File containing all %d pair-wise RF distances written to file %s\n\n", numberOfTreesAnalyzed, rfFileName); printBothOpen("execution stats:\n\n"); printBothOpen("Accumulated time Effective algorithm: %.5f sec \n", sumEffectivetime); printBothOpen("Average time for effective: %.10f sec \n",sumEffectivetime / (double)numberOfTreesAnalyzed); printBothOpen("Preprocessingtime: %0.5f sec \n\n", preprocessendtime); fclose(treeFile); fclose(rfFile); /* free the data structure used for parsing the potentially multi-furcating tree */ freeMultifurcations(smallTree); rax_free(smallTree); rax_free(taxonToLabel); rax_free(taxonToEulerIndex); rax_free(labelToTaxon); rax_free(eulerIndexToLabel); rax_free(taxonToReduction); rax_free(taxonHasDeg); }
/*
 * Compute marginal ancestral states for a rooted tree and write three output
 * files into the working directory:
 *   - RAxML_marginalAncestralProbabilities.<run_id>
 *   - RAxML_marginalAncestralStates.<run_id>
 *   - RAxML_nodeLabelledRootedTree.<run_id>
 *
 * tr                   tree; tr->leftRootNode and tr->rightRootNode must be
 *                      each other's back pointers (asserted below)
 * referenceLikelihood  likelihood the re-evaluated tree is compared against;
 *                      a deviation of more than 0.5 is treated as fatal
 * adef                 not read by the current implementation
 */
void computeAncestralStates(tree *tr, double referenceLikelihood, analdef *adef)
{
  int
    counter = 0,
    probsLen,
    statesLen,
    treeLen;

  char
    treeFileName[2048],
    ancestralProbsFileName[2048],
    ancestralStatesFileName[2048];

  FILE
    *treeFile,
    *probsFile,
    *statesFile;

#ifdef _USE_PTHREADS
  tr->ancestralStates = (double*)malloc(getContiguousVectorLength(tr) * sizeof(double));

  /* the allocation was previously used without being checked */
  if(tr->ancestralStates == (double*)NULL)
    {
      printf("Error: out of memory while allocating the ancestral state vector\n");
      assert(0);
    }
#endif

  /*  assert(!adef->compressPatterns);*/

  /* Build "<workdir><prefix><run_id>" with bounded writes; the previous
     strcpy/strcat chains could run past the end of the fixed-size buffers
     for a long working directory or run ID. */
  probsLen  = snprintf(ancestralProbsFileName,  sizeof(ancestralProbsFileName),
                       "%sRAxML_marginalAncestralProbabilities.%s", workdir, run_id);
  statesLen = snprintf(ancestralStatesFileName, sizeof(ancestralStatesFileName),
                       "%sRAxML_marginalAncestralStates.%s", workdir, run_id);
  treeLen   = snprintf(treeFileName,            sizeof(treeFileName),
                       "%sRAxML_nodeLabelledRootedTree.%s", workdir, run_id);

  /* snprintf returns the length it wanted to write: >= buffer size means truncation */
  if(probsLen  < 0 || (size_t)probsLen  >= sizeof(ancestralProbsFileName)  ||
     statesLen < 0 || (size_t)statesLen >= sizeof(ancestralStatesFileName) ||
     treeLen   < 0 || (size_t)treeLen   >= sizeof(treeFileName))
    {
      printf("Error: output file name built from working directory and run ID is too long\n");
      assert(0);
    }

  probsFile  = myfopen(ancestralProbsFileName,  "w");
  statesFile = myfopen(ancestralStatesFileName, "w");
  treeFile   = myfopen(treeFileName,            "w");

  assert(tr->leftRootNode == tr->rightRootNode->back);

  /* two passes with the last flag FALSE (one per root side), then one pass
     from the right root node with the flag TRUE */
  computeAncestralRec(tr, tr->leftRootNode,  &counter, probsFile, statesFile, FALSE);
  computeAncestralRec(tr, tr->rightRootNode, &counter, probsFile, statesFile, FALSE);
  computeAncestralRec(tr, tr->rightRootNode, &counter, probsFile, statesFile, TRUE);

  /* sanity check: the likelihood must still match the reference value */
  evaluateGeneric(tr, tr->rightRootNode);

  if(fabs(tr->likelihood - referenceLikelihood) > 0.5)
    {
      printf("Something suspiciuous is going on with the marginal ancestral probability computations\n");
      assert(0);
    }

  assert(counter == tr->mxtips - 1);

  ancestralTree(tr->tree_string, tr);

  fprintf(treeFile, "%s\n", tr->tree_string);

  fclose(probsFile);
  fclose(statesFile);
  fclose(treeFile);

  printBothOpen("Marginal Ancestral Probabilities written to file:\n%s\n\n", ancestralProbsFileName);
  printBothOpen("Ancestral Sequences based on Marginal Ancestral Probabilities written to file:\n%s\n\n", ancestralStatesFileName);
  printBothOpen("Node-laballed ROOTED tree written to file:\n%s\n", treeFileName);
}
int treeReadLen (FILE *fp, tree *tr, boolean readBranches, boolean readNodeLabels, boolean topologyOnly, analdef *adef, boolean completeTree) { nodeptr p; int i, ch, lcount = 0; for (i = 1; i <= tr->mxtips; i++) { tr->nodep[i]->back = (node *) NULL; if(topologyOnly) tr->nodep[i]->support = -1; } for(i = tr->mxtips + 1; i < 2 * tr->mxtips; i++) { tr->nodep[i]->back = (nodeptr)NULL; tr->nodep[i]->next->back = (nodeptr)NULL; tr->nodep[i]->next->next->back = (nodeptr)NULL; tr->nodep[i]->number = i; tr->nodep[i]->next->number = i; tr->nodep[i]->next->next->number = i; if(topologyOnly) { tr->nodep[i]->support = -2; tr->nodep[i]->next->support = -2; tr->nodep[i]->next->next->support = -2; } } if(topologyOnly) tr->start = tr->nodep[tr->mxtips]; else tr->start = tr->nodep[1]; tr->ntips = 0; tr->nextnode = tr->mxtips + 1; for(i = 0; i < tr->numBranches; i++) tr->partitionSmoothed[i] = FALSE; tr->rooted = FALSE; p = tr->nodep[(tr->nextnode)++]; while((ch = treeGetCh(fp)) != '('); if(!topologyOnly) assert(readBranches == FALSE && readNodeLabels == FALSE); if (! addElementLen(fp, tr, p, readBranches, readNodeLabels, &lcount)) assert(0); if (! treeNeedCh(fp, ',', "in")) assert(0); if (! addElementLen(fp, tr, p->next, readBranches, readNodeLabels, &lcount)) assert(0); if (! tr->rooted) { if ((ch = treeGetCh(fp)) == ',') { if (! addElementLen(fp, tr, p->next->next, readBranches, readNodeLabels, &lcount)) assert(0); } else { /* A rooted format */ tr->rooted = TRUE; if (ch != EOF) (void) ungetc(ch, fp); } } else { p->next->next->back = (nodeptr) NULL; } if (! treeNeedCh(fp, ')', "in")) assert(0); if(topologyOnly) assert(!(tr->rooted && readNodeLabels)); (void) treeFlushLabel(fp); if (! treeFlushLen(fp)) assert(0); if (! treeNeedCh(fp, ';', "at end of")) assert(0); if (tr->rooted) { assert(!readNodeLabels); p->next->next->back = (nodeptr) NULL; tr->start = uprootTree(tr, p->next->next, FALSE, FALSE); if (! 
tr->start) { printf("FATAL ERROR UPROOTING TREE\n"); assert(0); } } else tr->start = findAnyTip(p, tr->rdta->numsp); if(!topologyOnly) { setupPointerMesh(tr); assert(tr->ntips <= tr->mxtips); if(tr->ntips < tr->mxtips) { if(completeTree) { printBothOpen("Hello this is your friendly RAxML tree parsing routine\n"); printBothOpen("The RAxML option you are uisng requires to read in only complete trees\n"); printBothOpen("with %d taxa, there is at least one tree with %d taxa though ... exiting\n", tr->mxtips, tr->ntips); exit(-1); } else { if(adef->computeDistance) { printBothOpen("Error: pairwise distance computation only allows for complete, i.e., containing all taxa\n"); printBothOpen("bifurcating starting trees\n"); exit(-1); } if(adef->mode == CLASSIFY_ML) { printBothOpen("RAxML classifier Algo: You provided a reference tree with %d taxa; alignmnet has %d taxa\n", tr->ntips, tr->mxtips); printBothOpen("%d query taxa will be classifed under ML\n", tr->mxtips - tr->ntips); classifyML(tr, adef); } else { printBothOpen("You provided an incomplete starting tree %d alignmnet has %d taxa\n", tr->ntips, tr->mxtips); makeParsimonyTreeIncomplete(tr, adef); } } } else { if(adef->mode == PARSIMONY_ADDITION) { printBothOpen("Error you want to add sequences to a trees via MP stepwise addition, but \n"); printBothOpen("you have provided an input tree that already contains all taxa\n"); exit(-1); } if(adef->mode == CLASSIFY_ML) { printBothOpen("Error you want to classify query sequences into a tree via ML, but \n"); printBothOpen("you have provided an input tree that already contains all taxa\n"); exit(-1); } } onlyInitrav(tr, tr->start); } return lcount; }
/*
 * Parse one tree from `fp` into the node array of `tr`.
 *
 * fp                 input stream positioned before the tree's opening '('
 * tr                 tree whose nodep[] entries are reset and re-linked;
 *                    tr->rooted and tr->wasRooted record whether the input
 *                    was in rooted format
 * readBranches       forwarded to addElementLen() and uprootTree(); the
 *                    permitted combinations with readNodeLabels depend on
 *                    adef->mode and are asserted when topologyOnly is FALSE
 * readNodeLabels     forwarded to addElementLen(); not allowed for rooted input
 * topologyOnly       when set, support values are initialised (-1 for tips,
 *                    -2 for inner nodes); the completeness checks and final
 *                    traversal are then only run in CLASSIFY_MP mode
 * adef               analysis settings (mode, computeDistance, ...)
 * completeTree       when set, a tree with fewer than tr->mxtips taxa is fatal
 * storeBranchLabels  forwarded to addElementLen()
 *
 * Returns lcount, the counter that is handed by address to addElementLen().
 * Malformed input ends the program via assert(0) or exit().
 */
int treeReadLen (FILE *fp, tree *tr, boolean readBranches, boolean readNodeLabels, boolean topologyOnly, analdef *adef, boolean completeTree, boolean storeBranchLabels)
{
  nodeptr
    p;

  int
    i,
    ch,
    lcount = 0;

  tr->branchLabelCounter = 0;

  /* detach all tips */
  for (i = 1; i <= tr->mxtips; i++)
    {
      tr->nodep[i]->back = (node *) NULL;
      if(topologyOnly)
        tr->nodep[i]->support = -1;
    }

  /* detach all three records of every inner node and restore their numbers */
  for(i = tr->mxtips + 1; i < 2 * tr->mxtips; i++)
    {
      tr->nodep[i]->back = (nodeptr)NULL;
      tr->nodep[i]->next->back = (nodeptr)NULL;
      tr->nodep[i]->next->next->back = (nodeptr)NULL;
      tr->nodep[i]->number = i;
      tr->nodep[i]->next->number = i;
      tr->nodep[i]->next->next->number = i;

      if(topologyOnly)
        {
          tr->nodep[i]->support = -2;
          tr->nodep[i]->next->support = -2;
          tr->nodep[i]->next->next->support = -2;
        }
    }

  if(topologyOnly)
    tr->start = tr->nodep[tr->mxtips];
  else
    tr->start = tr->nodep[1];

  tr->ntips = 0;
  tr->nextnode = tr->mxtips + 1;

  for(i = 0; i < tr->numBranches; i++)
    tr->partitionSmoothed[i] = FALSE;

  tr->rooted = FALSE;
  tr->wasRooted = FALSE;

  p = tr->nodep[(tr->nextnode)++];

  /* Skip ahead to the opening parenthesis. treeGetCh() can hand back EOF
     (see the ungetc guard further down), so stop there as well instead of
     spinning forever on a stream that contains no '('. */
  do
    ch = treeGetCh(fp);
  while(ch != '(' && ch != EOF);

  if(ch == EOF)
    {
      printBothOpen("Error: reached the end of the tree file without finding an opening '('\n");
      exit(-1);
    }

  /* which of readBranches/readNodeLabels may be set depends on the mode */
  if(!topologyOnly)
    {
      if(adef->mode != CLASSIFY_ML)
        {
          if(adef->mode != OPTIMIZE_BR_LEN_SCALER)
            assert(readBranches == FALSE && readNodeLabels == FALSE);
          else
            assert(readBranches == TRUE && readNodeLabels == FALSE);
        }
      else
        {
          if(adef->useBinaryModelFile)
            assert(readBranches == TRUE && readNodeLabels == FALSE);
          else
            assert(readBranches == FALSE && readNodeLabels == FALSE);
        }
    }

  /* first two subtrees of the outermost group */
  if (! addElementLen(fp, tr, p, readBranches, readNodeLabels, &lcount, adef, storeBranchLabels))
    assert(0);
  if (! treeNeedCh(fp, ',', "in"))
    assert(0);
  if (! addElementLen(fp, tr, p->next, readBranches, readNodeLabels, &lcount, adef, storeBranchLabels))
    assert(0);

  if (! tr->rooted)
    {
      if ((ch = treeGetCh(fp)) == ',')
        {
          /* third subtree: unrooted format */
          if (! addElementLen(fp, tr, p->next->next, readBranches, readNodeLabels, &lcount, adef, storeBranchLabels))
            assert(0);
        }
      else
        {
          /*  A rooted format */
          tr->rooted = TRUE;
          tr->wasRooted = TRUE;
          if (ch != EOF)
            (void) ungetc(ch, fp);
        }
    }
  else
    {
      p->next->next->back = (nodeptr) NULL;
      tr->wasRooted = TRUE;
    }

  if(!tr->rooted && adef->mode == ANCESTRAL_STATES)
    {
      printf("Error: The ancestral state computation mode requires a rooted tree as input, exiting ....\n");
      /* NOTE(review): this error path exits with status 0, unlike the other
         fatal paths in this function which use -1; left as is in case
         callers/scripts rely on it */
      exit(0);
    }

  if (! treeNeedCh(fp, ')', "in"))
    assert(0);

  if(topologyOnly)
    assert(!(tr->rooted && readNodeLabels));

  (void) treeFlushLabel(fp);

  if (! treeFlushLen(fp, tr))
    assert(0);

  if (! treeNeedCh(fp, ';', "at end of"))
    assert(0);

  if (tr->rooted)
    {
      assert(!readNodeLabels);

      p->next->next->back = (nodeptr) NULL;
      tr->start = uprootTree(tr, p->next->next, readBranches, FALSE);

      /*tr->leftRootNode = p->back;
        tr->rightRootNode = p->next->back;
      */

      if (! tr->start)
        {
          printf("FATAL ERROR UPROOTING TREE\n");
          assert(0);
        }
    }
  else
    tr->start = findAnyTip(p, tr->rdta->numsp);

  if(!topologyOnly || adef->mode == CLASSIFY_MP)
    {
      assert(tr->ntips <= tr->mxtips);

      if(tr->ntips < tr->mxtips)
        {
          /* the tree holds fewer taxa than the alignment */
          if(completeTree)
            {
              printBothOpen("Hello this is your friendly RAxML tree parsing routine\n");
              printBothOpen("The RAxML option you are uisng requires to read in only complete trees\n");
              printBothOpen("with %d taxa, there is at least one tree with %d taxa though ... exiting\n", tr->mxtips, tr->ntips);
              exit(-1);
            }
          else
            {
              if(adef->computeDistance)
                {
                  printBothOpen("Error: pairwise distance computation only allows for complete, i.e., containing all taxa\n");
                  printBothOpen("bifurcating starting trees\n");
                  exit(-1);
                }
              if(adef->mode == CLASSIFY_ML || adef->mode == CLASSIFY_MP)
                {
                  printBothOpen("RAxML placement algorithm: You provided a reference tree with %d taxa; alignmnet has %d taxa\n", tr->ntips, tr->mxtips);
                  printBothOpen("%d query taxa will be placed using %s\n", tr->mxtips - tr->ntips, (adef->mode == CLASSIFY_ML)?"maximum likelihood":"parsimony");
                  if(adef->mode == CLASSIFY_ML)
                    classifyML(tr, adef);
                  else
                    {
                      assert(adef->mode == CLASSIFY_MP);
                      classifyMP(tr, adef);
                    }
                }
              else
                {
                  printBothOpen("You provided an incomplete starting tree %d alignmnet has %d taxa\n", tr->ntips, tr->mxtips);
                  makeParsimonyTreeIncomplete(tr, adef);
                }
            }
        }
      else
        {
          /* the tree is complete: modes that need missing taxa cannot run */
          if(adef->mode == PARSIMONY_ADDITION)
            {
              printBothOpen("Error you want to add sequences to a trees via MP stepwise addition, but \n");
              printBothOpen("you have provided an input tree that already contains all taxa\n");
              exit(-1);
            }
          if(adef->mode == CLASSIFY_ML || adef->mode == CLASSIFY_MP)
            {
              /* the format has a single %s; an int used to be passed ahead of
                 the string, which is undefined behaviour for %s */
              printBothOpen("Error you want to place query sequences into a tree using %s, but\n", (adef->mode == CLASSIFY_ML)?"maximum likelihood":"parsimony");
              printBothOpen("you have provided an input tree that already contains all taxa\n");
              exit(-1);
            }
        }

      onlyInitrav(tr, tr->start);
    }

  return lcount;
}
/*
 * Pick one MCMC proposal at random, apply it to `instate`, and return its
 * label. A single uniform draw u = rand()/RAND_MAX selects the move:
 *
 *   u <= 0.1           stNNI (if it fails, a radius-1 SPR through
 *                      simpleNodeProposal is tried instead)
 *   0.1  <  u < 0.15   SPR with maxradius 1
 *   0.15 <= u < 0.2    SPR with maxradius 2
 *   0.2  <= u < 0.25   UPDATE_MODEL
 *   0.25 <= u < 0.95   UPDATE_ALL_BL
 *   u >= 0.95          UPDATE_GAMMA
 *
 * Before returning, instate->newprior is set to instate->bl_prior.
 */
static prop proposal(state * instate)
{
  const double
    draw = (double)rand()/(double)RAND_MAX;

  boolean
    proposalSuccess;

  prop
    proposal_type;

  if(draw >= 0.95)
    {
      /* gamma shape update */
      proposal_type = UPDATE_GAMMA;
      simpleGammaProposal(instate);
    }
  else if(draw >= 0.25)
    {
      /* update of all branch lengths */
      proposal_type = UPDATE_ALL_BL;
      instate->bl_prior = 0;
      assert(proposal_type == UPDATE_ALL_BL);
      proposalSuccess = simpleBranchLengthProposal(instate);
      assert(instate->tr->startLH != instate->tr->likelihood);
      assert(proposalSuccess);
    }
  else if(draw >= 0.2)
    {
      /* substitution model update */
      proposal_type = UPDATE_MODEL;
      simpleModelProposal(instate);
    }
  else
    {
      /* topological move */
      if(draw > 0.1)
        {
          /* SPR; the rearrangement radius depends on the same draw */
          proposal_type = SPR;
          instate->maxradius = (draw < 0.15) ? 1 : 2;
          doSPR(instate->tr, instate);
          proposalSuccess = TRUE; /* TODO */
          /*proposalSuccess = simpleNodeProposal(instate);*/
        }
      else
        {
          proposal_type = stNNI;
          proposalSuccess = stNNIproposal(instate);

          if(proposalSuccess == FALSE)
            {
              /* TODOFER this came up with ds 20 and GTRPSR, see why */
              printBothOpen("WARNING!! stNNI proposal failed, doing SPR\n");
              proposal_type = SPR;
              instate->maxradius = 1;
              proposalSuccess = simpleNodeProposal(instate);
            }
        }

      if(proposalSuccess == FALSE)
        {
          /* this should either never happen, or the caller has to be told
             that the proposal failed so it can react accordingly */
          assert(FALSE);
        }
      else if(proposal_type != stNNI) /*TODOFER delete this when bl are changed*/
        {
          /* a move has been made; the previous state is kept in instate */
          assert(instate->tr->startLH != instate->tr->likelihood);
        }
    }

  /* record the current prior */
  instate->newprior = instate->bl_prior;

  return proposal_type;
}
/*
 * Run adef->multipleRuns independent tree searches, re-score the resulting
 * trees, thoroughly optimise the best one, write the result files, and
 * terminate the program with exit(0). This function does not return.
 *
 * Steps visible in the body:
 *   1. one search per run (initModel, getStartingTree, computeBIGRAPID); the
 *      tree is stored in a topolRELL_LIST unless tr->catOnly is set
 *   2. unless tr->catOnly: model optimisation on the best tree and
 *      re-evaluation of all other trees (directly for GAMMA / GAMMA_I, after
 *      catToGamma() otherwise)
 *   3. treeOptimizeThorough() on the best tree, written to
 *      <workdir>RAxML_bestTree.<run_id>
 *   4. if adef->rellBootstrap: the RELL bootstrap trees are written
 *
 * With _WAYNE_MPI the runs are divided among `processes` ranks (n is rounded
 * up to a multiple of `processes`); each rank handles the global tree indices
 * i + n * processID, and only the rank holding the best likelihood performs
 * step 3 and prints the final summary.
 */
void doInference(tree *tr, analdef *adef, rawdata *rdta, cruncheddata *cdta)
{
  int
    i,
    n;

#ifdef _WAYNE_MPI
  int
    j,
    /* Must start out defined: it is only reassigned when processes > 1 and
       never on the catOnly path, yet it is compared against processID
       unconditionally below. Rank 0 is the only possible owner in both of
       those cases. */
    bestProcess = 0;
#endif

  double
    loopTime;

  topolRELL_LIST
    *rl = (topolRELL_LIST *)NULL;

  int
    best = -1,
    newBest = -1;

  double
    bestLH = unlikely;

  FILE
    *f;

  char
    bestTreeFileName[1024];

  double
    overallTime;

  n = adef->multipleRuns;

#ifdef _WAYNE_MPI
  /* round the number of runs up to a multiple of the number of ranks */
  if(n % processes != 0)
    n = processes * ((n / processes) + 1);
#endif

  if(!tr->catOnly)
    {
      rl = (topolRELL_LIST *)rax_malloc(sizeof(topolRELL_LIST));
      initTL(rl, tr, n);
    }

#ifdef _WAYNE_MPI
  long
    parsimonySeed0 = adef->parsimonySeed;

  n = n / processes;
#endif

  if(adef->rellBootstrap)
    {
#ifdef _WAYNE_MPI
      tr->resample = permutationSH(tr, NUM_RELL_BOOTSTRAPS, parsimonySeed0 + 10000 * processID);
#else
      tr->resample = permutationSH(tr, NUM_RELL_BOOTSTRAPS, adef->parsimonySeed);
#endif

      tr->rellTrees = (treeList *)rax_malloc(sizeof(treeList));
      initTreeList(tr->rellTrees, tr, NUM_RELL_BOOTSTRAPS);
    }
  else
    {
      tr->resample = (int *)NULL;
      tr->rellTrees = (treeList *)NULL;
    }

  /* step 1: the individual searches */
  for(i = 0; i < n; i++)
    {
#ifdef _WAYNE_MPI
      if(i == 0)
        {
          /* give every rank its own seed, derived from the user's seed */
          if(parsimonySeed0 != 0)
            adef->parsimonySeed = parsimonySeed0 + 10000 * processID;
        }

      j = i + n * processID;
      tr->treeID = j;
#else
      tr->treeID = i;
#endif

      tr->checkPointCounter = 0;

      loopTime = gettime();

      initModel(tr, rdta, cdta, adef);

      if(i == 0)
        printBaseFrequencies(tr);

      getStartingTree(tr, adef);

      computeBIGRAPID(tr, adef, TRUE);

#ifdef _WAYNE_MPI
      if(tr->likelihood > bestLH)
        {
          best = j;
          bestLH = tr->likelihood;
        }

      if(!tr->catOnly)
        saveTL(rl, tr, j);
#else
      if(tr->likelihood > bestLH)
        {
          best = i;
          bestLH = tr->likelihood;
        }

      if(!tr->catOnly)
        saveTL(rl, tr, i);
#endif

      loopTime = gettime() - loopTime;
      writeInfoFile(adef, tr, loopTime);
    }

  assert(best >= 0);

#ifdef _WAYNE_MPI
  MPI_Barrier(MPI_COMM_WORLD);
  n = n * processes;
#endif

  if(tr->catOnly)
    {
      printBothOpenMPI("\n\nNOT conducting any final model optimizations on all %d trees under CAT-based model ....\n", n);
      /* this format string has no conversions; the stray argument that used
         to be passed along with it has been dropped */
      printBothOpenMPI("\nREMEMBER that CAT-based likelihood scores are meaningless!\n\n");

#ifdef _WAYNE_MPI
      if(processID != 0)
        {
          MPI_Finalize();
          exit(0);
        }
#endif
    }
  else
    {
      printBothOpenMPI("\n\nConducting final model optimizations on all %d trees under GAMMA-based models ....\n\n", n);

#ifdef _WAYNE_MPI
      n = n / processes;
#endif

      /* step 2: re-score all trees */
      if(tr->rateHetModel == GAMMA || tr->rateHetModel == GAMMA_I)
        {
          restoreTL(rl, tr, best);
          evaluateGenericInitrav(tr, tr->start);

          if(!adef->useBinaryModelFile)
            modOpt(tr, adef, FALSE, adef->likelihoodEpsilon);
          else
            {
              readBinaryModel(tr, adef);
              evaluateGenericInitrav(tr, tr->start);
              treeEvaluate(tr, 2);
            }

          bestLH = tr->likelihood;
          tr->likelihoods[best] = tr->likelihood;
          saveTL(rl, tr, best);
          tr->treeID = best;
          printResult(tr, adef, TRUE);
          newBest = best;

          for(i = 0; i < n; i++)
            {
#ifdef _WAYNE_MPI
              j = i + n * processID;

              if(j != best)
                {
                  restoreTL(rl, tr, j);
                  evaluateGenericInitrav(tr, tr->start);
                  treeEvaluate(tr, 1);
                  tr->likelihoods[j] = tr->likelihood;

                  if(tr->likelihood > bestLH)
                    {
                      newBest = j;
                      bestLH = tr->likelihood;
                      saveTL(rl, tr, j);
                    }

                  tr->treeID = j;
                  printResult(tr, adef, TRUE);
                }

              if(n == 1 && processes == 1)
                printBothOpen("Inference[%d] final GAMMA-based Likelihood: %f tree written to file %s\n", i, tr->likelihoods[i], resultFileName);
              else
                printBothOpen("Inference[%d] final GAMMA-based Likelihood: %f tree written to file %s.RUN.%d\n", j, tr->likelihoods[j], resultFileName, j);
#else
              if(i != best)
                {
                  restoreTL(rl, tr, i);
                  evaluateGenericInitrav(tr, tr->start);
                  treeEvaluate(tr, 1);
                  tr->likelihoods[i] = tr->likelihood;

                  if(tr->likelihood > bestLH)
                    {
                      newBest = i;
                      bestLH = tr->likelihood;
                      saveTL(rl, tr, i);
                    }

                  tr->treeID = i;
                  printResult(tr, adef, TRUE);
                }

              if(n == 1)
                printBothOpen("Inference[%d] final GAMMA-based Likelihood: %f tree written to file %s\n", i, tr->likelihoods[i], resultFileName);
              else
                printBothOpen("Inference[%d] final GAMMA-based Likelihood: %f tree written to file %s.RUN.%d\n", i, tr->likelihoods[i], resultFileName, i);
#endif
            }
        }
      else
        {
          catToGamma(tr, adef);

          /* reset the stored likelihoods of this rank's trees */
#ifdef _WAYNE_MPI
          for(i = 0; i < n; i++)
            {
              j = i + n*processID;
              rl->t[j]->likelihood = unlikely;
            }
#else
          for(i = 0; i < n; i++)
            rl->t[i]->likelihood = unlikely;
#endif

          initModel(tr, rdta, cdta, adef);

          restoreTL(rl, tr, best);

          resetBranches(tr);
          evaluateGenericInitrav(tr, tr->start);
          modOpt(tr, adef, TRUE, adef->likelihoodEpsilon);
          tr->likelihoods[best] = tr->likelihood;
          bestLH = tr->likelihood;
          saveTL(rl, tr, best);
          tr->treeID = best;
          printResult(tr, adef, TRUE);
          newBest = best;

          for(i = 0; i < n; i++)
            {
#ifdef _WAYNE_MPI
              j = i + n*processID;

              if(j != best)
                {
                  restoreTL(rl, tr, j);
                  resetBranches(tr);
                  evaluateGenericInitrav(tr, tr->start);
                  treeEvaluate(tr, 2);
                  tr->likelihoods[j] = tr->likelihood;

                  if(tr->likelihood > bestLH)
                    {
                      newBest = j;
                      bestLH = tr->likelihood;
                      saveTL(rl, tr, j);
                    }

                  tr->treeID = j;
                  printResult(tr, adef, TRUE);
                }

              if(n == 1 && processes == 1)
                printBothOpen("Inference[%d] final GAMMA-based Likelihood: %f tree written to file %s\n", i, tr->likelihoods[i], resultFileName);
              else
                printBothOpen("Inference[%d] final GAMMA-based Likelihood: %f tree written to file %s.RUN.%d\n", j, tr->likelihoods[j], resultFileName, j);
#else
              if(i != best)
                {
                  restoreTL(rl, tr, i);
                  resetBranches(tr);
                  evaluateGenericInitrav(tr, tr->start);
                  treeEvaluate(tr, 2);
                  tr->likelihoods[i] = tr->likelihood;

                  if(tr->likelihood > bestLH)
                    {
                      newBest = i;
                      bestLH = tr->likelihood;
                      saveTL(rl, tr, i);
                    }

                  tr->treeID = i;
                  printResult(tr, adef, TRUE);
                }

              if(n == 1)
                printBothOpen("Inference[%d] final GAMMA-based Likelihood: %f tree written to file %s\n", i, tr->likelihoods[i], resultFileName);
              else
                printBothOpen("Inference[%d] final GAMMA-based Likelihood: %f tree written to file %s.RUN.%d\n", i, tr->likelihoods[i], resultFileName, i);
#endif
            }
        }

      assert(newBest >= 0);

#ifdef _WAYNE_MPI
      if(processes > 1)
        {
          /* every rank broadcasts its best likelihood; the rank with the
             overall maximum becomes bestProcess */
          double
            *buffer = (double *)rax_malloc(sizeof(double) * processes);

          for(i = 0; i < processes; i++)
            buffer[i] = unlikely;

          buffer[processID] = bestLH;

          for(i = 0; i < processes; i++)
            MPI_Bcast(&buffer[i], 1, MPI_DOUBLE, i, MPI_COMM_WORLD);

          bestLH = buffer[0];
          bestProcess = 0;

          for(i = 1; i < processes; i++)
            if(buffer[i] > bestLH)
              {
                bestLH = buffer[i];
                bestProcess = i;
              }

          rax_free(buffer);
        }

      if(processID == bestProcess)
        {
#endif
          /* step 3: thorough optimisation of the best tree */
          restoreTL(rl, tr, newBest);
          evaluateGenericInitrav(tr, tr->start);

          printBothOpen("\n\nStarting final GAMMA-based thorough Optimization on tree %d likelihood %f .... \n\n", newBest, tr->likelihoods[newBest]);

          Thorough = 1;
          tr->doCutoff = FALSE;
          treeOptimizeThorough(tr, 1, 10);
          evaluateGenericInitrav(tr, tr->start);

          printBothOpen("Final GAMMA-based Score of best tree %f\n\n", tr->likelihood);

          {
            /* bounded replacement for the former strcpy/strcat chain, which
               could overrun the 1024-byte buffer */
            int
              nameLen = snprintf(bestTreeFileName, sizeof(bestTreeFileName), "%sRAxML_bestTree.%s", workdir, run_id);

            assert(nameLen >= 0 && (size_t)nameLen < sizeof(bestTreeFileName));
            (void)nameLen;
          }

          Tree2String(tr->tree_string, tr, tr->start->back, TRUE, TRUE, FALSE, FALSE, TRUE, adef, SUMMARIZE_LH, FALSE, FALSE, FALSE, FALSE);

          f = myfopen(bestTreeFileName, "wb");
          fprintf(f, "%s", tr->tree_string);
          fclose(f);

          if(adef->perGeneBranchLengths)
            printTreePerGene(tr, adef, bestTreeFileName, "w");
#ifdef _WAYNE_MPI
        }
#endif
    }

  if(adef->rellBootstrap)
    {
      //WARNING the functions below need to be invoked after all other trees have been printed
      //don't move this part of the code further up!

      int
        i;

#ifdef _WAYNE_MPI
      FILE
        *f = myfopen(rellBootstrapFileNamePID, "wb");
#else
      FILE
        *f = myfopen(rellBootstrapFileName, "wb");
#endif

      for(i = 0; i < NUM_RELL_BOOTSTRAPS; i++)
        {
          restoreTreeList(tr->rellTrees, tr, i);
          Tree2String(tr->tree_string, tr, tr->start->back, FALSE, TRUE, FALSE, FALSE, TRUE, adef, SUMMARIZE_LH, FALSE, FALSE, FALSE, FALSE);
          fprintf(f, "%s", tr->tree_string);
        }

      freeTreeList(tr->rellTrees);
      rax_free(tr->rellTrees);
      rax_free(tr->resample);

      fclose(f);

#ifdef _WAYNE_MPI
      /* every rank wrote its own file; merge them once all are done */
      MPI_Barrier(MPI_COMM_WORLD);

      concatenateBSFiles(processes, rellBootstrapFileName);
      removeBSFiles(processes, rellBootstrapFileName);

      MPI_Barrier(MPI_COMM_WORLD);

      if(processID == 0)
        printBothOpen("\nRELL bootstraps written to file %s\n", rellBootstrapFileName);
#else
      printBothOpen("\nRELL bootstraps written to file %s\n", rellBootstrapFileName);
#endif
    }

#ifdef _WAYNE_MPI
  if(processID == bestProcess)
    {
#endif
      overallTime = gettime() - masterTime;
      printBothOpen("Program execution info written to %s\n", infoFileName);

      if(!tr->catOnly)
        {
          printBothOpen("Best-scoring ML tree written to: %s\n\n", bestTreeFileName);

          if(adef->perGeneBranchLengths && tr->NumberOfModels > 1)
            printBothOpen("Per-Partition branch lengths of best-scoring ML tree written to %s.PARTITION.0 to  %s.PARTITION.%d\n\n", bestTreeFileName, bestTreeFileName, tr->NumberOfModels - 1);
        }

      printBothOpen("Overall execution time: %f secs or %f hours or %f days\n\n", overallTime, overallTime/3600.0, overallTime/86400.0);
#ifdef _WAYNE_MPI
    }
#endif

  if(!tr->catOnly)
    {
      freeTL(rl);
      rax_free(rl);
    }

#ifdef _WAYNE_MPI
  MPI_Finalize();
#endif

  exit(0);
}
/*
 * Fraction of accepted proposals among all proposals of one kind.
 * Returns 0.0 when no proposal of that kind was tried, so the summary line
 * never divides by zero.
 */
static double mcmcAcceptanceRatio(int accepted, int rejected)
{
  const int tried = accepted + rejected;

  return (tried > 0) ? ((double)accepted / (double)tried) : 0.0;
}

/*
 * Runs a fixed-length chain of num_moves (10000) proposals on the tree.
 *
 * Each iteration calls proposal() on the chain state, draws a uniform random
 * number in [0,1] and accepts the proposal when that number is below
 *
 *   min(1, hastings * exp(newprior - curprior) * exp(likelihood - startLH)).
 *
 * On acceptance tr->startLH and the current prior are advanced to the new
 * values; otherwise resetState() is called for the rejected proposal and the
 * likelihood is re-evaluated to verify that it matches tr->startLH again.
 * Every 50th iteration the chain state is written via printStateFile().
 * Acceptance/rejection counters and timings are reported at the end.
 *
 * tr   - tree to operate on; tr->likelihood is read as the starting likelihood
 * adef - analysis definition, handed on to state_init()
 */
void mcmc(tree *tr, analdef *adef)
{
  tr->startLH = tr->likelihood;
  printBothOpen("start minimalistic search with LH %f\n", tr->likelihood);
  printBothOpen("tr LH %f, startLH %f\n", tr->likelihood, tr->startLH);

  int j;
  int maxradius = 30;
  int accepted_spr = 0, accepted_nni = 0, accepted_bl = 0, accepted_model = 0, accepted_gamma = 0, inserts = 0;
  int rejected_spr = 0, rejected_nni = 0, rejected_bl = 0, rejected_model = 0, rejected_gamma = 0;
  int num_moves = 10000;
  prop which_proposal;
  double testr;
  double acceptance;

  /* fixed seed for rand(), which supplies the acceptance draws below */
  srand(440);

  double proposalTime = 0.0, blTime = 0.0, printTime = 0.0;
  double t_start = gettime();
  double t;

  /* allocate states */
  double bl_prior_exp_lambda = 0.1;
  double bl_sliding_window_w = 0.005;
  double gm_sliding_window_w = 0.75;
  double rt_sliding_window_w = 0.5;

  state *curstate = state_init(tr, adef, maxradius, bl_sliding_window_w, rt_sliding_window_w, gm_sliding_window_w, bl_prior_exp_lambda);

  printStateFileHeader(curstate);
  set_start_bl(curstate);
  printf("start bl_prior: %f\n", curstate->bl_prior);
  set_start_prior(curstate);
  curstate->hastings = 1; /* needs to be set by the proposal when necessary */

  /* Set the starting LH with a full traversal */
  evaluateGeneric(tr, tr->start, TRUE);
  tr->startLH = tr->likelihood;
  /* the format string has exactly two conversions: pass exactly the two likelihoods */
  printBothOpen("Starting with tr LH %f, startLH %f\n", tr->likelihood, tr->startLH);

  /* Set reasonable model parameters */
  evaluateGeneric(curstate->tr, curstate->tr->start, FALSE); /* just for validation */
  printBothOpen("tr LH before modOpt %f\n", curstate->tr->likelihood);
  printSubsRates(curstate->tr, curstate->model, curstate->numSubsRates);

  /* optimize the model with Brents method for reasonable starting points;
     not by proposal, just using std raxml machinery...
     NOTE(review): elsewhere in this file modOpt is called with four arguments
     (tree, analdef, flag, epsilon) - confirm that this three-argument call
     matches the declared prototype. */
  modOpt(curstate->tr, curstate->adef, 5.0);

  evaluateGeneric(curstate->tr, curstate->tr->start, FALSE); /* just for validation */
  printBothOpen("tr LH after modOpt %f\n", curstate->tr->likelihood);
  printSubsRates(curstate->tr, curstate->model, curstate->numSubsRates);
  recordSubsRates(curstate->tr, curstate->model, curstate->numSubsRates, curstate->curSubsRates);

  /* beginning of the MCMC chain */
  for(j = 0; j < num_moves; j++)
    {
      t = gettime();

      which_proposal = proposal(curstate);

      /* after the very first proposal the current prior is seeded from the
         prior that proposal produced */
      if(j == 0)
        curstate->curprior = curstate->newprior;

      assert(which_proposal == SPR || which_proposal == stNNI ||
             which_proposal == UPDATE_ALL_BL ||
             which_proposal == UPDATE_MODEL || which_proposal == UPDATE_GAMMA);

      proposalTime += gettime() - t;

      /* decide upon acceptance */
      testr = (double)rand() / (double)RAND_MAX;

      acceptance = fmin(1, (curstate->hastings) *
                           (exp(curstate->newprior - curstate->curprior)) *
                           (exp(curstate->tr->likelihood - curstate->tr->startLH)));

      if(testr < acceptance)
        {
          switch(which_proposal)
            {
            case SPR:
              accepted_spr++;
              break;
            case stNNI:
              printBothOpen("NNI new topology , iter %d tr LH %f, startLH %f\n", j, tr->likelihood, tr->startLH);
              accepted_nni++;
              break;
            case UPDATE_ALL_BL:
              accepted_bl++;
              break;
            case UPDATE_MODEL:
              accepted_model++;
              break;
            case UPDATE_GAMMA:
              accepted_gamma++;
              break;
            default:
              assert(0);
            }

          curstate->tr->startLH = curstate->tr->likelihood; /* new LH */
          curstate->curprior = curstate->newprior;
        }
      else
        {
          resetState(which_proposal, curstate);

          switch(which_proposal)
            {
            case SPR:
              rejected_spr++;
              break;
            case stNNI:
              rejected_nni++;
              break;
            case UPDATE_ALL_BL:
              rejected_bl++;
              break;
            case UPDATE_MODEL:
              rejected_model++;
              break;
            case UPDATE_GAMMA:
              rejected_gamma++;
              break;
            default:
              assert(0);
            }

          /* validation: after the reset the likelihood has to equal startLH */
          evaluateGeneric(tr, tr->start, FALSE);

          if(fabs(curstate->tr->startLH - tr->likelihood) > 1.0E-10)
            printBothOpen("WARNING: LH diff %.10f\n", curstate->tr->startLH - tr->likelihood);

          assert(fabs(curstate->tr->startLH - tr->likelihood) < 1.0E-10);
        }

      inserts++;

      /* need to print status */
      if(j % 50 == 0)
        {
          t = gettime();
          printBothOpen("sampled at iter %d, tr LH %f, startLH %f, prior %f, incr %f\n", j, tr->likelihood, tr->startLH, curstate->curprior, tr->likelihood - tr->startLH);

          /* TODO: print some parameters to a file */
          printStateFile(j, curstate);
          printTime += gettime() - t;
        }
    }

  t = gettime();
  treeEvaluate(tr, 1);
  blTime += gettime() - t;

  printBothOpen("accepted SPR %d, accepted stNNI %d, accepted BL %d, accepted model %d, accepted gamma %d, num moves tried %d, SPRs with max radius %d\n",
                accepted_spr, accepted_nni, accepted_bl, accepted_model, accepted_gamma, num_moves, maxradius);
  printBothOpen("rejected SPR %d, rejected stNNI %d, rejected BL %d, rejected model %d, rejected gamma %d\n",
                rejected_spr, rejected_nni, rejected_bl, rejected_model, rejected_gamma);
  printBothOpen("ratio SPR %f, ratio stNNI %f, ratio BL %f, ratio model %f, ratio gamma %f\n",
                mcmcAcceptanceRatio(accepted_spr, rejected_spr),
                mcmcAcceptanceRatio(accepted_nni, rejected_nni),
                mcmcAcceptanceRatio(accepted_bl, rejected_bl),
                mcmcAcceptanceRatio(accepted_model, rejected_model),
                mcmcAcceptanceRatio(accepted_gamma, rejected_gamma));
  printBothOpen("total %f, BL %f, printing %f, proposal %f\n", gettime() - t_start, blTime, printTime, proposalTime);

  assert(inserts == num_moves);
  state_free(curstate);
}
/* Reports a malformed exclude file and terminates the program. */
static void exitOnExcludeFormatError(void)
{
  printf("exclude file must have format: number-number [number-number]*\n");
  exit(-1);
}

/*
 * Appends one digit to the number currently being parsed.
 * One byte is always kept free for the terminating '\0'; a number that does
 * not fit terminates the program instead of overflowing the buffer.
 */
static void appendExcludeDigit(char *buf, int capacity, int *length, int ch)
{
  if(*length >= capacity - 1)
    {
      printf("exclude file contains a number with more than %d digits\n", capacity - 1);
      exit(-1);
    }

  buf[(*length)++] = (char)ch;
}

/*
 * Reads the exclude file (excludeFileName), which lists column ranges as
 * "number-number [number-number]*", and writes copies of the input files with
 * those columns removed:
 *
 *   - <secondaryStructureFileName>.<excludeFileName>  if adef->useSecondaryStructure
 *     is set and at least one column was excluded
 *   - <modelFileName>.<excludeFileName>               if adef->useMultipleModel
 *     is set and at least one column was excluded
 *   - <seq_file>.<excludeFileName>                    always
 *
 * The program is terminated with exit(-1) when the file is malformed, when a
 * range is invalid (lower > upper, lower == 0, upper > rdta->sites) or when
 * all sites would be excluded. Overlapping ranges only trigger a warning.
 *
 * tr   - tree; the per-site model assignment, partition data, taxon names and
 *        data vector are read from it
 * adef - analysis definition; useSecondaryStructure and useMultipleModel are read
 * rdta - raw alignment; rdta->sites and the sequences rdta->y are read
 */
void handleExcludeFile(tree *tr, analdef *adef, rawdata *rdta)
{
  FILE *f;
  char buf[256];
  int
    ch,
    j,
    value,
    i,
    state = 0,
    numberOfModels = 0,
    l = -1,
    excludeRegion = 0,
    excludedColumns = 0,
    modelCounter = 1;
  int
    *excludeArray,
    *countArray,
    *modelList;
  int
    **partitions;

  printf("\n\n");

  f = myfopen(excludeFileName, "rb");

  /* first pass: each range contains one '-', so this counts the ranges to store */
  while((ch = getc(f)) != EOF)
    {
      if(ch == '-')
        numberOfModels++;
    }

  /* all per-site arrays are indexed 1..rdta->sites, hence sites + 1 entries */
  excludeArray = (int *)malloc(sizeof(int) * (rdta->sites + 1));
  countArray   = (int *)malloc(sizeof(int) * (rdta->sites + 1));
  modelList    = (int *)malloc((rdta->sites + 1) * sizeof(int));

  partitions = (int **)malloc(sizeof(int *) * numberOfModels);

  /* malloc(0) may legitimately return NULL, so partitions is only checked
     when there is at least one range */
  if(!excludeArray || !countArray || !modelList || (numberOfModels > 0 && !partitions))
    {
      printf("Error: out of memory while reading exclude file %s\n", excludeFileName);
      exit(-1);
    }

  for(i = 0; i < numberOfModels; i++)
    {
      partitions[i] = (int *)malloc(sizeof(int) * 2);

      if(!partitions[i])
        {
          printf("Error: out of memory while reading exclude file %s\n", excludeFileName);
          exit(-1);
        }
    }

  rewind(f);

  /* second pass: small state machine
       0: skip white space, expect first digit of the lower bound
       1: remaining digits of the lower bound, terminated by '-'
       2: expect first digit of the upper bound
       3: remaining digits of the upper bound, terminated by white space */
  while((ch = getc(f)) != EOF)
    {
      switch(state)
        {
        case 0: /* get first number */
          if(!whitechar(ch))
            {
              if(!isNum(ch))
                exitOnExcludeFormatError();

              l = 0;
              buf[l++] = ch;
              state = 1;
            }
          break;
        case 1: /* get the number or detect - */
          if(!isNum(ch) && ch != '-')
            exitOnExcludeFormatError();

          if(isNum(ch))
            appendExcludeDigit(buf, (int)sizeof(buf), &l, ch);
          else
            {
              buf[l++] = '\0';
              value = atoi(buf);
              partitions[excludeRegion][0] = value;
              state = 2;
            }
          break;
        case 2: /* get second number */
          if(!isNum(ch))
            exitOnExcludeFormatError();

          l = 0;
          buf[l++] = ch;
          state = 3;
          break;
        case 3: /* continue second number or find end */
          if(!isNum(ch) && !whitechar(ch))
            exitOnExcludeFormatError();

          if(isNum(ch))
            appendExcludeDigit(buf, (int)sizeof(buf), &l, ch);
          else
            {
              buf[l++] = '\0';
              value = atoi(buf);
              partitions[excludeRegion][1] = value;
              excludeRegion++;
              state = 0;
            }
          break;
        default:
          assert(0);
        }
    }

  if(state == 3)
    {
      /* the file ended directly after the last digit of an upper bound */
      buf[l++] = '\0';
      value = atoi(buf);
      partitions[excludeRegion][1] = value;
      excludeRegion++;
    }
  else if(state != 0)
    {
      /* the file ended in the middle of a range ("12" or "12-") */
      exitOnExcludeFormatError();
    }

  assert(excludeRegion == numberOfModels);

  for(i = 0; i <= rdta->sites; i++)
    {
      excludeArray[i] = -1;
      countArray[i] = 0;
      modelList[i] = -1;
    }

  /* validate every range and mark its columns as excluded */
  for(i = 0; i < numberOfModels; i++)
    {
      int lower = partitions[i][0];
      int upper = partitions[i][1];

      if(lower > upper)
        {
          printf("Misspecified exclude region %d\n", i);
          printf("lower bound %d is greater than upper bound %d\n", lower, upper);
          exit(-1);
        }

      if(lower == 0)
        {
          printf("Misspecified exclude region %d\n", i);
          printf("lower bound must be greater than 0\n");
          exit(-1);
        }

      if(upper > rdta->sites)
        {
          printf("Misspecified exclude region %d\n", i);
          printf("upper bound %d must be smaller than %d\n", upper, (rdta->sites + 1));
          exit(-1);
        }

      for(j = lower; j <= upper; j++)
        {
          if(excludeArray[j] != -1)
            printf("WARNING: Exclude regions %d and %d overlap at position %d (already excluded %d times)\n", excludeArray[j], i, j, countArray[j]);

          excludeArray[j] = i;
          countArray[j] = countArray[j] + 1;
        }
    }

  /* modelList[1..] receives the model of every remaining column, in order,
     i.e. it is indexed by the column number after exclusion */
  for(i = 1; i <= rdta->sites; i++)
    {
      if(excludeArray[i] != -1)
        excludedColumns++;
      else
        {
          modelList[modelCounter] = tr->model[i];
          modelCounter++;
        }
    }

  printf("You have excluded %d out of %d columns\n", excludedColumns, rdta->sites);

  if(excludedColumns == rdta->sites)
    {
      printf("Error: You have excluded all sites\n");
      exit(-1);
    }

  if(adef->useSecondaryStructure && (excludedColumns > 0))
    {
      char mfn[2048];
      int countColumns;
      FILE *newFile;

      assert(adef->useMultipleModel);

      /* bounded replacement for strcpy/strcat: always NUL-terminated */
      snprintf(mfn, sizeof(mfn), "%s.%s", secondaryStructureFileName, excludeFileName);

      newFile = myfopen(mfn, "wb");

      printBothOpen("\nA secondary structure file with analogous structure assignments for non-excluded columns is printed to file %s\n", mfn);

      for(i = 1, countColumns = 0; i <= rdta->sites; i++)
        {
          if(excludeArray[i] == -1)
            fprintf(newFile, "%c", tr->secondaryStructureInput[i - 1]);
          else
            countColumns++;
        }

      assert(countColumns == excludedColumns);

      fprintf(newFile, "\n");

      fclose(newFile);
    }

  if(adef->useMultipleModel && (excludedColumns > 0))
    {
      char mfn[2048];
      FILE *newFile;

      snprintf(mfn, sizeof(mfn), "%s.%s", modelFileName, excludeFileName);

      newFile = myfopen(mfn, "wb");

      printf("\nA partition file with analogous model assignments for non-excluded columns is printed to file %s\n", mfn);

      for(i = 0; i < tr->NumberOfModels; i++)
        {
          boolean modelStillExists = FALSE;

          for(j = 1; (j <= rdta->sites) && (!modelStillExists); j++)
            {
              if(modelList[j] == i)
                modelStillExists = TRUE;
            }

          if(modelStillExists)
            {
              int k = 1;
              int lower, upper;
              int parts = 0;

              switch(tr->partitionData[i].dataType)
                {
                case AA_DATA:
                  {
                    char AAmodel[1024];

                    strcpy(AAmodel, protModels[tr->partitionData[i].protModels]);
                    if(tr->partitionData[i].protFreqs)
                      strcat(AAmodel, "F");

                    fprintf(newFile, "%s, ", AAmodel);
                  }
                  break;
                case DNA_DATA:
                  fprintf(newFile, "DNA, ");
                  break;
                case BINARY_DATA:
                  fprintf(newFile, "BIN, ");
                  break;
                case GENERIC_32:
                  fprintf(newFile, "MULTI, ");
                  break;
                case GENERIC_64:
                  fprintf(newFile, "CODON, ");
                  break;
                default:
                  assert(0);
                }

              fprintf(newFile, "%s = ", tr->partitionData[i].partitionName);

              /* emit the maximal runs of consecutive columns assigned to model i */
              while(k <= rdta->sites)
                {
                  if(modelList[k] == i)
                    {
                      lower = k;

                      /* bounds check first: modelList only has entries 0..sites */
                      while((k < rdta->sites) && (modelList[k + 1] == i))
                        k++;

                      upper = k;

                      if(lower == upper)
                        {
                          if(parts == 0)
                            fprintf(newFile, "%d", lower);
                          else
                            fprintf(newFile, ",%d", lower);
                        }
                      else
                        {
                          if(parts == 0)
                            fprintf(newFile, "%d-%d", lower, upper);
                          else
                            fprintf(newFile, ",%d-%d", lower, upper);
                        }

                      parts++;
                    }

                  k++;
                }

              fprintf(newFile, "\n");
            }
        }

      fclose(newFile);
    }

  {
    FILE *newFile;
    char mfn[2048];

    snprintf(mfn, sizeof(mfn), "%s.%s", seq_file, excludeFileName);

    newFile = myfopen(mfn, "wb");

    printf("\nAn alignment file with excluded columns is printed to file %s\n\n\n", mfn);

    fprintf(newFile, "%d %d\n", tr->mxtips, rdta->sites - excludedColumns);

    for(i = 1; i <= tr->mxtips; i++)
      {
        unsigned char *tipI = &(rdta->y[i][1]);

        fprintf(newFile, "%s ", tr->nameList[i]);

        for(j = 0; j < rdta->sites; j++)
          {
            if(excludeArray[j + 1] == -1)
              fprintf(newFile, "%c", getInverseMeaning(tr->dataVector[j + 1], tipI[j]));
          }

        fprintf(newFile, "\n");
      }

    fclose(newFile);
  }

  fclose(f);

  for(i = 0; i < numberOfModels; i++)
    free(partitions[i]);
  free(partitions);

  free(excludeArray);
  free(countArray);
  free(modelList);
}
/*
 * Computes the relative Robinson-Foulds (RF) distance between one large
 * reference tree (read from tree_file) and every small tree in bootStrapFile.
 *
 * The reference tree is preprocessed once (preorder labels, Euler tour, RMQ
 * structure for lowest-common-ancestor queries). For every small tree that
 * yields at least one split, the subtree of the reference tree induced by the
 * small tree's taxa is reconstructed, its bipartitions are compared with those
 * of the small tree, and the relative RF distance is written as
 * "<tree index> <distance>" to RAxML_RF-Distances.<run_id> in workdir.
 * Small trees for which readMultifurcatingTree() returns no splits are skipped.
 * Summary statistics and timings are printed at the end.
 *
 * tr   - receives the parsed reference tree
 * adef - analysis definition; adef->mode must be PLAUSIBILITY_CHECKER
 */
void plausibilityChecker(tree *tr, analdef *adef)
{
  FILE
    *treeFile,
    *rfFile;

  tree
    *smallTree = (tree *)rax_malloc(sizeof(tree));

  char
    rfFileName[1024];

  int
    numberOfTreesAnalyzed = 0,
    i;

  double
    avgRF = 0.0,
    sumEffectivetime = 0.0;

  /* set up an output file name */

  strcpy(rfFileName, workdir);
  strcat(rfFileName, "RAxML_RF-Distances.");
  strcat(rfFileName, run_id);

  rfFile = myfopen(rfFileName, "wb");

  assert(adef->mode == PLAUSIBILITY_CHECKER);

  /* open the big reference tree file and parse it */

  treeFile = myfopen(tree_file, "r");

  printBothOpen("Parsing reference tree %s\n", tree_file);

  treeReadLen(treeFile, tr, FALSE, TRUE, TRUE, adef, TRUE, FALSE);

  assert(tr->mxtips == tr->ntips);

  /*************************************************************************************/
  /* Preprocessing Step */

  double preprocesstime = gettime();

  /* taxonToLabel[2*tr->mxtips - 2];
     Array storing all 2n-2 labels from the preorder traversal:
     (Taxonnumber - 1) -> (Preorderlabel) */
  int *taxonToLabel = (int *)rax_malloc((2*tr->mxtips - 2) * sizeof(int)),

    /* taxonHasDeg[2*tr->mxtips - 2]
       Array used to store the degree of every taxon, is needed to extract
       bipartitions from multifurcating trees:
       (Taxonnumber - 1) -> (degree of node(Taxonnumber)) */
    *taxonHasDeg = (int *)rax_calloc((2*tr->mxtips - 2),sizeof(int)),

    /* taxonToReduction[2*tr->mxtips - 2];
       Array used for reducing the bitvector and speeding up extraction:
       (Taxonnumber - 1) -> (0..1 (increment count of taxa appearing in small tree))
       (Taxonnumber - 1) -> (0..1 (increment count of inner nodes appearing in small tree)) */
    *taxonToReduction = (int *)rax_malloc((2*tr->mxtips - 2) * sizeof(int));

  int newcount = 0; /* counter used for correct traversals */

  /* labelToTaxon[2*tr->mxtips - 2];
     is used to translate between preorder label and p->number:
     (Preorderlabel) -> (Taxonnumber) */
  int *labelToTaxon = (int *)rax_malloc((2*tr->mxtips - 2) * sizeof(int));

  /* Preorder traversal of the large tree */
  preOrderTraversal(tr->start->back,tr->mxtips, tr->start->number, taxonToLabel, labelToTaxon, &newcount);

  newcount = 0; /* counter set to 0 to be now used for the Euler traversal */

  /* eulerIndexToLabel[4*tr->mxtips - 5];
     Array storing all 4n-5 preorder labels created during the Euler tour:
     (Eulerindex) -> (Preorderlabel) */
  int* eulerIndexToLabel = (int *)rax_malloc((4*tr->mxtips - 5) * sizeof(int));

  /* taxonToEulerIndex[tr->mxtips];
     Stores all indices of the first appearance of a taxon in the Euler tour:
     (Taxonnumber - 1) -> (Index of the Euler tour where Taxonnumber first appears)
     is used for efficient computation of the lowest common ancestor during
     the reconstruction step */
  int* taxonToEulerIndex = (int *)rax_malloc((tr->mxtips) * sizeof(int));

  /* Init taxonToEulerIndex and taxonToReduction */
  int ix;

  for(ix = 0; ix < tr->mxtips; ++ix)
    taxonToEulerIndex[ix] = -1;

  for(ix = 0; ix < (2*tr->mxtips - 2); ++ix)
    taxonToReduction[ix] = -1;

  /* Euler traversal of the large tree */
  unrootedEulerTour(tr->start->back,tr->mxtips, eulerIndexToLabel, taxonToLabel, &newcount, taxonToEulerIndex);

  /* Creating RMQ data structure for efficient retrieval of LCAs, using
     Johannes Fischers library rewritten in C.
     Following files: rmq.h,rmqs.c,rmqs.h are included in Makefile.RMQ.gcc */
  RMQ_succinct(eulerIndexToLabel,4*tr->mxtips - 5);

  double preprocessendtime = gettime() - preprocesstime;

  /* Preprocessing Step End */
  /*************************************************************************************/

  printBothOpen("The reference tree has %d tips\n", tr->ntips);

  fclose(treeFile);

  /* now see how many small trees we have */

  treeFile = getNumberOfTrees(tr, bootStrapFile, adef);

  checkTreeNumber(tr->numberOfTrees, bootStrapFile);

  /* allocate a data structure for parsing the potentially multi-furcating tree */

  allocateMultifurcations(tr, smallTree);

  /* loop over all small trees */

  for(i = 0; i < tr->numberOfTrees; i++)
    {
      int numberOfSplits = readMultifurcatingTree(treeFile, smallTree, adef, TRUE);

      if(numberOfSplits > 0)
        {
          int firstTaxon;

          double
            rec_rf,
            maxRF;

          if(numberOfTreesAnalyzed % 100 == 0)
            printBothOpen("Small tree %d has %d tips and %d bipartitions\n", i, smallTree->ntips, numberOfSplits);

          /* maximum RF distance, used to turn the absolute distance into a
             relative one; the code normalizes by twice the number of splits
             of the (potentially multifurcating) small tree */

          maxRF = (double)(2 * numberOfSplits);

          /* now get the index of the first taxon of the small tree.
             we will use this to unambiguously store the bipartitions */

          firstTaxon = smallTree->start->number;

          /***********************************************************************************/
          /* Reconstruction Step */

          double time_start = gettime();

          /* Init hashtable to store bipartitions of the induced subtree */
          /* using smallTree->ntips instead of smallTree->mxtips yields faster
             code e.g. 120 versus 128 seconds for 20,000 small trees on my laptop */
          hashtable *s_hash = initHashTable(smallTree->ntips * 4);

          /* smallTreeTaxa[smallTree->ntips];
             Stores all taxa numbers from smallTree into an array called smallTreeTaxa:
             (Index) -> (Taxonnumber) */
          int* smallTreeTaxa = (int *)rax_malloc((smallTree->ntips) * sizeof(int));

          /* counter is set to 0 for correctly extracting taxa of the small tree */
          newcount = 0;

          int newcount2 = 0;

          /* seq2[2*smallTree->ntips - 2];
             stores the preorder sequence of the reference small tree:
             (Preorderindex) -> (Taxonnumber) */
          int* seq2 = (int *)rax_malloc((2*smallTree->ntips - 2) * sizeof(int));

          /* used to store the vectorLength of the bitvector */
          unsigned int vectorLength;

          /* extract all taxa of the small tree and store them into an array,
             also store all counts of taxa and nontaxa in taxonToReduction */
          rec_extractTaxa(smallTreeTaxa, taxonToReduction, smallTree->start, smallTree->mxtips, &newcount, &newcount2);

          rec_extractTaxa(smallTreeTaxa, taxonToReduction, smallTree->start->back, smallTree->mxtips, &newcount, &newcount2);

          /* counter is set to 0 to correctly preorder traverse the small tree */
          newcount = 0;

          /* Preorder traversal of the small tree, saving its sequence into seq2
             for later extracting the bipartitions; it also stores information
             about the degree of every node */
          rec_preOrderTraversalMulti(smallTree->start->back,smallTree->mxtips, smallTree->start->number, seq2, taxonHasDeg, &newcount);

          /* calculate the bitvector length: ceil(ntips / MASK_LENGTH) */
          if(smallTree->ntips % MASK_LENGTH == 0)
            vectorLength = smallTree->ntips / MASK_LENGTH;
          else
            vectorLength = 1 + (smallTree->ntips / MASK_LENGTH);

          unsigned int **bitVectors = rec_initBitVector(smallTree, vectorLength);

          /* store all non trivial bitvectors using a subtree approach for the
             induced subtree and store them into a hashtable, this method was
             changed for multifurcation */
          rec_extractBipartitionsMulti(bitVectors, seq2, newcount,tr->mxtips, vectorLength, smallTree->ntips,
                                       firstTaxon, s_hash, taxonToReduction, taxonHasDeg, numberOfSplits);

          /* counter is set to 0 to be used for correctly storing all Euler indices */
          newcount = 0;

          /* smallTreeTaxonToEulerIndex[smallTree->ntips];
             Saves all first Euler indices for all taxa appearing in the small tree:
             (Index) -> (Index of the Euler tour where the taxon number of the small tree first appears) */
          int* smallTreeTaxonToEulerIndex = (int *)rax_malloc((smallTree->ntips) * sizeof(int));

          /* seq[(smallTree->ntips*2) - 1]
             Stores the preorder sequence of the induced small tree */
          int* seq = (int *)rax_malloc((2*smallTree->ntips - 1) * sizeof(int));

          /* iterate through all small tree taxa */
          for(ix = 0; ix < smallTree->ntips; ix++)
            {
              int taxanumber = smallTreeTaxa[ix];

              /* To create smallTreeTaxonToEulerIndex we filter taxonToEulerIndex
                 for taxa in the small tree */
              smallTreeTaxonToEulerIndex[newcount] = taxonToEulerIndex[taxanumber-1];

              /* Saves all preorder labels of the small tree taxa in seq */
              seq[newcount] = taxonToLabel[taxanumber-1];

              newcount++;
            }

          /* sort the Euler indices to correctly calculate LCAs */
          qsort(smallTreeTaxonToEulerIndex, newcount, sizeof(int), sortIntegers);

          /* Iterate through all small tree taxa */
          for(ix = 1; ix < newcount; ix++)
            {
              /* query LCAs using the RMQ data structure */
              seq[newcount - 1 + ix] = eulerIndexToLabel[query(smallTreeTaxonToEulerIndex[ix - 1],smallTreeTaxonToEulerIndex[ix])];

              /* Used for dynamic programming. We save an index for every inner node:
                 For example the reference tree has 3 inner nodes which we save as 0,1,2.
                 Now we calculate for example 5 LCAs to construct the induced subtree,
                 which are also inner nodes. Therefore we mark them as 3,4,5,6,7 */
              taxonToReduction[labelToTaxon[seq[newcount - 1 + ix]] - 1] = newcount2;

              newcount2 += 1;
            }

          /* sort to construct the preorder sequence of the induced subtree;
             seq holds ntips taxon labels plus ntips - 1 LCA labels */
          qsort(seq, (2 * smallTree->ntips - 2) + 1, sizeof(int), sortIntegers);

          /* calculates all bipartitions of the reference small tree and counts
             how many bipartitions it shares with the induced small tree */
          int rec_bips = rec_findBipartitions(bitVectors, seq,(2*smallTree->ntips - 1), labelToTaxon, tr->mxtips,
                                              vectorLength, smallTree->ntips, firstTaxon, s_hash, taxonToReduction);

          /* Reconstruction Step End */
          /***********************************************************************************/

          double effectivetime = gettime() - time_start;

          /* the number of shared bipartitions can not exceed the number of
             splits; check this before the value is used */
          assert(numberOfSplits >= rec_bips);

          /* compute the relative RF */
          rec_rf = (double)(2 * (numberOfSplits - rec_bips)) / maxRF;

          avgRF += rec_rf;
          sumEffectivetime += effectivetime;

          if(numberOfTreesAnalyzed % 100 == 0)
            printBothOpen("Relative RF tree %d: %f\n\n", i, rec_rf);

          fprintf(rfFile, "%d %f\n", i, rec_rf);

          /* free masks and hash table for this iteration */
          rec_freeBitVector(smallTree, bitVectors);
          rax_free(bitVectors);

          freeHashTable(s_hash);
          rax_free(s_hash);

          rax_free(smallTreeTaxa);
          rax_free(seq);
          rax_free(seq2);
          rax_free(smallTreeTaxonToEulerIndex);

          numberOfTreesAnalyzed++;
        }
    }

  /* when every small tree was skipped there is nothing to average over;
     report 0 instead of dividing by zero */
  const double meanRF =
    (numberOfTreesAnalyzed > 0) ? (avgRF / (double)numberOfTreesAnalyzed) : 0.0;
  const double meanEffectivetime =
    (numberOfTreesAnalyzed > 0) ? (sumEffectivetime / (double)numberOfTreesAnalyzed) : 0.0;

  printBothOpen("Number of small trees skipped: %d\n\n", tr->numberOfTrees - numberOfTreesAnalyzed);

  printBothOpen("Average RF distance %f\n\n", meanRF);

  printBothOpen("Large Tree: %i, Number of SmallTrees analyzed: %i \n\n", tr->mxtips, numberOfTreesAnalyzed);

  printBothOpen("Total execution time: %f secs\n\n", gettime() - masterTime);

  printBothOpen("File containing all %d pair-wise RF distances written to file %s\n\n", numberOfTreesAnalyzed, rfFileName);

  printBothOpen("execution stats:\n\n");
  printBothOpen("Accumulated time Effective algorithm: %.5f sec \n", sumEffectivetime);
  printBothOpen("Average time for effective: %.10f sec \n", meanEffectivetime);
  printBothOpen("Preprocessingtime: %0.5f sec \n\n", preprocessendtime);

  fclose(treeFile);
  fclose(rfFile);

  /* free the data structure used for parsing the potentially multi-furcating tree */

  freeMultifurcations(smallTree);
  rax_free(smallTree);

  rax_free(taxonToLabel);
  rax_free(taxonToEulerIndex);
  rax_free(labelToTaxon);
  rax_free(eulerIndexToLabel);
  rax_free(taxonToReduction);
  rax_free(taxonHasDeg);
}