/*
 * Recursive traversal that fills the bipartition bit vectors below p and
 * registers every inner branch in the hash table h.
 *
 * bitVectors       per-node bit vectors, indexed by node number
 * p                node whose subtree is processed; tips return immediately
 * numsp            number of tips, used by isTip()
 * vectorLength     length of one bit vector in unsigned ints
 * h                hash table receiving the bipartitions
 * treeNumber       forwarded to insertHashRF()
 * function         must be BIPARTITIONS_RF; anything else trips assert(0)
 * bInf             not used by this implementation
 * countBranches    incremented once per bipartition inserted
 * treeVectorLength forwarded to insertHashRF()
 * traverseOnly     must be FALSE (asserted for every inner node visited)
 * computeWRF       must be FALSE (asserted before inserting)
 */
void bitVectorInitravSpecial(unsigned int **bitVectors, nodeptr p, int numsp, unsigned int vectorLength, hashtable *h, int treeNumber, int function,
                             branchInfo *bInf, int *countBranches, int treeVectorLength, boolean traverseOnly, boolean computeWRF)
{
  nodeptr child;

  /* nothing to compute at a tip */
  if(isTip(p->number, numsp))
    return;

  /* first descend into everything attached to the other slots of this inner node */
  child = p->next;

  do
    {
      bitVectorInitravSpecial(bitVectors, child->back, numsp, vectorLength, h, treeNumber, function, bInf, countBranches, treeVectorLength, traverseOnly, computeWRF);
      child = child->next;
    }
  while(child != p);

  /* children are done, now build the bit vector for p itself */
  newviewBipartitions(bitVectors, p, numsp, vectorLength);

  assert(p->xBips);
  assert(!traverseOnly);

  /* a branch whose other end is a tip is not inserted into the hash table */
  if(isTip(p->back->number, numsp))
    return;

  {
    unsigned int *bipartition = bitVectors[p->number];
    hashNumberType slot = p->hash % h->tableSize;

    /* the lowest bit of the first word is expected to be clear */
    assert(!(bipartition[0] & 1));
    assert(!computeWRF);

    if(function == BIPARTITIONS_RF)
      {
        insertHashRF(bipartition, h, vectorLength, treeNumber, treeVectorLength, slot, 0, computeWRF);
        *countBranches += 1;
      }
    else
      assert(0);
  }
}
/*
 * Run the rapid hill-climbing tree search on tr.
 *
 * tr             tree to optimize; modified in place
 * adef           analysis settings (rearrangement settings, step width, ...);
 *                adef->bestTrav is overwritten here
 * estimateModel  when TRUE the model is read from a binary file or optimized
 *                via modOpt() at several points, otherwise only
 *                treeEvaluate() is called
 *
 * The search has two stages:
 *   1. "fast" cycles (Thorough = 0) with a fixed rearrangement range of
 *      1..bestTrav, repeated for as long as a cycle improves the likelihood
 *      by more than epsilon;
 *   2. "thorough" cycles (Thorough = 1) where the rearrangement range is
 *      widened by adef->stepwidth whenever a cycle brings no improvement,
 *      until it exceeds adef->max_rearrange.
 *
 * If tr->searchConvergenceCriterion is set, the bipartitions of the best tree
 * of consecutive cycles are stored in a hash table (slots 0 and 1, used
 * alternately) and a stage ends early once convergenceCriterion() returns a
 * value <= 0.01.
 */
void computeBIGRAPID (tree *tr, analdef *adef, boolean estimateModel)
{
  unsigned int vLength = 0;
  int i, impr, bestTrav, rearrangementsMax = 0, rearrangementsMin = 0, thoroughIterations = 0, fastIterations = 0;
  double lh, previousLh, difference, epsilon;
  bestlist *bestT, *bt;

#ifdef _TERRACES
  /* store the 20 best trees found in a dedicated list */
  bestlist *terrace;
  /* output file name */
  char terraceFileName[1024];
#endif

  hashtable *h = (hashtable*)NULL;
  unsigned int **bitVectors = (unsigned int**)NULL;

  /* the bipartition data structures are only needed for the convergence criterion */
  if(tr->searchConvergenceCriterion)
    {
      bitVectors = initBitVector(tr, &vLength);
      h = initHashTable(tr->mxtips * 4);
    }

  /* bestT holds the single best tree, bt the 20 best trees of one optimization cycle */
  bestT = (bestlist *) rax_malloc(sizeof(bestlist));
  bestT->ninit = 0;
  initBestTree(bestT, 1, tr->mxtips);

  bt = (bestlist *) rax_malloc(sizeof(bestlist));
  bt->ninit = 0;
  initBestTree(bt, 20, tr->mxtips);

#ifdef _TERRACES
  /* initialize the tree list and the output file name for the current tree search/replicate */
  terrace = (bestlist *) rax_malloc(sizeof(bestlist));
  terrace->ninit = 0;
  initBestTree(terrace, 20, tr->mxtips);

  {
    /* build the name with one bounded call so that long workdir/run_id
       values can never write past the end of terraceFileName */
    int written = snprintf(terraceFileName, sizeof(terraceFileName), "%sRAxML_terrace.%s.BS.%d", workdir, run_id, bCount);

    assert(written >= 0 && (size_t)written < sizeof(terraceFileName));
  }

  printf("%s\n", terraceFileName);
#endif

  initInfoList(50);

  difference = 10.0;
  epsilon = 0.01;

  Thorough = 0;

  /* initial evaluation of the starting tree */
  if(estimateModel)
    {
      if(adef->useBinaryModelFile)
        {
          readBinaryModel(tr);
          evaluateGenericInitrav(tr, tr->start);
          treeEvaluate(tr, 2);
        }
      else
        {
          evaluateGenericInitrav(tr, tr->start);
          modOpt(tr, adef, FALSE, 10.0);
        }
    }
  else
    treeEvaluate(tr, 2);

  printLog(tr, adef, FALSE);

  saveBestTree(bestT, tr);

  /* use the rearrangement setting given by the user, or determine one */
  if(!adef->initialSet)
    bestTrav = adef->bestTrav = determineRearrangementSetting(tr, adef, bestT, bt);
  else
    bestTrav = adef->bestTrav = adef->initial;

  if(estimateModel)
    {
      if(adef->useBinaryModelFile)
        treeEvaluate(tr, 2);
      else
        {
          evaluateGenericInitrav(tr, tr->start);
          modOpt(tr, adef, FALSE, 5.0);
        }
    }
  else
    treeEvaluate(tr, 1);

  saveBestTree(bestT, tr);
  impr = 1;

  if(tr->doCutoff)
    tr->itCount = 0;

  /* stage 1: fast cycles, repeated while the likelihood keeps improving */
  while(impr)
    {
      recallBestTree(bestT, 1, tr);

      if(tr->searchConvergenceCriterion)
        {
          int bCounter = 0;

          /* from the third cycle on, the slot about to be reused still holds
             the bipartitions stored two cycles earlier */
          if(fastIterations > 1)
            cleanupHashTable(h, (fastIterations % 2));

          bitVectorInitravSpecial(bitVectors, tr->nodep[1]->back, tr->mxtips, vLength, h, fastIterations % 2, BIPARTITIONS_RF, (branchInfo *)NULL, &bCounter, 1, FALSE, FALSE);

          assert(bCounter == tr->mxtips - 3);

          if(fastIterations > 0)
            {
              double rrf = convergenceCriterion(h, tr->mxtips);

              if(rrf <= 0.01) /* 1% cutoff */
                {
                  printBothOpen("ML fast search converged at fast SPR cycle %d with stopping criterion\n", fastIterations);
                  printBothOpen("Relative Robinson-Foulds (RF) distance between respective best trees after one succseful SPR cycle: %f%s\n", rrf, "%");
                  /* empty both slots so that the thorough stage starts with a clean table */
                  cleanupHashTable(h, 0);
                  cleanupHashTable(h, 1);
                  goto cleanup_fast;
                }
              else
                printBothOpen("ML search convergence criterion fast cycle %d->%d Relative Robinson-Foulds %f\n", fastIterations - 1, fastIterations, rrf);
            }
        }

      fastIterations++;

      treeEvaluate(tr, 1.0);

      saveBestTree(bestT, tr);
      printLog(tr, adef, FALSE);
      printResult(tr, adef, FALSE);
      lh = previousLh = tr->likelihood;

      treeOptimizeRapid(tr, 1, bestTrav, adef, bt);

      impr = 0;

      /* re-evaluate the candidate trees collected in bt and keep the best one
         if it beats the current best by more than epsilon */
      for(i = 1; i <= bt->nvalid; i++)
        {
          recallBestTree(bt, i, tr);

          treeEvaluate(tr, 0.25);

          difference = ((tr->likelihood > previousLh)? tr->likelihood - previousLh: previousLh - tr->likelihood);

          if(tr->likelihood > lh && difference > epsilon)
            {
              impr = 1;
              lh = tr->likelihood;
              saveBestTree(bestT, tr);
            }
        }
    }

  if(tr->searchConvergenceCriterion)
    {
      cleanupHashTable(h, 0);
      cleanupHashTable(h, 1);
    }

 cleanup_fast:

  /* stage 2: thorough cycles */
  Thorough = 1;
  impr = 1;

  recallBestTree(bestT, 1, tr);

  if(estimateModel)
    {
      if(adef->useBinaryModelFile)
        treeEvaluate(tr, 2);
      else
        {
          evaluateGenericInitrav(tr, tr->start);
          modOpt(tr, adef, FALSE, 1.0);
        }
    }
  else
    treeEvaluate(tr, 1.0);

  while(1)
    {
      recallBestTree(bestT, 1, tr);

      if(impr)
        {
          /* the previous cycle improved the tree: restart with the smallest range */
          printResult(tr, adef, FALSE);
          rearrangementsMin = 1;
          rearrangementsMax = adef->stepwidth;

          if(tr->searchConvergenceCriterion)
            {
              int bCounter = 0;

              if(thoroughIterations > 1)
                cleanupHashTable(h, (thoroughIterations % 2));

              bitVectorInitravSpecial(bitVectors, tr->nodep[1]->back, tr->mxtips, vLength, h, thoroughIterations % 2, BIPARTITIONS_RF, (branchInfo *)NULL, &bCounter, 1, FALSE, FALSE);

              assert(bCounter == tr->mxtips - 3);

              if(thoroughIterations > 0)
                {
                  double rrf = convergenceCriterion(h, tr->mxtips);

                  if(rrf <= 0.01) /* 1% cutoff */
                    {
                      printBothOpen("ML search converged at thorough SPR cycle %d with stopping criterion\n", thoroughIterations);
                      printBothOpen("Relative Robinson-Foulds (RF) distance between respective best trees after one succseful SPR cycle: %f%s\n", rrf, "%");
                      goto cleanup;
                    }
                  else
                    printBothOpen("ML search convergence criterion thorough cycle %d->%d Relative Robinson-Foulds %f\n", thoroughIterations - 1, thoroughIterations, rrf);
                }
            }

          thoroughIterations++;
        }
      else
        {
          /* no improvement: widen the rearrangement range, give up once it exceeds the maximum */
          rearrangementsMax += adef->stepwidth;
          rearrangementsMin += adef->stepwidth;

          if(rearrangementsMax > adef->max_rearrange)
            goto cleanup;
        }

      treeEvaluate(tr, 1.0);

      previousLh = lh = tr->likelihood;
      saveBestTree(bestT, tr);

      printLog(tr, adef, FALSE);
      treeOptimizeRapid(tr, rearrangementsMin, rearrangementsMax, adef, bt);

      impr = 0;

      for(i = 1; i <= bt->nvalid; i++)
        {
          recallBestTree(bt, i, tr);

          treeEvaluate(tr, 0.25);

#ifdef _TERRACES
          /* save all 20 best trees in the terrace tree list */
          saveBestTree(terrace, tr);
#endif

          difference = ((tr->likelihood > previousLh)? tr->likelihood - previousLh: previousLh - tr->likelihood);

          if(tr->likelihood > lh && difference > epsilon)
            {
              impr = 1;
              lh = tr->likelihood;
              saveBestTree(bestT, tr);
            }
        }
    }

 cleanup:

#ifdef _TERRACES
  {
    double bestLH = tr->likelihood;
    FILE *f = myfopen(terraceFileName, "w");

    /* print out likelihood of best tree found */
    printf("best tree: %f\n", tr->likelihood);

    /* print out likelihoods of 20 best trees found during the tree search */
    for(i = 1; i <= terrace->nvalid; i++)
      {
        recallBestTree(terrace, i, tr);

        /* if the likelihood differs from the best one by less than
           0.000001, print the tree to file */
        if(ABS(bestLH - tr->likelihood) < 0.000001)
          {
            printf("%d %f\n", i, tr->likelihood);
            Tree2String(tr->tree_string, tr, tr->start->back, FALSE, TRUE, FALSE, FALSE, FALSE, adef, NO_BRANCHES, FALSE, FALSE, FALSE, FALSE);
            fprintf(f, "%s\n", tr->tree_string);
          }
      }

    /* NOTE(review): after this loop tr presumably holds the tree recalled
       last from the terrace list, so the final printLog()/printResult()
       calls below would report that tree rather than the best one -
       confirm this is intended */

    fclose(f);

    /* increment tree search counter */
    bCount++;
  }
#endif

  if(tr->searchConvergenceCriterion)
    {
      freeBitVectors(bitVectors, 2 * tr->mxtips);
      rax_free(bitVectors);
      freeHashTable(h);
      rax_free(h);
    }

  freeBestTree(bestT);
  rax_free(bestT);
  freeBestTree(bt);
  rax_free(bt);

#ifdef _TERRACES
  /* free terrace tree list */
  freeBestTree(terrace);
  rax_free(terrace);
#endif

  freeInfoList();
  printLog(tr, adef, FALSE);
  printResult(tr, adef, FALSE);
}
/*
 * Compute the relative Robinson-Foulds distance between one large reference
 * tree and every tree of a set of smaller, possibly multi-furcating trees.
 *
 * tr    tree structure that receives the reference tree read from tree_file
 * adef  analysis settings; adef->mode must be PLAUSIBILITY_CHECKER
 *
 * For every small tree with at least one bipartition, the bipartitions of the
 * reference tree are restricted to the taxa of the small tree and re-hashed,
 * the small tree is traversed to count shared bipartitions, and the relative
 * RF distance is appended as "<tree index> <distance>" to the file
 * <workdir>RAxML_RF-Distances.<run_id>. Summary information is printed via
 * printBothOpen().
 */
void plausibilityChecker(tree *tr, analdef *adef)
{
  FILE *treeFile, *rfFile;
  tree *smallTree = (tree *)rax_malloc(sizeof(tree));
  char rfFileName[1024];
  int fileNameLength;

  /* init hash table for big reference tree */
  hashtable *h = initHashTable(tr->mxtips * 2 * 2);

  /* init the bit vectors we need for computing and storing bipartitions during
     the tree traversal */
  unsigned int vLength, **bitVectors = initBitVector(tr, &vLength);

  int numberOfTreesAnalyzed = 0, branchCounter = 0, i;
  double avgRF = 0.0;

  /* set up an output file name; a single bounded call guarantees that long
     workdir/run_id values cannot write past the end of rfFileName */
  fileNameLength = snprintf(rfFileName, sizeof(rfFileName), "%sRAxML_RF-Distances.%s", workdir, run_id);
  assert(fileNameLength >= 0 && (size_t)fileNameLength < sizeof(rfFileName));

  rfFile = myfopen(rfFileName, "wb");

  assert(adef->mode == PLAUSIBILITY_CHECKER);

  /* open the big reference tree file and parse it */
  treeFile = myfopen(tree_file, "r");

  printBothOpen("Parsing reference tree %s\n", tree_file);

  treeReadLen(treeFile, tr, FALSE, TRUE, TRUE, adef, TRUE, FALSE);

  assert(tr->mxtips == tr->ntips);

  printBothOpen("The reference tree has %d tips\n", tr->ntips);

  fclose(treeFile);

  /* extract all induced bipartitions from the big tree and store them in the
     hash table */
  bitVectorInitravSpecial(bitVectors, tr->nodep[1]->back, tr->mxtips, vLength, h, 0, BIPARTITIONS_RF, (branchInfo *)NULL, &branchCounter, 1, FALSE, FALSE);

  assert(branchCounter == tr->mxtips - 3);

  /* now see how many small trees we have */
  treeFile = getNumberOfTrees(tr, bootStrapFile, adef);

  checkTreeNumber(tr->numberOfTrees, bootStrapFile);

  /* allocate a data structure for parsing the potentially multi-furcating tree */
  allocateMultifurcations(tr, smallTree);

  /* loop over all small trees */
  for(i = 0; i < tr->numberOfTrees; i++)
    {
      int numberOfSplits = readMultifurcatingTree(treeFile, smallTree, adef, TRUE);

      /* trees for which no split is reported are skipped */
      if(numberOfSplits > 0)
        {
          unsigned int
            entryCount = 0,
            k,
            j,
            *masked = (unsigned int *)rax_calloc(vLength, sizeof(unsigned int)),
            *smallTreeMask = (unsigned int *)rax_calloc(vLength, sizeof(unsigned int));

          hashtable *rehash = initHashTable(tr->mxtips * 2 * 2);

          double rf, maxRF;

          int bCounter = 0, bips, firstTaxon, taxa = 0;

          if(numberOfTreesAnalyzed % 100 == 0)
            printBothOpen("Small tree %d has %d tips and %d bipartitions\n", i, smallTree->ntips, numberOfSplits);

          /* compute the maximum RF distance for computing the relative RF
             distance later-on.
             Note that the RF distance is not normalized by 2 * (n-3) here:
             the multi-furcating small tree can contain fewer bipartitions,
             hence the normalization factor is 2 * numberOfSplits, where
             numberOfSplits is the number of bipartitions in the small tree. */
          maxRF = (double)(2 * numberOfSplits);

          /* now set up a bit mask where only the bits are set to one for those
             taxa that are actually present in the small tree we just read */
          setupMask(smallTreeMask, smallTree->start, smallTree->mxtips);
          setupMask(smallTreeMask, smallTree->start->back, smallTree->mxtips);

          /* now get the index of the first taxon of the small tree.
             we will use this to unambiguously store the bipartitions */
          firstTaxon = smallTree->start->number;

          /* make sure that this bit vector is set up correctly, i.e., that it
             contains as many non-zero bits as there are taxa in this small tree */
          for(j = 0; j < vLength; j++)
            taxa += BIT_COUNT(smallTreeMask[j]);

          assert(taxa == smallTree->ntips);

          /* now re-hash the big tree by applying the above bit mask */

          /* loop over hash table */
          for(k = 0, entryCount = 0; k < h->tableSize; k++)
            {
              if(h->table[k] != NULL)
                {
                  entry *e = h->table[k];

                  /* we resolve collisions by chaining, hence the loop here */
                  do
                    {
                      unsigned int *bitVector = e->bitVector;
                      hashNumberType position;
                      int count = 0;

                      /* double check that our tree mask contains the first taxon
                         of the small tree */
                      assert(smallTreeMask[(firstTaxon - 1) / MASK_LENGTH] & mask32[(firstTaxon - 1) % MASK_LENGTH]);

                      /* if the first taxon is set then we will re-hash the
                         bit-wise complement of the bit vector.
                         The count variable is used for a small optimization */
                      if(bitVector[(firstTaxon - 1) / MASK_LENGTH] & mask32[(firstTaxon - 1) % MASK_LENGTH])
                        {
                          /* hash complement */
                          for(j = 0; j < vLength; j++)
                            {
                              masked[j] = (~bitVector[j]) & smallTreeMask[j];
                              count += BIT_COUNT(masked[j]);
                            }
                        }
                      else
                        {
                          /* hash this vector */
                          for(j = 0; j < vLength; j++)
                            {
                              masked[j] = bitVector[j] & smallTreeMask[j];
                              count += BIT_COUNT(masked[j]);
                            }
                        }

                      /* note that padding the last bits is not required because
                         they are set to 0 automatically by smallTreeMask */

                      /* make sure that we will re-hash the canonic representation
                         of the bipartition where the bit for firstTaxon is set to 0! */
                      assert(!(masked[(firstTaxon - 1) / MASK_LENGTH] & mask32[(firstTaxon - 1) % MASK_LENGTH]));

                      /* only if the masked bipartition of the large tree is a
                         non-trivial bipartition (two or more bits set to 1)
                         will we re-hash it */
                      if(count > 1)
                        {
                          /* compute hash */
                          position = oat_hash((unsigned char *)masked, sizeof(unsigned int) * vLength);
                          position = position % rehash->tableSize;

                          /* re-hash to the new hash table that contains the bips
                             of the large tree, pruned down to the taxa contained
                             in the small tree */
                          insertHashPlausibility(masked, rehash, vLength, position);
                        }

                      entryCount++;

                      e = e->next;
                    }
                  while(e != NULL);
                }
            }

          /* make sure that we tried to re-hash all bipartitions of the original tree */
          assert(entryCount == (unsigned int)(tr->mxtips - 3));

          /* now traverse the small tree and count how many bipartitions it
             shares with the corresponding induced tree from the large tree */
          bips = bitVectorTraversePlausibility(bitVectors, smallTree->start->back, smallTree->mxtips, vLength, rehash, &bCounter, firstTaxon, smallTree, TRUE);

          /* compute the relative RF */
          rf = (double)(2 * (numberOfSplits - bips)) / maxRF;

          assert(numberOfSplits >= bips);

          assert(rf <= 1.0);

          avgRF += rf;

          if(numberOfTreesAnalyzed % 100 == 0)
            printBothOpen("Relative RF tree %d: %f\n\n", i, rf);

          fprintf(rfFile, "%d %f\n", i, rf);

          /* make sure that we checked all non-trivial splits/bipartitions in
             the multi-furcating tree, which can be fewer than n - 3 ! */
          assert(bCounter == numberOfSplits);

          /* free masks and hash table for this iteration */
          rax_free(smallTreeMask);
          rax_free(masked);
          freeHashTable(rehash);
          rax_free(rehash);

          numberOfTreesAnalyzed++;
        }
    }

  printBothOpen("Number of small trees skipped: %d\n\n", tr->numberOfTrees - numberOfTreesAnalyzed);

  /* an average only exists if at least one tree was analyzed; skipping the
     line avoids a division by zero that would otherwise be reported as NaN */
  if(numberOfTreesAnalyzed > 0)
    printBothOpen("Average RF distance %f\n\n", avgRF / (double)numberOfTreesAnalyzed);

  printBothOpen("Total execution time: %f secs\n\n", gettime() - masterTime);

  printBothOpen("\nFile containing all %d pair-wise RF distances written to file %s\n\n", numberOfTreesAnalyzed, rfFileName);

  fclose(treeFile);
  fclose(rfFile);

  /* free the data structure used for parsing the potentially multi-furcating tree */
  freeMultifurcations(smallTree);
  rax_free(smallTree);

  freeBitVectors(bitVectors, 2 * tr->mxtips);
  rax_free(bitVectors);

  freeHashTable(h);
  rax_free(h);
}